regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 /* Returns a boolean as to whether the input unsigned number is a power of 2
 100  * (2**0, 2**1, etc).  In other words if it has just a single bit set.
 101  * If not, subtracting 1 would leave the uppermost bit set, so the & would
 102  * yield non-zero */
 103 #define isPOWER_OF_2(n) ((n & (n-1)) == 0)
 104
 105 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 106     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 107     goto target;                                                         \
 108 } STMT_END
 109
 110 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 111
 112 #ifndef STATIC
 113 #define STATIC  static
 114 #endif
 115
 116 /* Valid only if 'c', the character being looke-up, is an invariant under
 117  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 118  * everything matchable is straight forward in the bitmap */
 119 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 120                                 ? reginclass(prog,p,c,c+1,u)                \
 121                                 : ANYOF_BITMAP_TEST(p,*(c)))
 122
 123 /*
 124  * Forwards.
 125  */
 126
 127 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 128
 129 #define HOPc(pos,off) \
 130         (char *)(reginfo->is_utf8_target \
 131             ? reghop3((U8*)pos, off, \
 132                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 133             : (U8*)(pos + off))
 134
 135 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 136 #define HOPBACK3(pos, off, lim) \
 137         (reginfo->is_utf8_target                          \
 138             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 139             : (pos - off >= lim)                                 \
 140                 ? (U8*)pos - off                                 \
 141                 : NULL)
 142
 143 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 144
 145 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 146 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 147
 148 /* lim must be +ve. Returns NULL on overshoot */
 149 #define HOPMAYBE3(pos,off,lim) \
 150         (reginfo->is_utf8_target                        \
 151             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 152             : ((U8*)pos + off <= lim)                   \
 153                 ? (U8*)pos + off                        \
 154                 : NULL)
 155
 156 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 157  * off must be >=0; args should be vars rather than expressions */
 158 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 159     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 160     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 161 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 162
 163 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 164     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 165     : (U8*)(pos + off))
 166 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 167
 168 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 169 #define NEXTCHR_IS_EOS (nextchr < 0)
 170
 171 #define SET_nextchr \
 172     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 173
 174 #define SET_locinput(p) \
 175     locinput = (p);  \
 176     SET_nextchr
 177
 178 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 179 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 180
 181 /* for use after a quantifier and before an EXACT-like node -- japhy */
 182 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 183  *
 184  * NOTE that *nothing* that affects backtracking should be in here, specifically
 185  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 186  * node that is in between two EXACT like nodes when ascertaining what the required
 187  * "follow" character is. This should probably be moved to regex compile time
 188  * although it may be done at run time beause of the REF possibility - more
 189  * investigation required. -- demerphq
 190 */
 191 #define JUMPABLE(rn) (                                                             \
 192     OP(rn) == OPEN ||                                                              \
 193     (OP(rn) == CLOSE &&                                                            \
 194      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 195     OP(rn) == EVAL ||                                                              \
 196     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 197     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 198     OP(rn) == KEEPS ||                                                             \
 199     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 200 )
 201 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 202
 203 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 204
 205 #if 0
 206 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 207    we don't need this definition.  XXX These are now out-of-sync*/
 208 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 209 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFAA || OP(rn)==EXACTFAA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 210 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 211
 212 #else
 213 /* ... so we use this as its faster. */
 214 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 215 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFAA || OP(rn) == EXACTFAA_NO_TRIE)
 216 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 217 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 218
 219 #endif
 220
 221 /*
 222   Search for mandatory following text node; for lookahead, the text must
 223   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 224 */
 225 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 226     while (JUMPABLE(rn)) { \
 227         const OPCODE type = OP(rn); \
 228         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 229             rn = NEXTOPER(NEXTOPER(rn)); \
 230         else if (type == PLUS) \
 231             rn = NEXTOPER(rn); \
 232         else if (type == IFMATCH) \
 233             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 234         else rn += NEXT_OFF(rn); \
 235     } \
 236 } STMT_END
 237
 238 #define SLAB_FIRST(s) (&(s)->states[0])
 239 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 240
 241 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 242 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 243 static regmatch_state * S_push_slab(pTHX);
 244
 245 #define REGCP_PAREN_ELEMS 3
 246 #define REGCP_OTHER_ELEMS 3
 247 #define REGCP_FRAME_ELEMS 1
 248 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 249  * are needed for the regexp context stack bookkeeping. */
 250
 251 STATIC CHECKPOINT
 252 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 253 {
 254     const int retval = PL_savestack_ix;
 255     const int paren_elems_to_push =
 256                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 257     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 258     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 259     I32 p;
 260     GET_RE_DEBUG_FLAGS_DECL;
 261
 262     PERL_ARGS_ASSERT_REGCPPUSH;
 263
 264     if (paren_elems_to_push < 0)
 265         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 266                    (int)paren_elems_to_push, (int)maxopenparen,
 267                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 268
 269     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 270         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 271                    " out of range (%lu-%ld)",
 272                    total_elems,
 273                    (unsigned long)maxopenparen,
 274                    (long)parenfloor);
 275
 276     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 277
 278     DEBUG_BUFFERS_r(
 279         if ((int)maxopenparen > (int)parenfloor)
 280             Perl_re_exec_indentf( aTHX_
 281                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 282                 depth,
 283                 PTR2UV(rex),
 284                 PTR2UV(rex->offs)
 285             );
 286     );
 287     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 288 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 289         SSPUSHIV(rex->offs[p].end);
 290         SSPUSHIV(rex->offs[p].start);
 291         SSPUSHINT(rex->offs[p].start_tmp);
 292         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 293             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 294             depth,
 295             (UV)p,
 296             (IV)rex->offs[p].start,
 297             (IV)rex->offs[p].start_tmp,
 298             (IV)rex->offs[p].end
 299         ));
 300     }
 301 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 302     SSPUSHINT(maxopenparen);
 303     SSPUSHINT(rex->lastparen);
 304     SSPUSHINT(rex->lastcloseparen);
 305     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 306
 307     return retval;
 308 }
 309
 310 /* These are needed since we do not localize EVAL nodes: */
 311 #define REGCP_SET(cp)                                           \
 312     DEBUG_STATE_r(                                              \
 313         Perl_re_exec_indentf( aTHX_                             \
 314             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 315             depth, (IV)PL_savestack_ix                          \
 316         )                                                       \
 317     );                                                          \
 318     cp = PL_savestack_ix
 319
 320 #define REGCP_UNWIND(cp)                                        \
 321     DEBUG_STATE_r(                                              \
 322         if (cp != PL_savestack_ix)                              \
 323             Perl_re_exec_indentf( aTHX_                         \
 324                 "Clearing an EVAL scope, savestack=%"           \
 325                 IVdf "..%" IVdf "\n",                           \
 326                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 327             )                                                   \
 328     );                                                          \
 329     regcpblow(cp)
 330
 331 /* set the start and end positions of capture ix */
 332 #define CLOSE_CAPTURE(ix, s, e)                                            \
 333     rex->offs[ix].start = s;                                               \
 334     rex->offs[ix].end = e;                                                 \
 335     if (ix > rex->lastparen)                                               \
 336         rex->lastparen = ix;                                               \
 337     rex->lastcloseparen = ix;                                              \
 338     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 339         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 340         depth,                                                             \
 341         PTR2UV(rex),                                                       \
 342         PTR2UV(rex->offs),                                                 \
 343         (UV)ix,                                                            \
 344         (IV)rex->offs[ix].start,                                           \
 345         (IV)rex->offs[ix].end,                                             \
 346         (UV)rex->lastparen                                                 \
 347     ))
 348
 349 #define UNWIND_PAREN(lp, lcp)               \
 350     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 351         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 352         depth,                              \
 353         PTR2UV(rex),                        \
 354         PTR2UV(rex->offs),                  \
 355         (UV)(lp),                           \
 356         (UV)(rex->lastparen),               \
 357         (UV)(lcp)                           \
 358     ));                                     \
 359     for (n = rex->lastparen; n > lp; n--)   \
 360         rex->offs[n].end = -1;              \
 361     rex->lastparen = n;                     \
 362     rex->lastcloseparen = lcp;
 363
 364
 365 STATIC void
 366 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 367 {
 368     UV i;
 369     U32 paren;
 370     GET_RE_DEBUG_FLAGS_DECL;
 371
 372     PERL_ARGS_ASSERT_REGCPPOP;
 373
 374     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 375     i = SSPOPUV;
 376     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 377     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 378     rex->lastcloseparen = SSPOPINT;
 379     rex->lastparen = SSPOPINT;
 380     *maxopenparen_p = SSPOPINT;
 381
 382     i -= REGCP_OTHER_ELEMS;
 383     /* Now restore the parentheses context. */
 384     DEBUG_BUFFERS_r(
 385         if (i || rex->lastparen + 1 <= rex->nparens)
 386             Perl_re_exec_indentf( aTHX_
 387                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 388                 depth,
 389                 PTR2UV(rex),
 390                 PTR2UV(rex->offs)
 391             );
 392     );
 393     paren = *maxopenparen_p;
 394     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 395         SSize_t tmps;
 396         rex->offs[paren].start_tmp = SSPOPINT;
 397         rex->offs[paren].start = SSPOPIV;
 398         tmps = SSPOPIV;
 399         if (paren <= rex->lastparen)
 400             rex->offs[paren].end = tmps;
 401         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 402             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 403             depth,
 404             (UV)paren,
 405             (IV)rex->offs[paren].start,
 406             (IV)rex->offs[paren].start_tmp,
 407             (IV)rex->offs[paren].end,
 408             (paren > rex->lastparen ? "(skipped)" : ""));
 409         );
 410         paren--;
 411     }
 412 #if 1
 413     /* It would seem that the similar code in regtry()
 414      * already takes care of this, and in fact it is in
 415      * a better location to since this code can #if 0-ed out
 416      * but the code in regtry() is needed or otherwise tests
 417      * requiring null fields (pat.t#187 and split.t#{13,14}
 418      * (as of patchlevel 7877)  will fail.  Then again,
 419      * this code seems to be necessary or otherwise
 420      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 421      * --jhi updated by dapm */
 422     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 423         if (i > *maxopenparen_p)
 424             rex->offs[i].start = -1;
 425         rex->offs[i].end = -1;
 426         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 427             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 428             depth,
 429             (UV)i,
 430             (i > *maxopenparen_p) ? "-1" : "  "
 431         ));
 432     }
 433 #endif
 434 }
 435
 436 /* restore the parens and associated vars at savestack position ix,
 437  * but without popping the stack */
 438
 439 STATIC void
 440 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 441 {
 442     I32 tmpix = PL_savestack_ix;
 443     PERL_ARGS_ASSERT_REGCP_RESTORE;
 444
 445     PL_savestack_ix = ix;
 446     regcppop(rex, maxopenparen_p);
 447     PL_savestack_ix = tmpix;
 448 }
 449
 450 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 451
 452 #ifndef PERL_IN_XSUB_RE
 453
 454 bool
 455 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 456 {
 457     /* Returns a boolean as to whether or not 'character' is a member of the
 458      * Posix character class given by 'classnum' that should be equivalent to a
 459      * value in the typedef '_char_class_number'.
 460      *
 461      * Ideally this could be replaced by a just an array of function pointers
 462      * to the C library functions that implement the macros this calls.
 463      * However, to compile, the precise function signatures are required, and
 464      * these may vary from platform to to platform.  To avoid having to figure
 465      * out what those all are on each platform, I (khw) am using this method,
 466      * which adds an extra layer of function call overhead (unless the C
 467      * optimizer strips it away).  But we don't particularly care about
 468      * performance with locales anyway. */
 469
 470     switch ((_char_class_number) classnum) {
 471         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 472         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 473         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 474         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 475         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 476                                         || isUPPER_LC(character);
 477         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 478         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 479         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 480         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 481         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 482         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 483         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 484         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 485         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 486         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 487         default:    /* VERTSPACE should never occur in locales */
 488             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 489     }
 490
 491     NOT_REACHED; /* NOTREACHED */
 492     return FALSE;
 493 }
 494
 495 #endif
 496
 497 STATIC bool
 498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 499 {
 500     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 501      * 'character' is a member of the Posix character class given by 'classnum'
 502      * that should be equivalent to a value in the typedef
 503      * '_char_class_number'.
 504      *
 505      * This just calls isFOO_lc on the code point for the character if it is in
 506      * the range 0-255.  Outside that range, all characters use Unicode
 507      * rules, ignoring any locale.  So use the Unicode function if this class
 508      * requires a swash, and use the Unicode macro otherwise. */
 509
 510     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 511
 512     if (UTF8_IS_INVARIANT(*character)) {
 513         return isFOO_lc(classnum, *character);
 514     }
 515     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 516         return isFOO_lc(classnum,
 517                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 518     }
 519
 520     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 521
 522     switch ((_char_class_number) classnum) {
 523         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 524         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 525         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 526         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 527         default:
 528             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 529                                         utf8_to_uvchr_buf(character, e, NULL));
 530     }
 531
 532     return FALSE; /* Things like CNTRL are always below 256 */
 533 }
 534
 535 STATIC char *
 536 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
 537 {
 538     /* Returns the position of the first ASCII byte in the sequence between 's'
 539      * and 'send-1' inclusive; returns 'send' if none found */
 540
 541     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 542
 543 #ifndef EBCDIC
 544
 545     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 546
 547                             /* This term is wordsize if subword; 0 if not */
 548                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 549
 550                             /* 'offset' */
 551                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 552     {
 553
 554         /* Process per-byte until reach word boundary.  XXX This loop could be
 555          * eliminated if we knew that this platform had fast unaligned reads */
 556         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 557             if (isASCII(*s)) {
 558                 return s;
 559             }
 560             s++;    /* khw didn't bother creating a separate loop for
 561                        utf8_target */
 562         }
 563
 564         /* Here, we know we have at least one full word to process.  Process
 565          * per-word as long as we have at least a full word left */
 566         do {
 567             PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
 568             if (complemented & PERL_VARIANTS_WORD_MASK)  {
 569
 570 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 571      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 572
 573                 s += _variant_byte_number(complemented);
 574                 return s;
 575
 576 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 577            checks. */
 578
 579                 break;
 580 #  endif
 581             }
 582
 583             s += PERL_WORDSIZE;
 584
 585         } while (s + PERL_WORDSIZE <= send);
 586     }
 587
 588 #endif
 589
 590     /* Process per-character */
 591     if (utf8_target) {
 592         while (s < send) {
 593             if (isASCII(*s)) {
 594                 return s;
 595             }
 596             s += UTF8SKIP(s);
 597         }
 598     }
 599     else {
 600         while (s < send) {
 601             if (isASCII(*s)) {
 602                 return s;
 603             }
 604             s++;
 605         }
 606     }
 607
 608     return s;
 609 }
 610
 611 STATIC char *
 612 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
 613 {
 614     /* Returns the position of the first non-ASCII byte in the sequence between
 615      * 's' and 'send-1' inclusive; returns 'send' if none found */
 616
 617 #ifdef EBCDIC
 618
 619     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 620
 621     if (utf8_target) {
 622         while (s < send) {
 623             if ( ! isASCII(*s)) {
 624                 return s;
 625             }
 626             s += UTF8SKIP(s);
 627         }
 628     }
 629     else {
 630         while (s < send) {
 631             if ( ! isASCII(*s)) {
 632                 return s;
 633             }
 634             s++;
 635         }
 636     }
 637
 638     return s;
 639
 640 #else
 641
 642     const U8 * next_non_ascii = NULL;
 643
 644     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 645     PERL_UNUSED_ARG(utf8_target);
 646
 647     /* On ASCII platforms invariants and ASCII are identical, so if the string
 648      * is entirely invariants, there is no non-ASCII character */
 649     return (is_utf8_invariant_string_loc((U8 *) s,
 650                                          (STRLEN) (send - s),
 651                                          &next_non_ascii))
 652             ? (char *) send
 653             : (char *) next_non_ascii;
 654
 655 #endif
 656
 657 }
 658
 659 STATIC U8 *
 660 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 661 {
 662     /* Returns the position of the first byte in the sequence between 's' and
 663      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 664      * */
 665
 666     PERL_ARGS_ASSERT_FIND_SPAN_END;
 667
 668     assert(send >= s);
 669
 670     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 671                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 672                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 673     {
 674         PERL_UINTMAX_T span_word;
 675
 676         /* Process per-byte until reach word boundary.  XXX This loop could be
 677          * eliminated if we knew that this platform had fast unaligned reads */
 678         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 679             if (*s != span_byte) {
 680                 return s;
 681             }
 682             s++;
 683         }
 684
 685         /* Create a word filled with the bytes we are spanning */
 686         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 687
 688         /* Process per-word as long as we have at least a full word left */
 689         do {
 690
 691             /* Keep going if the whole word is composed of 'span_byte's */
 692             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 693                 s += PERL_WORDSIZE;
 694                 continue;
 695             }
 696
 697             /* Here, at least one byte in the word isn't 'span_byte'. */
 698
 699 #ifdef EBCDIC
 700
 701             break;
 702
 703 #else
 704
 705             /* This xor leaves 1 bits only in those non-matching bytes */
 706             span_word ^= * (PERL_UINTMAX_T *) s;
 707
 708             /* Make sure the upper bit of each non-matching byte is set.  This
 709              * makes each such byte look like an ASCII platform variant byte */
 710             span_word |= span_word << 1;
 711             span_word |= span_word << 2;
 712             span_word |= span_word << 4;
 713
 714             /* That reduces the problem to what this function solves */
 715             return s + _variant_byte_number(span_word);
 716
 717 #endif
 718
 719         } while (s + PERL_WORDSIZE <= send);
 720     }
 721
 722     /* Process the straggler bytes beyond the final word boundary */
 723     while (s < send) {
 724         if (*s != span_byte) {
 725             return s;
 726         }
 727         s++;
 728     }
 729
 730     return s;
 731 }
 732
 733 STATIC U8 *
 734 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 735 {
 736     /* Returns the position of the first byte in the sequence between 's'
 737      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 738      * returns 'send' if none found.  It uses word-level operations instead of
 739      * byte to speed up the process */
 740
 741     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 742
 743     assert(send >= s);
 744     assert((byte & mask) == byte);
 745
 746 #ifndef EBCDIC
 747
 748     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 749                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 750                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 751     {
 752         PERL_UINTMAX_T word_complemented, mask_word;
 753
 754         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 755             if (((*s) & mask) == byte) {
 756                 return s;
 757             }
 758             s++;
 759         }
 760
 761         word_complemented = ~ (PERL_COUNT_MULTIPLIER * byte);
 762         mask_word =            PERL_COUNT_MULTIPLIER * mask;
 763
 764         do {
 765             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 766
 767             /* If 'masked' contains 'byte' within it, anding with the
 768              * complement will leave those 8 bits 0 */
 769             masked &= word_complemented;
 770
 771             /* This causes the most significant bit to be set to 1 for any
 772              * bytes in the word that aren't completely 0 */
 773             masked |= masked << 1;
 774             masked |= masked << 2;
 775             masked |= masked << 4;
 776
 777             /* The msbits are the same as what marks a byte as variant, so we
 778              * can use this mask.  If all msbits are 1, the word doesn't
 779              * contain 'byte' */
 780             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 781                 s += PERL_WORDSIZE;
 782                 continue;
 783             }
 784
 785             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 786              * and any that are, are 0.  Complement and re-AND to swap that */
 787             masked = ~ masked;
 788             masked &= PERL_VARIANTS_WORD_MASK;
 789
 790             /* This reduces the problem to that solved by this function */
 791             s += _variant_byte_number(masked);
 792             return s;
 793
 794         } while (s + PERL_WORDSIZE <= send);
 795     }
 796
 797 #endif
 798
 799     while (s < send) {
 800         if (((*s) & mask) == byte) {
 801             return s;
 802         }
 803         s++;
 804     }
 805
 806     return s;
 807 }
 808
 809 STATIC U8 *
 810 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 811 {
 812     /* Returns the position of the first byte in the sequence between 's' and
 813      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 814      * 'span_byte' should have been ANDed with 'mask' in the call of this
 815      * function.  Returns 'send' if none found.  Works like find_span_end(),
 816      * except for the AND */
 817
 818     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 819
 820     assert(send >= s);
 821     assert((span_byte & mask) == span_byte);
 822
 823     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 824                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 825                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 826     {
 827         PERL_UINTMAX_T span_word, mask_word;
 828
 829         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 830             if (((*s) & mask) != span_byte) {
 831                 return s;
 832             }
 833             s++;
 834         }
 835
 836         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 837         mask_word = PERL_COUNT_MULTIPLIER * mask;
 838
 839         do {
 840             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 841
 842             if (masked == span_word) {
 843                 s += PERL_WORDSIZE;
 844                 continue;
 845             }
 846
 847 #ifdef EBCDIC
 848
 849             break;
 850
 851 #else
 852
 853             masked ^= span_word;
 854             masked |= masked << 1;
 855             masked |= masked << 2;
 856             masked |= masked << 4;
 857             return s + _variant_byte_number(masked);
 858
 859 #endif
 860
 861         } while (s + PERL_WORDSIZE <= send);
 862     }
 863
 864     while (s < send) {
 865         if (((*s) & mask) != span_byte) {
 866             return s;
 867         }
 868         s++;
 869     }
 870
 871     return s;
 872 }
 873
 874 /*
 875  * pregexec and friends
 876  */
 877
 878 #ifndef PERL_IN_XSUB_RE
 879 /*
 880  - pregexec - match a regexp against a string
 881  */
 882 I32
 883 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 884          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 885 /* stringarg: the point in the string at which to begin matching */
 886 /* strend:    pointer to null at end of string */
 887 /* strbeg:    real beginning of string */
 888 /* minend:    end of match must be >= minend bytes after stringarg. */
 889 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 890  *            itself is accessed via the pointers above */
 891 /* nosave:    For optimizations. */
 892 {
 893     PERL_ARGS_ASSERT_PREGEXEC;
 894
 895     return
 896         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 897                       nosave ? 0 : REXEC_COPY_STR);
 898 }
 899 #endif
 900
 901
 902
 903 /* re_intuit_start():
 904  *
 905  * Based on some optimiser hints, try to find the earliest position in the
 906  * string where the regex could match.
 907  *
 908  *   rx:     the regex to match against
 909  *   sv:     the SV being matched: only used for utf8 flag; the string
 910  *           itself is accessed via the pointers below. Note that on
 911  *           something like an overloaded SV, SvPOK(sv) may be false
 912  *           and the string pointers may point to something unrelated to
 913  *           the SV itself.
 914  *   strbeg: real beginning of string
 915  *   strpos: the point in the string at which to begin matching
 916  *   strend: pointer to the byte following the last char of the string
 917  *   flags   currently unused; set to 0
 918  *   data:   currently unused; set to NULL
 919  *
 920  * The basic idea of re_intuit_start() is to use some known information
 921  * about the pattern, namely:
 922  *
 923  *   a) the longest known anchored substring (i.e. one that's at a
 924  *      constant offset from the beginning of the pattern; but not
 925  *      necessarily at a fixed offset from the beginning of the
 926  *      string);
 927  *   b) the longest floating substring (i.e. one that's not at a constant
 928  *      offset from the beginning of the pattern);
 929  *   c) Whether the pattern is anchored to the string; either
 930  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 931  *      or anchored to pos(): /\G/;
 932  *   d) A start class: a real or synthetic character class which
 933  *      represents which characters are legal at the start of the pattern;
 934  *
 935  * to either quickly reject the match, or to find the earliest position
 936  * within the string at which the pattern might match, thus avoiding
 937  * running the full NFA engine at those earlier locations, only to
 938  * eventually fail and retry further along.
 939  *
 940  * Returns NULL if the pattern can't match, or returns the address within
 941  * the string which is the earliest place the match could occur.
 942  *
 943  * The longest of the anchored and floating substrings is called 'check'
 944  * and is checked first. The other is called 'other' and is checked
 945  * second. The 'other' substring may not be present.  For example,
 946  *
 947  *    /(abc|xyz)ABC\d{0,3}DEFG/
 948  *
 949  * will have
 950  *
 951  *   check substr (float)    = "DEFG", offset 6..9 chars
 952  *   other substr (anchored) = "ABC",  offset 3..3 chars
 953  *   stclass = [ax]
 954  *
 955  * Be aware that during the course of this function, sometimes 'anchored'
 956  * refers to a substring being anchored relative to the start of the
 957  * pattern, and sometimes to the pattern itself being anchored relative to
 958  * the string. For example:
 959  *
 960  *   /\dabc/:   "abc" is anchored to the pattern;
 961  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 962  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 963  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 964  *                    but the pattern is anchored to the string.
 965  */
 966
 967 char *
 968 Perl_re_intuit_start(pTHX_
 969                     REGEXP * const rx,
 970                     SV *sv,
 971                     const char * const strbeg,
 972                     char *strpos,
 973                     char *strend,
 974                     const U32 flags,
 975                     re_scream_pos_data *data)
 976 {
 977     struct regexp *const prog = ReANY(rx);
 978     SSize_t start_shift = prog->check_offset_min;
 979     /* Should be nonnegative! */
 980     SSize_t end_shift   = 0;
 981     /* current lowest pos in string where the regex can start matching */
 982     char *rx_origin = strpos;
 983     SV *check;
 984     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 985     U8   other_ix = 1 - prog->substrs->check_ix;
 986     bool ml_anch = 0;
 987     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 988     char *check_at = NULL;              /* check substr found at this pos */
 989     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 990     RXi_GET_DECL(prog,progi);
 991     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 992     regmatch_info *const reginfo = &reginfo_buf;
 993     GET_RE_DEBUG_FLAGS_DECL;
 994
 995     PERL_ARGS_ASSERT_RE_INTUIT_START;
 996     PERL_UNUSED_ARG(flags);
 997     PERL_UNUSED_ARG(data);
 998
 999     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1000                 "Intuit: trying to determine minimum start position...\n"));
1001
1002     /* for now, assume that all substr offsets are positive. If at some point
1003      * in the future someone wants to do clever things with lookbehind and
1004      * -ve offsets, they'll need to fix up any code in this function
1005      * which uses these offsets. See the thread beginning
1006      * <20140113145929.GF27210@iabyn.com>
1007      */
1008     assert(prog->substrs->data[0].min_offset >= 0);
1009     assert(prog->substrs->data[0].max_offset >= 0);
1010     assert(prog->substrs->data[1].min_offset >= 0);
1011     assert(prog->substrs->data[1].max_offset >= 0);
1012     assert(prog->substrs->data[2].min_offset >= 0);
1013     assert(prog->substrs->data[2].max_offset >= 0);
1014
1015     /* for now, assume that if both present, that the floating substring
1016      * doesn't start before the anchored substring.
1017      * If you break this assumption (e.g. doing better optimisations
1018      * with lookahead/behind), then you'll need to audit the code in this
1019      * function carefully first
1020      */
1021     assert(
1022             ! (  (prog->anchored_utf8 || prog->anchored_substr)
1023               && (prog->float_utf8    || prog->float_substr))
1024            || (prog->float_min_offset >= prog->anchored_offset));
1025
1026     /* byte rather than char calculation for efficiency. It fails
1027      * to quickly reject some cases that can't match, but will reject
1028      * them later after doing full char arithmetic */
1029     if (prog->minlen > strend - strpos) {
1030         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1031                               "  String too short...\n"));
1032         goto fail;
1033     }
1034
1035     RXp_MATCH_UTF8_set(prog, utf8_target);
1036     reginfo->is_utf8_target = cBOOL(utf8_target);
1037     reginfo->info_aux = NULL;
1038     reginfo->strbeg = strbeg;
1039     reginfo->strend = strend;
1040     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1041     reginfo->intuit = 1;
1042     /* not actually used within intuit, but zero for safety anyway */
1043     reginfo->poscache_maxiter = 0;
1044
1045     if (utf8_target) {
1046         if ((!prog->anchored_utf8 && prog->anchored_substr)
1047                 || (!prog->float_utf8 && prog->float_substr))
1048             to_utf8_substr(prog);
1049         check = prog->check_utf8;
1050     } else {
1051         if (!prog->check_substr && prog->check_utf8) {
1052             if (! to_byte_substr(prog)) {
1053                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1054             }
1055         }
1056         check = prog->check_substr;
1057     }
1058
1059     /* dump the various substring data */
1060     DEBUG_OPTIMISE_MORE_r({
1061         int i;
1062         for (i=0; i<=2; i++) {
1063             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1064                                   : prog->substrs->data[i].substr);
1065             if (!sv)
1066                 continue;
1067
1068             Perl_re_printf( aTHX_
1069                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1070                 " useful=%" IVdf " utf8=%d [%s]\n",
1071                 i,
1072                 (IV)prog->substrs->data[i].min_offset,
1073                 (IV)prog->substrs->data[i].max_offset,
1074                 (IV)prog->substrs->data[i].end_shift,
1075                 BmUSEFUL(sv),
1076                 utf8_target ? 1 : 0,
1077                 SvPEEK(sv));
1078         }
1079     });
1080
1081     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1082
1083         /* ml_anch: check after \n?
1084          *
1085          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1086          * with /.*.../, these flags will have been added by the
1087          * compiler:
1088          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1089          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1090          */
1091         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1092                    && !(prog->intflags & PREGf_IMPLICIT);
1093
1094         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1095             /* we are only allowed to match at BOS or \G */
1096
1097             /* trivially reject if there's a BOS anchor and we're not at BOS.
1098              *
1099              * Note that we don't try to do a similar quick reject for
1100              * \G, since generally the caller will have calculated strpos
1101              * based on pos() and gofs, so the string is already correctly
1102              * anchored by definition; and handling the exceptions would
1103              * be too fiddly (e.g. REXEC_IGNOREPOS).
1104              */
1105             if (   strpos != strbeg
1106                 && (prog->intflags & PREGf_ANCH_SBOL))
1107             {
1108                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1109                                 "  Not at start...\n"));
1110                 goto fail;
1111             }
1112
1113             /* in the presence of an anchor, the anchored (relative to the
1114              * start of the regex) substr must also be anchored relative
1115              * to strpos. So quickly reject if substr isn't found there.
1116              * This works for \G too, because the caller will already have
1117              * subtracted gofs from pos, and gofs is the offset from the
1118              * \G to the start of the regex. For example, in /.abc\Gdef/,
1119              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1120              * caller will have set strpos=pos()-4; we look for the substr
1121              * at position pos()-4+1, which lines up with the "a" */
1122
1123             if (prog->check_offset_min == prog->check_offset_max) {
1124                 /* Substring at constant offset from beg-of-str... */
1125                 SSize_t slen = SvCUR(check);
1126                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1127
1128                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1129                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1130                     (IV)prog->check_offset_min));
1131
1132                 if (SvTAIL(check)) {
1133                     /* In this case, the regex is anchored at the end too.
1134                      * Unless it's a multiline match, the lengths must match
1135                      * exactly, give or take a \n.  NB: slen >= 1 since
1136                      * the last char of check is \n */
1137                     if (!multiline
1138                         && (   strend - s > slen
1139                             || strend - s < slen - 1
1140                             || (strend - s == slen && strend[-1] != '\n')))
1141                     {
1142                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1143                                             "  String too long...\n"));
1144                         goto fail_finish;
1145                     }
1146                     /* Now should match s[0..slen-2] */
1147                     slen--;
1148                 }
1149                 if (slen && (strend - s < slen
1150                     || *SvPVX_const(check) != *s
1151                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1152                 {
1153                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1154                                     "  String not equal...\n"));
1155                     goto fail_finish;
1156                 }
1157
1158                 check_at = s;
1159                 goto success_at_start;
1160             }
1161         }
1162     }
1163
1164     end_shift = prog->check_end_shift;
1165
1166 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1167     if (end_shift < 0)
1168         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1169                    (IV)end_shift, RX_PRECOMP(rx));
1170 #endif
1171
1172   restart:
1173
1174     /* This is the (re)entry point of the main loop in this function.
1175      * The goal of this loop is to:
1176      * 1) find the "check" substring in the region rx_origin..strend
1177      *    (adjusted by start_shift / end_shift). If not found, reject
1178      *    immediately.
1179      * 2) If it exists, look for the "other" substr too if defined; for
1180      *    example, if the check substr maps to the anchored substr, then
1181      *    check the floating substr, and vice-versa. If not found, go
1182      *    back to (1) with rx_origin suitably incremented.
1183      * 3) If we find an rx_origin position that doesn't contradict
1184      *    either of the substrings, then check the possible additional
1185      *    constraints on rx_origin of /^.../m or a known start class.
1186      *    If these fail, then depending on which constraints fail, jump
1187      *    back to here, or to various other re-entry points further along
1188      *    that skip some of the first steps.
1189      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1190      *    substring. If the start position was determined to be at the
1191      *    beginning of the string  - so, not rejected, but not optimised,
1192      *    since we have to run regmatch from position 0 - decrement the
1193      *    BmUSEFUL() count. Otherwise increment it.
1194      */
1195
1196
1197     /* first, look for the 'check' substring */
1198
1199     {
1200         U8* start_point;
1201         U8* end_point;
1202
1203         DEBUG_OPTIMISE_MORE_r({
1204             Perl_re_printf( aTHX_
1205                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1206                 " Start shift: %" IVdf " End shift %" IVdf
1207                 " Real end Shift: %" IVdf "\n",
1208                 (IV)(rx_origin - strbeg),
1209                 (IV)prog->check_offset_min,
1210                 (IV)start_shift,
1211                 (IV)end_shift,
1212                 (IV)prog->check_end_shift);
1213         });
1214
1215         end_point = HOPBACK3(strend, end_shift, rx_origin);
1216         if (!end_point)
1217             goto fail_finish;
1218         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1219         if (!start_point)
1220             goto fail_finish;
1221
1222
1223         /* If the regex is absolutely anchored to either the start of the
1224          * string (SBOL) or to pos() (ANCH_GPOS), then
1225          * check_offset_max represents an upper bound on the string where
1226          * the substr could start. For the ANCH_GPOS case, we assume that
1227          * the caller of intuit will have already set strpos to
1228          * pos()-gofs, so in this case strpos + offset_max will still be
1229          * an upper bound on the substr.
1230          */
1231         if (!ml_anch
1232             && prog->intflags & PREGf_ANCH
1233             && prog->check_offset_max != SSize_t_MAX)
1234         {
1235             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1236             const char * const anchor =
1237                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1238             SSize_t targ_len = (char*)end_point - anchor;
1239
1240             if (check_len > targ_len) {
1241                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1242                               "Target string too short to match required substring...\n"));
1243                 goto fail_finish;
1244             }
1245
1246             /* do a bytes rather than chars comparison. It's conservative;
1247              * so it skips doing the HOP if the result can't possibly end
1248              * up earlier than the old value of end_point.
1249              */
1250             assert(anchor + check_len <= (char *)end_point);
1251             if (prog->check_offset_max + check_len < targ_len) {
1252                 end_point = HOP3lim((U8*)anchor,
1253                                 prog->check_offset_max,
1254                                 end_point - check_len
1255                             )
1256                             + check_len;
1257                 if (end_point < start_point)
1258                     goto fail_finish;
1259             }
1260         }
1261
1262         check_at = fbm_instr( start_point, end_point,
1263                       check, multiline ? FBMrf_MULTILINE : 0);
1264
1265         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1266             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1267             (IV)((char*)start_point - strbeg),
1268             (IV)((char*)end_point   - strbeg),
1269             (IV)(check_at ? check_at - strbeg : -1)
1270         ));
1271
1272         /* Update the count-of-usability, remove useless subpatterns,
1273             unshift s.  */
1274
1275         DEBUG_EXECUTE_r({
1276             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1277                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1278             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1279                               (check_at ? "Found" : "Did not find"),
1280                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1281                     ? "anchored" : "floating"),
1282                 quoted,
1283                 RE_SV_TAIL(check),
1284                 (check_at ? " at offset " : "...\n") );
1285         });
1286
1287         if (!check_at)
1288             goto fail_finish;
1289         /* set rx_origin to the minimum position where the regex could start
1290          * matching, given the constraint of the just-matched check substring.
1291          * But don't set it lower than previously.
1292          */
1293
1294         if (check_at - rx_origin > prog->check_offset_max)
1295             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1296         /* Finish the diagnostic message */
1297         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298             "%ld (rx_origin now %" IVdf ")...\n",
1299             (long)(check_at - strbeg),
1300             (IV)(rx_origin - strbeg)
1301         ));
1302     }
1303
1304
1305     /* now look for the 'other' substring if defined */
1306
1307     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1308                     : prog->substrs->data[other_ix].substr)
1309     {
1310         /* Take into account the "other" substring. */
1311         char *last, *last1;
1312         char *s;
1313         SV* must;
1314         struct reg_substr_datum *other;
1315
1316       do_other_substr:
1317         other = &prog->substrs->data[other_ix];
1318
1319         /* if "other" is anchored:
1320          * we've previously found a floating substr starting at check_at.
1321          * This means that the regex origin must lie somewhere
1322          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1323          * and max:                 HOP3(check_at, -check_offset_min)
1324          * (except that min will be >= strpos)
1325          * So the fixed  substr must lie somewhere between
1326          *  HOP3(min, anchored_offset)
1327          *  HOP3(max, anchored_offset) + SvCUR(substr)
1328          */
1329
1330         /* if "other" is floating
1331          * Calculate last1, the absolute latest point where the
1332          * floating substr could start in the string, ignoring any
1333          * constraints from the earlier fixed match. It is calculated
1334          * as follows:
1335          *
1336          * strend - prog->minlen (in chars) is the absolute latest
1337          * position within the string where the origin of the regex
1338          * could appear. The latest start point for the floating
1339          * substr is float_min_offset(*) on from the start of the
1340          * regex.  last1 simply combines thee two offsets.
1341          *
1342          * (*) You might think the latest start point should be
1343          * float_max_offset from the regex origin, and technically
1344          * you'd be correct. However, consider
1345          *    /a\d{2,4}bcd\w/
1346          * Here, float min, max are 3,5 and minlen is 7.
1347          * This can match either
1348          *    /a\d\dbcd\w/
1349          *    /a\d\d\dbcd\w/
1350          *    /a\d\d\d\dbcd\w/
1351          * In the first case, the regex matches minlen chars; in the
1352          * second, minlen+1, in the third, minlen+2.
1353          * In the first case, the floating offset is 3 (which equals
1354          * float_min), in the second, 4, and in the third, 5 (which
1355          * equals float_max). In all cases, the floating string bcd
1356          * can never start more than 4 chars from the end of the
1357          * string, which equals minlen - float_min. As the substring
1358          * starts to match more than float_min from the start of the
1359          * regex, it makes the regex match more than minlen chars,
1360          * and the two cancel each other out. So we can always use
1361          * float_min - minlen, rather than float_max - minlen for the
1362          * latest position in the string.
1363          *
1364          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1365          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1366          */
1367
1368         assert(prog->minlen >= other->min_offset);
1369         last1 = HOP3c(strend,
1370                         other->min_offset - prog->minlen, strbeg);
1371
1372         if (other_ix) {/* i.e. if (other-is-float) */
1373             /* last is the latest point where the floating substr could
1374              * start, *given* any constraints from the earlier fixed
1375              * match. This constraint is that the floating string starts
1376              * <= float_max_offset chars from the regex origin (rx_origin).
1377              * If this value is less than last1, use it instead.
1378              */
1379             assert(rx_origin <= last1);
1380             last =
1381                 /* this condition handles the offset==infinity case, and
1382                  * is a short-cut otherwise. Although it's comparing a
1383                  * byte offset to a char length, it does so in a safe way,
1384                  * since 1 char always occupies 1 or more bytes,
1385                  * so if a string range is  (last1 - rx_origin) bytes,
1386                  * it will be less than or equal to  (last1 - rx_origin)
1387                  * chars; meaning it errs towards doing the accurate HOP3
1388                  * rather than just using last1 as a short-cut */
1389                 (last1 - rx_origin) < other->max_offset
1390                     ? last1
1391                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1392         }
1393         else {
1394             assert(strpos + start_shift <= check_at);
1395             last = HOP4c(check_at, other->min_offset - start_shift,
1396                         strbeg, strend);
1397         }
1398
1399         s = HOP3c(rx_origin, other->min_offset, strend);
1400         if (s < other_last)     /* These positions already checked */
1401             s = other_last;
1402
1403         must = utf8_target ? other->utf8_substr : other->substr;
1404         assert(SvPOK(must));
1405         {
1406             char *from = s;
1407             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1408
1409             if (to > strend)
1410                 to = strend;
1411             if (from > to) {
1412                 s = NULL;
1413                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1414                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1415                     (IV)(from - strbeg),
1416                     (IV)(to   - strbeg)
1417                 ));
1418             }
1419             else {
1420                 s = fbm_instr(
1421                     (unsigned char*)from,
1422                     (unsigned char*)to,
1423                     must,
1424                     multiline ? FBMrf_MULTILINE : 0
1425                 );
1426                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1427                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1428                     (IV)(from - strbeg),
1429                     (IV)(to   - strbeg),
1430                     (IV)(s ? s - strbeg : -1)
1431                 ));
1432             }
1433         }
1434
1435         DEBUG_EXECUTE_r({
1436             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1437                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1438             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1439                 s ? "Found" : "Contradicts",
1440                 other_ix ? "floating" : "anchored",
1441                 quoted, RE_SV_TAIL(must));
1442         });
1443
1444
1445         if (!s) {
1446             /* last1 is latest possible substr location. If we didn't
1447              * find it before there, we never will */
1448             if (last >= last1) {
1449                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1450                                         "; giving up...\n"));
1451                 goto fail_finish;
1452             }
1453
1454             /* try to find the check substr again at a later
1455              * position. Maybe next time we'll find the "other" substr
1456              * in range too */
1457             other_last = HOP3c(last, 1, strend) /* highest failure */;
1458             rx_origin =
1459                 other_ix /* i.e. if other-is-float */
1460                     ? HOP3c(rx_origin, 1, strend)
1461                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1462             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1463                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1464                 (other_ix ? "floating" : "anchored"),
1465                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1466                 (IV)(rx_origin - strbeg)
1467             ));
1468             goto restart;
1469         }
1470         else {
1471             if (other_ix) { /* if (other-is-float) */
1472                 /* other_last is set to s, not s+1, since its possible for
1473                  * a floating substr to fail first time, then succeed
1474                  * second time at the same floating position; e.g.:
1475                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1476                  * The first time round, anchored and float match at
1477                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1478                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1479                  */
1480                 other_last = s;
1481             }
1482             else {
1483                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1484                 other_last = HOP3c(s, 1, strend);
1485             }
1486             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1487                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1488                   (long)(s - strbeg),
1489                 (IV)(rx_origin - strbeg)
1490               ));
1491
1492         }
1493     }
1494     else {
1495         DEBUG_OPTIMISE_MORE_r(
1496             Perl_re_printf( aTHX_
1497                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1498                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1499                 " strend:%" IVdf "\n",
1500                 (IV)prog->check_offset_min,
1501                 (IV)prog->check_offset_max,
1502                 (IV)(check_at-strbeg),
1503                 (IV)(rx_origin-strbeg),
1504                 (IV)(rx_origin-check_at),
1505                 (IV)(strend-strbeg)
1506             )
1507         );
1508     }
1509
1510   postprocess_substr_matches:
1511
1512     /* handle the extra constraint of /^.../m if present */
1513
1514     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1515         char *s;
1516
1517         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1518                         "  looking for /^/m anchor"));
1519
1520         /* we have failed the constraint of a \n before rx_origin.
1521          * Find the next \n, if any, even if it's beyond the current
1522          * anchored and/or floating substrings. Whether we should be
1523          * scanning ahead for the next \n or the next substr is debatable.
1524          * On the one hand you'd expect rare substrings to appear less
1525          * often than \n's. On the other hand, searching for \n means
1526          * we're effectively flipping between check_substr and "\n" on each
1527          * iteration as the current "rarest" string candidate, which
1528          * means for example that we'll quickly reject the whole string if
1529          * hasn't got a \n, rather than trying every substr position
1530          * first
1531          */
1532
1533         s = HOP3c(strend, - prog->minlen, strpos);
1534         if (s <= rx_origin ||
1535             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1536         {
1537             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1538                             "  Did not find /%s^%s/m...\n",
1539                             PL_colors[0], PL_colors[1]));
1540             goto fail_finish;
1541         }
1542
1543         /* earliest possible origin is 1 char after the \n.
1544          * (since *rx_origin == '\n', it's safe to ++ here rather than
1545          * HOP(rx_origin, 1)) */
1546         rx_origin++;
1547
1548         if (prog->substrs->check_ix == 0  /* check is anchored */
1549             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1550         {
1551             /* Position contradicts check-string; either because
1552              * check was anchored (and thus has no wiggle room),
1553              * or check was float and rx_origin is above the float range */
1554             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1555                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1556                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1557             goto restart;
1558         }
1559
1560         /* if we get here, the check substr must have been float,
1561          * is in range, and we may or may not have had an anchored
1562          * "other" substr which still contradicts */
1563         assert(prog->substrs->check_ix); /* check is float */
1564
1565         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1566             /* whoops, the anchored "other" substr exists, so we still
1567              * contradict. On the other hand, the float "check" substr
1568              * didn't contradict, so just retry the anchored "other"
1569              * substr */
1570             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1571                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1572                 PL_colors[0], PL_colors[1],
1573                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1574                 (IV)(rx_origin - strbeg)
1575             ));
1576             goto do_other_substr;
1577         }
1578
1579         /* success: we don't contradict the found floating substring
1580          * (and there's no anchored substr). */
1581         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1582             "  Found /%s^%s/m with rx_origin %ld...\n",
1583             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1584     }
1585     else {
1586         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1587             "  (multiline anchor test skipped)\n"));
1588     }
1589
1590   success_at_start:
1591
1592
1593     /* if we have a starting character class, then test that extra constraint.
1594      * (trie stclasses are too expensive to use here, we are better off to
1595      * leave it to regmatch itself) */
1596
1597     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1598         const U8* const str = (U8*)STRING(progi->regstclass);
1599
1600         /* XXX this value could be pre-computed */
1601         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1602                     ?  (reginfo->is_utf8_pat
1603                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1604                         : STR_LEN(progi->regstclass))
1605                     : 1);
1606         char * endpos;
1607         char *s;
1608         /* latest pos that a matching float substr constrains rx start to */
1609         char *rx_max_float = NULL;
1610
1611         /* if the current rx_origin is anchored, either by satisfying an
1612          * anchored substring constraint, or a /^.../m constraint, then we
1613          * can reject the current origin if the start class isn't found
1614          * at the current position. If we have a float-only match, then
1615          * rx_origin is constrained to a range; so look for the start class
1616          * in that range. if neither, then look for the start class in the
1617          * whole rest of the string */
1618
1619         /* XXX DAPM it's not clear what the minlen test is for, and why
1620          * it's not used in the floating case. Nothing in the test suite
1621          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1622          * Here are some old comments, which may or may not be correct:
1623          *
1624          *   minlen == 0 is possible if regstclass is \b or \B,
1625          *   and the fixed substr is ''$.
1626          *   Since minlen is already taken into account, rx_origin+1 is
1627          *   before strend; accidentally, minlen >= 1 guaranties no false
1628          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1629          *   0) below assumes that regstclass does not come from lookahead...
1630          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1631          *   This leaves EXACTF-ish only, which are dealt with in
1632          *   find_byclass().
1633          */
1634
1635         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1636             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1637         else if (prog->float_substr || prog->float_utf8) {
1638             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1639             endpos = HOP3clim(rx_max_float, cl_l, strend);
1640         }
1641         else
1642             endpos= strend;
1643
1644         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1645             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1646             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1647               (IV)start_shift, (IV)(check_at - strbeg),
1648               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1649
1650         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1651                             reginfo);
1652         if (!s) {
1653             if (endpos == strend) {
1654                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1655                                 "  Could not match STCLASS...\n") );
1656                 goto fail;
1657             }
1658             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1659                                "  This position contradicts STCLASS...\n") );
1660             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1661                         && !(prog->intflags & PREGf_IMPLICIT))
1662                 goto fail;
1663
1664             /* Contradict one of substrings */
1665             if (prog->anchored_substr || prog->anchored_utf8) {
1666                 if (prog->substrs->check_ix == 1) { /* check is float */
1667                     /* Have both, check_string is floating */
1668                     assert(rx_origin + start_shift <= check_at);
1669                     if (rx_origin + start_shift != check_at) {
1670                         /* not at latest position float substr could match:
1671                          * Recheck anchored substring, but not floating.
1672                          * The condition above is in bytes rather than
1673                          * chars for efficiency. It's conservative, in
1674                          * that it errs on the side of doing 'goto
1675                          * do_other_substr'. In this case, at worst,
1676                          * an extra anchored search may get done, but in
1677                          * practice the extra fbm_instr() is likely to
1678                          * get skipped anyway. */
1679                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1680                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1681                             (long)(other_last - strbeg),
1682                             (IV)(rx_origin - strbeg)
1683                         ));
1684                         goto do_other_substr;
1685                     }
1686                 }
1687             }
1688             else {
1689                 /* float-only */
1690
1691                 if (ml_anch) {
1692                     /* In the presence of ml_anch, we might be able to
1693                      * find another \n without breaking the current float
1694                      * constraint. */
1695
1696                     /* strictly speaking this should be HOP3c(..., 1, ...),
1697                      * but since we goto a block of code that's going to
1698                      * search for the next \n if any, its safe here */
1699                     rx_origin++;
1700                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1701                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1702                               PL_colors[0], PL_colors[1],
1703                               (long)(rx_origin - strbeg)) );
1704                     goto postprocess_substr_matches;
1705                 }
1706
1707                 /* strictly speaking this can never be true; but might
1708                  * be if we ever allow intuit without substrings */
1709                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1710                     goto fail;
1711
1712                 rx_origin = rx_max_float;
1713             }
1714
1715             /* at this point, any matching substrings have been
1716              * contradicted. Start again... */
1717
1718             rx_origin = HOP3c(rx_origin, 1, strend);
1719
1720             /* uses bytes rather than char calculations for efficiency.
1721              * It's conservative: it errs on the side of doing 'goto restart',
1722              * where there is code that does a proper char-based test */
1723             if (rx_origin + start_shift + end_shift > strend) {
1724                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1725                                        "  Could not match STCLASS...\n") );
1726                 goto fail;
1727             }
1728             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1729                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1730                 (prog->substrs->check_ix ? "floating" : "anchored"),
1731                 (long)(rx_origin + start_shift - strbeg),
1732                 (IV)(rx_origin - strbeg)
1733             ));
1734             goto restart;
1735         }
1736
1737         /* Success !!! */
1738
1739         if (rx_origin != s) {
1740             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1741                         "  By STCLASS: moving %ld --> %ld\n",
1742                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1743                    );
1744         }
1745         else {
1746             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1747                                   "  Does not contradict STCLASS...\n");
1748                    );
1749         }
1750     }
1751
1752     /* Decide whether using the substrings helped */
1753
1754     if (rx_origin != strpos) {
1755         /* Fixed substring is found far enough so that the match
1756            cannot start at strpos. */
1757
1758         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1759         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1760     }
1761     else {
1762         /* The found rx_origin position does not prohibit matching at
1763          * strpos, so calling intuit didn't gain us anything. Decrement
1764          * the BmUSEFUL() count on the check substring, and if we reach
1765          * zero, free it.  */
1766         if (!(prog->intflags & PREGf_NAUGHTY)
1767             && (utf8_target ? (
1768                 prog->check_utf8                /* Could be deleted already */
1769                 && --BmUSEFUL(prog->check_utf8) < 0
1770                 && (prog->check_utf8 == prog->float_utf8)
1771             ) : (
1772                 prog->check_substr              /* Could be deleted already */
1773                 && --BmUSEFUL(prog->check_substr) < 0
1774                 && (prog->check_substr == prog->float_substr)
1775             )))
1776         {
1777             /* If flags & SOMETHING - do not do it many times on the same match */
1778             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1779             /* XXX Does the destruction order has to change with utf8_target? */
1780             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1781             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1782             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1783             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1784             check = NULL;                       /* abort */
1785             /* XXXX This is a remnant of the old implementation.  It
1786                     looks wasteful, since now INTUIT can use many
1787                     other heuristics. */
1788             prog->extflags &= ~RXf_USE_INTUIT;
1789         }
1790     }
1791
1792     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1793             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1794              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1795
1796     return rx_origin;
1797
1798   fail_finish:                          /* Substring not found */
1799     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1800         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1801   fail:
1802     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1803                           PL_colors[4], PL_colors[5]));
1804     return NULL;
1805 }
1806
1807
1808 #define DECL_TRIE_TYPE(scan) \
1809     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1810                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1811                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1812                     trie_type = ((scan->flags == EXACT)                             \
1813                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1814                                  : (scan->flags == EXACTL)                          \
1815                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1816                                     : (scan->flags == EXACTFAA)                     \
1817                                       ? (utf8_target                                \
1818                                          ? trie_utf8_exactfa_fold                   \
1819                                          : trie_latin_utf8_exactfa_fold)            \
1820                                       : (scan->flags == EXACTFLU8                   \
1821                                          ? (utf8_target                             \
1822                                            ? trie_flu8                              \
1823                                            : trie_flu8_latin)                       \
1824                                          : (utf8_target                             \
1825                                            ? trie_utf8_fold                         \
1826                                            : trie_latin_utf8_fold)))
1827
1828 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1829  * 'foldbuf+sizeof(foldbuf)' */
1830 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1831 STMT_START {                                                                        \
1832     STRLEN skiplen;                                                                 \
1833     U8 flags = FOLD_FLAGS_FULL;                                                     \
1834     switch (trie_type) {                                                            \
1835     case trie_flu8:                                                                 \
1836         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1837         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1838             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1839         }                                                                           \
1840         goto do_trie_utf8_fold;                                                     \
1841     case trie_utf8_exactfa_fold:                                                    \
1842         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1843         /* FALLTHROUGH */                                                           \
1844     case trie_utf8_fold:                                                            \
1845       do_trie_utf8_fold:                                                            \
1846         if ( foldlen>0 ) {                                                          \
1847             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1848             foldlen -= len;                                                         \
1849             uscan += len;                                                           \
1850             len=0;                                                                  \
1851         } else {                                                                    \
1852             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1853                                                                             flags); \
1854             len = UTF8SKIP(uc);                                                     \
1855             skiplen = UVCHR_SKIP( uvc );                                            \
1856             foldlen -= skiplen;                                                     \
1857             uscan = foldbuf + skiplen;                                              \
1858         }                                                                           \
1859         break;                                                                      \
1860     case trie_flu8_latin:                                                           \
1861         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1862         goto do_trie_latin_utf8_fold;                                               \
1863     case trie_latin_utf8_exactfa_fold:                                              \
1864         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1865         /* FALLTHROUGH */                                                           \
1866     case trie_latin_utf8_fold:                                                      \
1867       do_trie_latin_utf8_fold:                                                      \
1868         if ( foldlen>0 ) {                                                          \
1869             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1870             foldlen -= len;                                                         \
1871             uscan += len;                                                           \
1872             len=0;                                                                  \
1873         } else {                                                                    \
1874             len = 1;                                                                \
1875             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1876             skiplen = UVCHR_SKIP( uvc );                                            \
1877             foldlen -= skiplen;                                                     \
1878             uscan = foldbuf + skiplen;                                              \
1879         }                                                                           \
1880         break;                                                                      \
1881     case trie_utf8l:                                                                \
1882         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1883         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1884             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1885         }                                                                           \
1886         /* FALLTHROUGH */                                                           \
1887     case trie_utf8:                                                                 \
1888         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1889         break;                                                                      \
1890     case trie_plain:                                                                \
1891         uvc = (UV)*uc;                                                              \
1892         len = 1;                                                                    \
1893     }                                                                               \
1894     if (uvc < 256) {                                                                \
1895         charid = trie->charmap[ uvc ];                                              \
1896     }                                                                               \
1897     else {                                                                          \
1898         charid = 0;                                                                 \
1899         if (widecharmap) {                                                          \
1900             SV** const svpp = hv_fetch(widecharmap,                                 \
1901                         (char*)&uvc, sizeof(UV), 0);                                \
1902             if (svpp)                                                               \
1903                 charid = (U16)SvIV(*svpp);                                          \
1904         }                                                                           \
1905     }                                                                               \
1906 } STMT_END
1907
1908 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1909     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1910                 startpos, doutf8, depth)
1911
1912 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1913     STMT_START {                                            \
1914         while (s < strend) {                                \
1915             CODE                                            \
1916             s += ((UTF8) ? UTF8SKIP(s) : 1);                \
1917         }                                                   \
1918     } STMT_END
1919
1920 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1921     STMT_START {                                            \
1922         while (s < strend) {                                \
1923             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1924         }                                                   \
1925     } STMT_END
1926
1927 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1928     if (COND) {                                                \
1929         FBC_CHECK_AND_TRY                                      \
1930         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1931         previous_occurrence_end = s;                           \
1932     }                                                          \
1933     else {                                                     \
1934         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1935     }
1936
1937 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1938     if (utf8_target) {                                         \
1939         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1940     }                                                          \
1941     else {                                                     \
1942         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1943     }
1944
1945 /* We keep track of where the next character should start after an occurrence
1946  * of the one we're looking for.  Knowing that, we can see right away if the
1947  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1948  * don't accept the 2nd and succeeding adjacent occurrences */
1949 #define FBC_CHECK_AND_TRY                                      \
1950         if (   (   doevery                                     \
1951                 || s != previous_occurrence_end)               \
1952             && (reginfo->intuit || regtry(reginfo, &s)))       \
1953         {                                                      \
1954             goto got_it;                                       \
1955         }
1956
1957
1958 /* This differs from the above macros in that it calls a function which returns
1959  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1960  * there is no such occurrence. */
1961 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1962     while (s < strend) {                                    \
1963         s = (f);                                            \
1964         if (s >= strend) {                                  \
1965             break;                                          \
1966         }                                                   \
1967                                                             \
1968         FBC_CHECK_AND_TRY                                   \
1969         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1970         previous_occurrence_end = s;                        \
1971     }
1972
1973 /* The three macros below are slightly different versions of the same logic.
1974  *
1975  * The first is for /a and /aa when the target string is UTF-8.  This can only
1976  * match ascii, but it must advance based on UTF-8.   The other two handle the
1977  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1978  * for the boundary (or non-boundary) between a word and non-word character.
1979  * The utf8 and non-utf8 cases have the same logic, but the details must be
1980  * different.  Find the "wordness" of the character just prior to this one, and
1981  * compare it with the wordness of this one.  If they differ, we have a
1982  * boundary.  At the beginning of the string, pretend that the previous
1983  * character was a new-line.
1984  *
1985  * All these macros uncleanly have side-effects with each other and outside
1986  * variables.  So far it's been too much trouble to clean-up
1987  *
1988  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1989  *               a word character or not.
1990  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1991  *               word/non-word
1992  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1993  *
1994  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1995  * are looking for a boundary or for a non-boundary.  If we are looking for a
1996  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1997  * see if this tentative match actually works, and if so, to quit the loop
1998  * here.  And vice-versa if we are looking for a non-boundary.
1999  *
2000  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
2001  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
2002  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
2003  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2004  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2005  * complement.  But in that branch we complement tmp, meaning that at the
2006  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2007  * which means at the top of the loop in the next iteration, it is
2008  * TEST_NON_UTF8(s-1) */
2009 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
2010     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
2011     tmp = TEST_NON_UTF8(tmp);                                                  \
2012     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2013         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
2014             tmp = !tmp;                                                        \
2015             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
2016         }                                                                      \
2017         else {                                                                 \
2018             IF_FAIL;                                                           \
2019         }                                                                      \
2020     );                                                                         \
2021
2022 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2023  * TEST_UTF8 is a macro that for the same input code points returns identically
2024  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
2025 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
2026     if (s == reginfo->strbeg) {                                                \
2027         tmp = '\n';                                                            \
2028     }                                                                          \
2029     else { /* Back-up to the start of the previous character */                \
2030         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
2031         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
2032                                                        0, UTF8_ALLOW_DEFAULT); \
2033     }                                                                          \
2034     tmp = TEST_UV(tmp);                                                        \
2035     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2036         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2037             tmp = !tmp;                                                        \
2038             IF_SUCCESS;                                                        \
2039         }                                                                      \
2040         else {                                                                 \
2041             IF_FAIL;                                                           \
2042         }                                                                      \
2043     );
2044
2045 /* Like the above two macros.  UTF8_CODE is the complete code for handling
2046  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2047  * macros below */
2048 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
2049     if (utf8_target) {                                                         \
2050         UTF8_CODE                                                              \
2051     }                                                                          \
2052     else {  /* Not utf8 */                                                     \
2053         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
2054         tmp = TEST_NON_UTF8(tmp);                                              \
2055         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
2056             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
2057                 IF_SUCCESS;                                                    \
2058                 tmp = !tmp;                                                    \
2059             }                                                                  \
2060             else {                                                             \
2061                 IF_FAIL;                                                       \
2062             }                                                                  \
2063         );                                                                     \
2064     }                                                                          \
2065     /* Here, things have been set up by the previous code so that tmp is the   \
2066      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
2067      * utf8ness of the target).  We also have to check if this matches against \
2068      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
2069      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
2070      * string */                                                               \
2071     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2072         IF_SUCCESS;                                                            \
2073     }                                                                          \
2074     else {                                                                     \
2075         IF_FAIL;                                                               \
2076     }
2077
2078 /* This is the macro to use when we want to see if something that looks like it
2079  * could match, actually does, and if so exits the loop */
2080 #define REXEC_FBC_TRYIT                            \
2081     if ((reginfo->intuit || regtry(reginfo, &s)))  \
2082         goto got_it
2083
2084 /* The only difference between the BOUND and NBOUND cases is that
2085  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2086  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2087  * with the other one being empty (PLACEHOLDER is defined as empty).
2088  *
2089  * The TEST_FOO parameters are for operating on different forms of input, but
2090  * all should be ones that return identically for the same underlying code
2091  * points */
2092 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
2093     FBC_BOUND_COMMON(                                                          \
2094           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
2095           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2096
2097 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2098     FBC_BOUND_COMMON(                                                          \
2099             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2100             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2101
2102 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2103     FBC_BOUND_COMMON(                                                          \
2104           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2105           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2106
2107 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2108     FBC_BOUND_COMMON(                                                          \
2109             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2110             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2111
2112 #ifdef DEBUGGING
2113 static IV
2114 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2115   IV cp_out = _invlist_search(invlist, cp_in);
2116   assert(cp_out >= 0);
2117   return cp_out;
2118 }
2119 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2120         invmap[S_get_break_val_cp_checked(invlist, cp)]
2121 #else
2122 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2123         invmap[_invlist_search(invlist, cp)]
2124 #endif
2125
2126 /* Takes a pointer to an inversion list, a pointer to its corresponding
2127  * inversion map, and a code point, and returns the code point's value
2128  * according to the two arrays.  It assumes that all code points have a value.
2129  * This is used as the base macro for macros for particular properties */
2130 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2131         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2132
2133 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2134  * of a code point, returning the value for the first code point in the string.
2135  * And it takes the particular macro name that finds the desired value given a
2136  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2137 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2138              (__ASSERT_(pos < strend)                                          \
2139                  /* Note assumes is valid UTF-8 */                             \
2140              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2141
2142 /* Returns the GCB value for the input code point */
2143 #define getGCB_VAL_CP(cp)                                                      \
2144           _generic_GET_BREAK_VAL_CP(                                           \
2145                                     PL_GCB_invlist,                            \
2146                                     _Perl_GCB_invmap,                          \
2147                                     (cp))
2148
2149 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2150  * bounded by pos and strend */
2151 #define getGCB_VAL_UTF8(pos, strend)                                           \
2152     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2153
2154 /* Returns the LB value for the input code point */
2155 #define getLB_VAL_CP(cp)                                                       \
2156           _generic_GET_BREAK_VAL_CP(                                           \
2157                                     PL_LB_invlist,                             \
2158                                     _Perl_LB_invmap,                           \
2159                                     (cp))
2160
2161 /* Returns the LB value for the first code point in the UTF-8 encoded string
2162  * bounded by pos and strend */
2163 #define getLB_VAL_UTF8(pos, strend)                                            \
2164     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2165
2166
2167 /* Returns the SB value for the input code point */
2168 #define getSB_VAL_CP(cp)                                                       \
2169           _generic_GET_BREAK_VAL_CP(                                           \
2170                                     PL_SB_invlist,                             \
2171                                     _Perl_SB_invmap,                     \
2172                                     (cp))
2173
2174 /* Returns the SB value for the first code point in the UTF-8 encoded string
2175  * bounded by pos and strend */
2176 #define getSB_VAL_UTF8(pos, strend)                                            \
2177     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2178
2179 /* Returns the WB value for the input code point */
2180 #define getWB_VAL_CP(cp)                                                       \
2181           _generic_GET_BREAK_VAL_CP(                                           \
2182                                     PL_WB_invlist,                             \
2183                                     _Perl_WB_invmap,                         \
2184                                     (cp))
2185
2186 /* Returns the WB value for the first code point in the UTF-8 encoded string
2187  * bounded by pos and strend */
2188 #define getWB_VAL_UTF8(pos, strend)                                            \
2189     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2190
2191 /* We know what class REx starts with.  Try to find this position... */
2192 /* if reginfo->intuit, its a dryrun */
2193 /* annoyingly all the vars in this routine have different names from their counterparts
2194    in regmatch. /grrr */
2195 STATIC char *
2196 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2197     const char *strend, regmatch_info *reginfo)
2198 {
2199     dVAR;
2200
2201     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2202     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2203
2204     char *pat_string;   /* The pattern's exactish string */
2205     char *pat_end;          /* ptr to end char of pat_string */
2206     re_fold_t folder;   /* Function for computing non-utf8 folds */
2207     const U8 *fold_array;   /* array for folding ords < 256 */
2208     STRLEN ln;
2209     STRLEN lnc;
2210     U8 c1;
2211     U8 c2;
2212     char *e;
2213
2214     /* In some cases we accept only the first occurence of 'x' in a sequence of
2215      * them.  This variable points to just beyond the end of the previous
2216      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2217      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2218      * hop back.) */
2219     char * previous_occurrence_end = 0;
2220
2221     I32 tmp;            /* Scratch variable */
2222     const bool utf8_target = reginfo->is_utf8_target;
2223     UV utf8_fold_flags = 0;
2224     const bool is_utf8_pat = reginfo->is_utf8_pat;
2225     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2226                                    with a result inverts that result, as 0^1 =
2227                                    1 and 1^1 = 0 */
2228     _char_class_number classnum;
2229
2230     RXi_GET_DECL(prog,progi);
2231
2232     PERL_ARGS_ASSERT_FIND_BYCLASS;
2233
2234     /* We know what class it must start with. */
2235     switch (OP(c)) {
2236     case ANYOFPOSIXL:
2237     case ANYOFL:
2238         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2239
2240         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2241             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2242         }
2243
2244         /* FALLTHROUGH */
2245     case ANYOFD:
2246     case ANYOF:
2247         if (utf8_target) {
2248             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2249                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2250         }
2251         else if (ANYOF_FLAGS(c)) {
2252             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2253         }
2254         else {
2255             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2256         }
2257         break;
2258
2259     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2260         /* UTF-8ness doesn't matter, so use 0 */
2261         REXEC_FBC_FIND_NEXT_SCAN(0,
2262          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2263                                    (U8) ARG(c), FLAGS(c)));
2264         break;
2265
2266     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2267         assert(! is_utf8_pat);
2268         /* FALLTHROUGH */
2269     case EXACTFAA:
2270         if (is_utf8_pat || utf8_target) {
2271             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2272             goto do_exactf_utf8;
2273         }
2274         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
2275         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
2276         goto do_exactf_non_utf8;        /* isn't dealt with by these */
2277
2278     case EXACTF:   /* This node only generated for non-utf8 patterns */
2279         assert(! is_utf8_pat);
2280         if (utf8_target) {
2281             utf8_fold_flags = 0;
2282             goto do_exactf_utf8;
2283         }
2284         fold_array = PL_fold;
2285         folder = foldEQ;
2286         goto do_exactf_non_utf8;
2287
2288     case EXACTFL:
2289         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2290         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2291             utf8_fold_flags = FOLDEQ_LOCALE;
2292             goto do_exactf_utf8;
2293         }
2294         fold_array = PL_fold_locale;
2295         folder = foldEQ_locale;
2296         goto do_exactf_non_utf8;
2297
2298     case EXACTFU_SS:
2299         if (is_utf8_pat) {
2300             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2301         }
2302         goto do_exactf_utf8;
2303
2304     case EXACTFLU8:
2305             if (! utf8_target) {    /* All code points in this node require
2306                                        UTF-8 to express.  */
2307                 break;
2308             }
2309             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2310                                              | FOLDEQ_S2_FOLDS_SANE;
2311             goto do_exactf_utf8;
2312
2313     case EXACTFU:
2314         if (is_utf8_pat || utf8_target) {
2315             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2316             goto do_exactf_utf8;
2317         }
2318
2319         /* Any 'ss' in the pattern should have been replaced by regcomp,
2320          * so we don't have to worry here about this single special case
2321          * in the Latin1 range */
2322         fold_array = PL_fold_latin1;
2323         folder = foldEQ_latin1;
2324
2325         /* FALLTHROUGH */
2326
2327       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2328                            are no glitches with fold-length differences
2329                            between the target string and pattern */
2330
2331         /* The idea in the non-utf8 EXACTF* cases is to first find the
2332          * first character of the EXACTF* node and then, if necessary,
2333          * case-insensitively compare the full text of the node.  c1 is the
2334          * first character.  c2 is its fold.  This logic will not work for
2335          * Unicode semantics and the german sharp ss, which hence should
2336          * not be compiled into a node that gets here. */
2337         pat_string = STRING(c);
2338         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2339
2340         /* We know that we have to match at least 'ln' bytes (which is the
2341          * same as characters, since not utf8).  If we have to match 3
2342          * characters, and there are only 2 availabe, we know without
2343          * trying that it will fail; so don't start a match past the
2344          * required minimum number from the far end */
2345         e = HOP3c(strend, -((SSize_t)ln), s);
2346         if (e < s)
2347             break;
2348
2349         c1 = *pat_string;
2350         c2 = fold_array[c1];
2351         if (c1 == c2) { /* If char and fold are the same */
2352             while (s <= e) {
2353                 s = (char *) memchr(s, c1, e + 1 - s);
2354                 if (s == NULL) {
2355                     break;
2356                 }
2357
2358                 /* Check that the rest of the node matches */
2359                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2360                     && (reginfo->intuit || regtry(reginfo, &s)) )
2361                 {
2362                     goto got_it;
2363                 }
2364                 s++;
2365             }
2366         }
2367         else {
2368             U8 bits_differing = c1 ^ c2;
2369
2370             /* If the folds differ in one bit position only, we can mask to
2371              * match either of them, and can use this faster find method.  Both
2372              * ASCII and EBCDIC tend to have their case folds differ in only
2373              * one position, so this is very likely */
2374             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2375                 bits_differing = ~ bits_differing;
2376                 while (s <= e) {
2377                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2378                                         (c1 & bits_differing), bits_differing);
2379                     if (s > e) {
2380                         break;
2381                     }
2382
2383                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2384                         && (reginfo->intuit || regtry(reginfo, &s)) )
2385                     {
2386                         goto got_it;
2387                     }
2388                     s++;
2389                 }
2390             }
2391             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2392                        should actually happen only in EXACTFL nodes */
2393                 while (s <= e) {
2394                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2395                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2396                         && (reginfo->intuit || regtry(reginfo, &s)) )
2397                     {
2398                         goto got_it;
2399                     }
2400                     s++;
2401                 }
2402             }
2403         }
2404         break;
2405
2406       do_exactf_utf8:
2407       {
2408         unsigned expansion;
2409
2410         /* If one of the operands is in utf8, we can't use the simpler folding
2411          * above, due to the fact that many different characters can have the
2412          * same fold, or portion of a fold, or different- length fold */
2413         pat_string = STRING(c);
2414         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2415         pat_end = pat_string + ln;
2416         lnc = is_utf8_pat       /* length to match in characters */
2417                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2418                 : ln;
2419
2420         /* We have 'lnc' characters to match in the pattern, but because of
2421          * multi-character folding, each character in the target can match
2422          * up to 3 characters (Unicode guarantees it will never exceed
2423          * this) if it is utf8-encoded; and up to 2 if not (based on the
2424          * fact that the Latin 1 folds are already determined, and the
2425          * only multi-char fold in that range is the sharp-s folding to
2426          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2427          * string character.  Adjust lnc accordingly, rounding up, so that
2428          * if we need to match at least 4+1/3 chars, that really is 5. */
2429         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2430         lnc = (lnc + expansion - 1) / expansion;
2431
2432         /* As in the non-UTF8 case, if we have to match 3 characters, and
2433          * only 2 are left, it's guaranteed to fail, so don't start a
2434          * match that would require us to go beyond the end of the string
2435          */
2436         e = HOP3c(strend, -((SSize_t)lnc), s);
2437
2438         /* XXX Note that we could recalculate e to stop the loop earlier,
2439          * as the worst case expansion above will rarely be met, and as we
2440          * go along we would usually find that e moves further to the left.
2441          * This would happen only after we reached the point in the loop
2442          * where if there were no expansion we should fail.  Unclear if
2443          * worth the expense */
2444
2445         while (s <= e) {
2446             char *my_strend= (char *)strend;
2447             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2448                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2449                 && (reginfo->intuit || regtry(reginfo, &s)) )
2450             {
2451                 goto got_it;
2452             }
2453             s += (utf8_target) ? UTF8SKIP(s) : 1;
2454         }
2455         break;
2456     }
2457
2458     case BOUNDL:
2459         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2460         if (FLAGS(c) != TRADITIONAL_BOUND) {
2461             if (! IN_UTF8_CTYPE_LOCALE) {
2462                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2463                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2464             }
2465             goto do_boundu;
2466         }
2467
2468         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2469         break;
2470
2471     case NBOUNDL:
2472         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2473         if (FLAGS(c) != TRADITIONAL_BOUND) {
2474             if (! IN_UTF8_CTYPE_LOCALE) {
2475                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2476                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2477             }
2478             goto do_nboundu;
2479         }
2480
2481         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2482         break;
2483
2484     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2485                    meaning */
2486         assert(FLAGS(c) == TRADITIONAL_BOUND);
2487
2488         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2489         break;
2490
2491     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2492                    meaning */
2493         assert(FLAGS(c) == TRADITIONAL_BOUND);
2494
2495         FBC_BOUND_A(isWORDCHAR_A);
2496         break;
2497
2498     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2499                    meaning */
2500         assert(FLAGS(c) == TRADITIONAL_BOUND);
2501
2502         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2503         break;
2504
2505     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2506                    meaning */
2507         assert(FLAGS(c) == TRADITIONAL_BOUND);
2508
2509         FBC_NBOUND_A(isWORDCHAR_A);
2510         break;
2511
2512     case NBOUNDU:
2513         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2514             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2515             break;
2516         }
2517
2518       do_nboundu:
2519
2520         to_complement = 1;
2521         /* FALLTHROUGH */
2522
2523     case BOUNDU:
2524       do_boundu:
2525         switch((bound_type) FLAGS(c)) {
2526             case TRADITIONAL_BOUND:
2527                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2528                 break;
2529             case GCB_BOUND:
2530                 if (s == reginfo->strbeg) {
2531                     if (reginfo->intuit || regtry(reginfo, &s))
2532                     {
2533                         goto got_it;
2534                     }
2535
2536                     /* Didn't match.  Try at the next position (if there is one) */
2537                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2538                     if (UNLIKELY(s >= reginfo->strend)) {
2539                         break;
2540                     }
2541                 }
2542
2543                 if (utf8_target) {
2544                     GCB_enum before = getGCB_VAL_UTF8(
2545                                                reghop3((U8*)s, -1,
2546                                                        (U8*)(reginfo->strbeg)),
2547                                                (U8*) reginfo->strend);
2548                     while (s < strend) {
2549                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2550                                                         (U8*) reginfo->strend);
2551                         if (   (to_complement ^ isGCB(before,
2552                                                       after,
2553                                                       (U8*) reginfo->strbeg,
2554                                                       (U8*) s,
2555                                                       utf8_target))
2556                             && (reginfo->intuit || regtry(reginfo, &s)))
2557                         {
2558                             goto got_it;
2559                         }
2560                         before = after;
2561                         s += UTF8SKIP(s);
2562                     }
2563                 }
2564                 else {  /* Not utf8.  Everything is a GCB except between CR and
2565                            LF */
2566                     while (s < strend) {
2567                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2568                                               || UCHARAT(s) != '\n'))
2569                             && (reginfo->intuit || regtry(reginfo, &s)))
2570                         {
2571                             goto got_it;
2572                         }
2573                         s++;
2574                     }
2575                 }
2576
2577                 /* And, since this is a bound, it can match after the final
2578                  * character in the string */
2579                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2580                     goto got_it;
2581                 }
2582                 break;
2583
2584             case LB_BOUND:
2585                 if (s == reginfo->strbeg) {
2586                     if (reginfo->intuit || regtry(reginfo, &s)) {
2587                         goto got_it;
2588                     }
2589                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2590                     if (UNLIKELY(s >= reginfo->strend)) {
2591                         break;
2592                     }
2593                 }
2594
2595                 if (utf8_target) {
2596                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2597                                                                -1,
2598                                                                (U8*)(reginfo->strbeg)),
2599                                                        (U8*) reginfo->strend);
2600                     while (s < strend) {
2601                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2602                         if (to_complement ^ isLB(before,
2603                                                  after,
2604                                                  (U8*) reginfo->strbeg,
2605                                                  (U8*) s,
2606                                                  (U8*) reginfo->strend,
2607                                                  utf8_target)
2608                             && (reginfo->intuit || regtry(reginfo, &s)))
2609                         {
2610                             goto got_it;
2611                         }
2612                         before = after;
2613                         s += UTF8SKIP(s);
2614                     }
2615                 }
2616                 else {  /* Not utf8. */
2617                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2618                     while (s < strend) {
2619                         LB_enum after = getLB_VAL_CP((U8) *s);
2620                         if (to_complement ^ isLB(before,
2621                                                  after,
2622                                                  (U8*) reginfo->strbeg,
2623                                                  (U8*) s,
2624                                                  (U8*) reginfo->strend,
2625                                                  utf8_target)
2626                             && (reginfo->intuit || regtry(reginfo, &s)))
2627                         {
2628                             goto got_it;
2629                         }
2630                         before = after;
2631                         s++;
2632                     }
2633                 }
2634
2635                 if (reginfo->intuit || regtry(reginfo, &s)) {
2636                     goto got_it;
2637                 }
2638
2639                 break;
2640
2641             case SB_BOUND:
2642                 if (s == reginfo->strbeg) {
2643                     if (reginfo->intuit || regtry(reginfo, &s)) {
2644                         goto got_it;
2645                     }
2646                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2647                     if (UNLIKELY(s >= reginfo->strend)) {
2648                         break;
2649                     }
2650                 }
2651
2652                 if (utf8_target) {
2653                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2654                                                         -1,
2655                                                         (U8*)(reginfo->strbeg)),
2656                                                       (U8*) reginfo->strend);
2657                     while (s < strend) {
2658                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2659                                                          (U8*) reginfo->strend);
2660                         if ((to_complement ^ isSB(before,
2661                                                   after,
2662                                                   (U8*) reginfo->strbeg,
2663                                                   (U8*) s,
2664                                                   (U8*) reginfo->strend,
2665                                                   utf8_target))
2666                             && (reginfo->intuit || regtry(reginfo, &s)))
2667                         {
2668                             goto got_it;
2669                         }
2670                         before = after;
2671                         s += UTF8SKIP(s);
2672                     }
2673                 }
2674                 else {  /* Not utf8. */
2675                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2676                     while (s < strend) {
2677                         SB_enum after = getSB_VAL_CP((U8) *s);
2678                         if ((to_complement ^ isSB(before,
2679                                                   after,
2680                                                   (U8*) reginfo->strbeg,
2681                                                   (U8*) s,
2682                                                   (U8*) reginfo->strend,
2683                                                   utf8_target))
2684                             && (reginfo->intuit || regtry(reginfo, &s)))
2685                         {
2686                             goto got_it;
2687                         }
2688                         before = after;
2689                         s++;
2690                     }
2691                 }
2692
2693                 /* Here are at the final position in the target string.  The SB
2694                  * value is always true here, so matches, depending on other
2695                  * constraints */
2696                 if (reginfo->intuit || regtry(reginfo, &s)) {
2697                     goto got_it;
2698                 }
2699
2700                 break;
2701
2702             case WB_BOUND:
2703                 if (s == reginfo->strbeg) {
2704                     if (reginfo->intuit || regtry(reginfo, &s)) {
2705                         goto got_it;
2706                     }
2707                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2708                     if (UNLIKELY(s >= reginfo->strend)) {
2709                         break;
2710                     }
2711                 }
2712
2713                 if (utf8_target) {
2714                     /* We are at a boundary between char_sub_0 and char_sub_1.
2715                      * We also keep track of the value for char_sub_-1 as we
2716                      * loop through the line.   Context may be needed to make a
2717                      * determination, and if so, this can save having to
2718                      * recalculate it */
2719                     WB_enum previous = WB_UNKNOWN;
2720                     WB_enum before = getWB_VAL_UTF8(
2721                                               reghop3((U8*)s,
2722                                                       -1,
2723                                                       (U8*)(reginfo->strbeg)),
2724                                               (U8*) reginfo->strend);
2725                     while (s < strend) {
2726                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2727                                                         (U8*) reginfo->strend);
2728                         if ((to_complement ^ isWB(previous,
2729                                                   before,
2730                                                   after,
2731                                                   (U8*) reginfo->strbeg,
2732                                                   (U8*) s,
2733                                                   (U8*) reginfo->strend,
2734                                                   utf8_target))
2735                             && (reginfo->intuit || regtry(reginfo, &s)))
2736                         {
2737                             goto got_it;
2738                         }
2739                         previous = before;
2740                         before = after;
2741                         s += UTF8SKIP(s);
2742                     }
2743                 }
2744                 else {  /* Not utf8. */
2745                     WB_enum previous = WB_UNKNOWN;
2746                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2747                     while (s < strend) {
2748                         WB_enum after = getWB_VAL_CP((U8) *s);
2749                         if ((to_complement ^ isWB(previous,
2750                                                   before,
2751                                                   after,
2752                                                   (U8*) reginfo->strbeg,
2753                                                   (U8*) s,
2754                                                   (U8*) reginfo->strend,
2755                                                   utf8_target))
2756                             && (reginfo->intuit || regtry(reginfo, &s)))
2757                         {
2758                             goto got_it;
2759                         }
2760                         previous = before;
2761                         before = after;
2762                         s++;
2763                     }
2764                 }
2765
2766                 if (reginfo->intuit || regtry(reginfo, &s)) {
2767                     goto got_it;
2768                 }
2769         }
2770         break;
2771
2772     case LNBREAK:
2773         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2774                         is_LNBREAK_latin1_safe(s, strend)
2775         );
2776         break;
2777
2778     case ASCII:
2779         REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
2780         break;
2781
2782     case NASCII:
2783         if (utf8_target) {
2784             REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2785                                                             utf8_target));
2786         }
2787         else {
2788             REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2789                                                             utf8_target));
2790         }
2791
2792         break;
2793
2794     /* The argument to all the POSIX node types is the class number to pass to
2795      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2796
2797     case NPOSIXL:
2798         to_complement = 1;
2799         /* FALLTHROUGH */
2800
2801     case POSIXL:
2802         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2803         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2804                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2805         break;
2806
2807     case NPOSIXD:
2808         to_complement = 1;
2809         /* FALLTHROUGH */
2810
2811     case POSIXD:
2812         if (utf8_target) {
2813             goto posix_utf8;
2814         }
2815         goto posixa;
2816
2817     case NPOSIXA:
2818         if (utf8_target) {
2819             /* The complement of something that matches only ASCII matches all
2820              * non-ASCII, plus everything in ASCII that isn't in the class. */
2821             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2822                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2823             break;
2824         }
2825
2826         to_complement = 1;
2827         goto posixa;
2828
2829     case POSIXA:
2830         /* Don't need to worry about utf8, as it can match only a single
2831          * byte invariant character.  But we do anyway for performance reasons,
2832          * as otherwise we would have to examine all the continuation
2833          * characters */
2834         if (utf8_target) {
2835             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2836             break;
2837         }
2838
2839       posixa:
2840         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2841                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2842         break;
2843
2844     case NPOSIXU:
2845         to_complement = 1;
2846         /* FALLTHROUGH */
2847
2848     case POSIXU:
2849         if (! utf8_target) {
2850             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2851                                  to_complement ^ cBOOL(_generic_isCC(*s,
2852                                                                     FLAGS(c))));
2853         }
2854         else {
2855
2856           posix_utf8:
2857             classnum = (_char_class_number) FLAGS(c);
2858             switch (classnum) {
2859                 default:
2860                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2861                         to_complement ^ cBOOL(_invlist_contains_cp(
2862                                               PL_XPosix_ptrs[classnum],
2863                                               utf8_to_uvchr_buf((U8 *) s,
2864                                                                 (U8 *) strend,
2865                                                                 NULL))));
2866                     break;
2867                 case _CC_ENUM_SPACE:
2868                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2869                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2870                     break;
2871
2872                 case _CC_ENUM_BLANK:
2873                     REXEC_FBC_CLASS_SCAN(1,
2874                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2875                     break;
2876
2877                 case _CC_ENUM_XDIGIT:
2878                     REXEC_FBC_CLASS_SCAN(1,
2879                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2880                     break;
2881
2882                 case _CC_ENUM_VERTSPACE:
2883                     REXEC_FBC_CLASS_SCAN(1,
2884                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2885                     break;
2886
2887                 case _CC_ENUM_CNTRL:
2888                     REXEC_FBC_CLASS_SCAN(1,
2889                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2890                     break;
2891             }
2892         }
2893         break;
2894
2895     case AHOCORASICKC:
2896     case AHOCORASICK:
2897         {
2898             DECL_TRIE_TYPE(c);
2899             /* what trie are we using right now */
2900             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2901             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2902             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2903
2904             const char *last_start = strend - trie->minlen;
2905 #ifdef DEBUGGING
2906             const char *real_start = s;
2907 #endif
2908             STRLEN maxlen = trie->maxlen;
2909             SV *sv_points;
2910             U8 **points; /* map of where we were in the input string
2911                             when reading a given char. For ASCII this
2912                             is unnecessary overhead as the relationship
2913                             is always 1:1, but for Unicode, especially
2914                             case folded Unicode this is not true. */
2915             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2916             U8 *bitmap=NULL;
2917
2918
2919             GET_RE_DEBUG_FLAGS_DECL;
2920
2921             /* We can't just allocate points here. We need to wrap it in
2922              * an SV so it gets freed properly if there is a croak while
2923              * running the match */
2924             ENTER;
2925             SAVETMPS;
2926             sv_points=newSV(maxlen * sizeof(U8 *));
2927             SvCUR_set(sv_points,
2928                 maxlen * sizeof(U8 *));
2929             SvPOK_on(sv_points);
2930             sv_2mortal(sv_points);
2931             points=(U8**)SvPV_nolen(sv_points );
2932             if ( trie_type != trie_utf8_fold
2933                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2934             {
2935                 if (trie->bitmap)
2936                     bitmap=(U8*)trie->bitmap;
2937                 else
2938                     bitmap=(U8*)ANYOF_BITMAP(c);
2939             }
2940             /* this is the Aho-Corasick algorithm modified a touch
2941                to include special handling for long "unknown char" sequences.
2942                The basic idea being that we use AC as long as we are dealing
2943                with a possible matching char, when we encounter an unknown char
2944                (and we have not encountered an accepting state) we scan forward
2945                until we find a legal starting char.
2946                AC matching is basically that of trie matching, except that when
2947                we encounter a failing transition, we fall back to the current
2948                states "fail state", and try the current char again, a process
2949                we repeat until we reach the root state, state 1, or a legal
2950                transition. If we fail on the root state then we can either
2951                terminate if we have reached an accepting state previously, or
2952                restart the entire process from the beginning if we have not.
2953
2954              */
2955             while (s <= last_start) {
2956                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2957                 U8 *uc = (U8*)s;
2958                 U16 charid = 0;
2959                 U32 base = 1;
2960                 U32 state = 1;
2961                 UV uvc = 0;
2962                 STRLEN len = 0;
2963                 STRLEN foldlen = 0;
2964                 U8 *uscan = (U8*)NULL;
2965                 U8 *leftmost = NULL;
2966 #ifdef DEBUGGING
2967                 U32 accepted_word= 0;
2968 #endif
2969                 U32 pointpos = 0;
2970
2971                 while ( state && uc <= (U8*)strend ) {
2972                     int failed=0;
2973                     U32 word = aho->states[ state ].wordnum;
2974
2975                     if( state==1 ) {
2976                         if ( bitmap ) {
2977                             DEBUG_TRIE_EXECUTE_r(
2978                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2979                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2980                                         (char *)uc, utf8_target, 0 );
2981                                     Perl_re_printf( aTHX_
2982                                         " Scanning for legal start char...\n");
2983                                 }
2984                             );
2985                             if (utf8_target) {
2986                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2987                                     uc += UTF8SKIP(uc);
2988                                 }
2989                             } else {
2990                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2991                                     uc++;
2992                                 }
2993                             }
2994                             s= (char *)uc;
2995                         }
2996                         if (uc >(U8*)last_start) break;
2997                     }
2998
2999                     if ( word ) {
3000                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
3001                         if (!leftmost || lpos < leftmost) {
3002                             DEBUG_r(accepted_word=word);
3003                             leftmost= lpos;
3004                         }
3005                         if (base==0) break;
3006
3007                     }
3008                     points[pointpos++ % maxlen]= uc;
3009                     if (foldlen || uc < (U8*)strend) {
3010                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3011                                              (U8 *) strend, uscan, len, uvc,
3012                                              charid, foldlen, foldbuf,
3013                                              uniflags);
3014                         DEBUG_TRIE_EXECUTE_r({
3015                             dump_exec_pos( (char *)uc, c, strend,
3016                                         real_start, s, utf8_target, 0);
3017                             Perl_re_printf( aTHX_
3018                                 " Charid:%3u CP:%4" UVxf " ",
3019                                  charid, uvc);
3020                         });
3021                     }
3022                     else {
3023                         len = 0;
3024                         charid = 0;
3025                     }
3026
3027
3028                     do {
3029 #ifdef DEBUGGING
3030                         word = aho->states[ state ].wordnum;
3031 #endif
3032                         base = aho->states[ state ].trans.base;
3033
3034                         DEBUG_TRIE_EXECUTE_r({
3035                             if (failed)
3036                                 dump_exec_pos( (char *)uc, c, strend, real_start,
3037                                     s,   utf8_target, 0 );
3038                             Perl_re_printf( aTHX_
3039                                 "%sState: %4" UVxf ", word=%" UVxf,
3040                                 failed ? " Fail transition to " : "",
3041                                 (UV)state, (UV)word);
3042                         });
3043                         if ( base ) {
3044                             U32 tmp;
3045                             I32 offset;
3046                             if (charid &&
3047                                  ( ((offset = base + charid
3048                                     - 1 - trie->uniquecharcount)) >= 0)
3049                                  && ((U32)offset < trie->lasttrans)
3050                                  && trie->trans[offset].check == state
3051                                  && (tmp=trie->trans[offset].next))
3052                             {
3053                                 DEBUG_TRIE_EXECUTE_r(
3054                                     Perl_re_printf( aTHX_ " - legal\n"));
3055                                 state = tmp;
3056                                 break;
3057                             }
3058                             else {
3059                                 DEBUG_TRIE_EXECUTE_r(
3060                                     Perl_re_printf( aTHX_ " - fail\n"));
3061                                 failed = 1;
3062                                 state = aho->fail[state];
3063                             }
3064                         }
3065                         else {
3066                             /* we must be accepting here */
3067                             DEBUG_TRIE_EXECUTE_r(
3068                                     Perl_re_printf( aTHX_ " - accepting\n"));
3069                             failed = 1;
3070                             break;
3071                         }
3072                     } while(state);
3073                     uc += len;
3074                     if (failed) {
3075                         if (leftmost)
3076                             break;
3077                         if (!state) state = 1;
3078                     }
3079                 }
3080                 if ( aho->states[ state ].wordnum ) {
3081                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3082                     if (!leftmost || lpos < leftmost) {
3083                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3084                         leftmost = lpos;
3085                     }
3086                 }
3087                 if (leftmost) {
3088                     s = (char*)leftmost;
3089                     DEBUG_TRIE_EXECUTE_r({
3090                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3091                             (UV)accepted_word, (IV)(s - real_start)
3092                         );
3093                     });
3094                     if (reginfo->intuit || regtry(reginfo, &s)) {
3095                         FREETMPS;
3096                         LEAVE;
3097                         goto got_it;
3098                     }
3099                     s = HOPc(s,1);
3100                     DEBUG_TRIE_EXECUTE_r({
3101                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3102                     });
3103                 } else {
3104                     DEBUG_TRIE_EXECUTE_r(
3105                         Perl_re_printf( aTHX_ "No match.\n"));
3106                     break;
3107                 }
3108             }
3109             FREETMPS;
3110             LEAVE;
3111         }
3112         break;
3113     default:
3114         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3115     }
3116     return 0;
3117   got_it:
3118     return s;
3119 }
3120
3121 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3122  * flags have same meanings as with regexec_flags() */
3123
3124 static void
3125 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3126                             char *strbeg,
3127                             char *strend,
3128                             SV *sv,
3129                             U32 flags,
3130                             bool utf8_target)
3131 {
3132     struct regexp *const prog = ReANY(rx);
3133
3134     if (flags & REXEC_COPY_STR) {
3135 #ifdef PERL_ANY_COW
3136         if (SvCANCOW(sv)) {
3137             DEBUG_C(Perl_re_printf( aTHX_
3138                               "Copy on write: regexp capture, type %d\n",
3139                                     (int) SvTYPE(sv)));
3140             /* Create a new COW SV to share the match string and store
3141              * in saved_copy, unless the current COW SV in saved_copy
3142              * is valid and suitable for our purpose */
3143             if ((   prog->saved_copy
3144                  && SvIsCOW(prog->saved_copy)
3145                  && SvPOKp(prog->saved_copy)
3146                  && SvIsCOW(sv)
3147                  && SvPOKp(sv)
3148                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3149             {
3150                 /* just reuse saved_copy SV */
3151                 if (RXp_MATCH_COPIED(prog)) {
3152                     Safefree(prog->subbeg);
3153                     RXp_MATCH_COPIED_off(prog);
3154                 }
3155             }
3156             else {
3157                 /* create new COW SV to share string */
3158                 RXp_MATCH_COPY_FREE(prog);
3159                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3160             }
3161             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3162             assert (SvPOKp(prog->saved_copy));
3163             prog->sublen  = strend - strbeg;
3164             prog->suboffset = 0;
3165             prog->subcoffset = 0;
3166         } else
3167 #endif
3168         {
3169             SSize_t min = 0;
3170             SSize_t max = strend - strbeg;
3171             SSize_t sublen;
3172
3173             if (    (flags & REXEC_COPY_SKIP_POST)
3174                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3175                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3176             ) { /* don't copy $' part of string */
3177                 U32 n = 0;
3178                 max = -1;
3179                 /* calculate the right-most part of the string covered
3180                  * by a capture. Due to lookahead, this may be to
3181                  * the right of $&, so we have to scan all captures */
3182                 while (n <= prog->lastparen) {
3183                     if (prog->offs[n].end > max)
3184                         max = prog->offs[n].end;
3185                     n++;
3186                 }
3187                 if (max == -1)
3188                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3189                             ? prog->offs[0].start
3190                             : 0;
3191                 assert(max >= 0 && max <= strend - strbeg);
3192             }
3193
3194             if (    (flags & REXEC_COPY_SKIP_PRE)
3195                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3196                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3197             ) { /* don't copy $` part of string */
3198                 U32 n = 0;
3199                 min = max;
3200                 /* calculate the left-most part of the string covered
3201                  * by a capture. Due to lookbehind, this may be to
3202                  * the left of $&, so we have to scan all captures */
3203                 while (min && n <= prog->lastparen) {
3204                     if (   prog->offs[n].start != -1
3205                         && prog->offs[n].start < min)
3206                     {
3207                         min = prog->offs[n].start;
3208                     }
3209                     n++;
3210                 }
3211                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3212                     && min >  prog->offs[0].end
3213                 )
3214                     min = prog->offs[0].end;
3215
3216             }
3217
3218             assert(min >= 0 && min <= max && min <= strend - strbeg);
3219             sublen = max - min;
3220
3221             if (RXp_MATCH_COPIED(prog)) {
3222                 if (sublen > prog->sublen)
3223                     prog->subbeg =
3224                             (char*)saferealloc(prog->subbeg, sublen+1);
3225             }
3226             else
3227                 prog->subbeg = (char*)safemalloc(sublen+1);
3228             Copy(strbeg + min, prog->subbeg, sublen, char);
3229             prog->subbeg[sublen] = '\0';
3230             prog->suboffset = min;
3231             prog->sublen = sublen;
3232             RXp_MATCH_COPIED_on(prog);
3233         }
3234         prog->subcoffset = prog->suboffset;
3235         if (prog->suboffset && utf8_target) {
3236             /* Convert byte offset to chars.
3237              * XXX ideally should only compute this if @-/@+
3238              * has been seen, a la PL_sawampersand ??? */
3239
3240             /* If there's a direct correspondence between the
3241              * string which we're matching and the original SV,
3242              * then we can use the utf8 len cache associated with
3243              * the SV. In particular, it means that under //g,
3244              * sv_pos_b2u() will use the previously cached
3245              * position to speed up working out the new length of
3246              * subcoffset, rather than counting from the start of
3247              * the string each time. This stops
3248              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3249              * from going quadratic */
3250             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3251                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3252                                                 SV_GMAGIC|SV_CONST_RETURN);
3253             else
3254                 prog->subcoffset = utf8_length((U8*)strbeg,
3255                                     (U8*)(strbeg+prog->suboffset));
3256         }
3257     }
3258     else {
3259         RXp_MATCH_COPY_FREE(prog);
3260         prog->subbeg = strbeg;
3261         prog->suboffset = 0;
3262         prog->subcoffset = 0;
3263         prog->sublen = strend - strbeg;
3264     }
3265 }
3266
3267
3268
3269
3270 /*
3271  - regexec_flags - match a regexp against a string
3272  */
3273 I32
3274 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3275               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3276 /* stringarg: the point in the string at which to begin matching */
3277 /* strend:    pointer to null at end of string */
3278 /* strbeg:    real beginning of string */
3279 /* minend:    end of match must be >= minend bytes after stringarg. */
3280 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3281  *            itself is accessed via the pointers above */
3282 /* data:      May be used for some additional optimizations.
3283               Currently unused. */
3284 /* flags:     For optimizations. See REXEC_* in regexp.h */
3285
3286 {
3287     struct regexp *const prog = ReANY(rx);
3288     char *s;
3289     regnode *c;
3290     char *startpos;
3291     SSize_t minlen;             /* must match at least this many chars */
3292     SSize_t dontbother = 0;     /* how many characters not to try at end */
3293     const bool utf8_target = cBOOL(DO_UTF8(sv));
3294     I32 multiline;
3295     RXi_GET_DECL(prog,progi);
3296     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3297     regmatch_info *const reginfo = &reginfo_buf;
3298     regexp_paren_pair *swap = NULL;
3299     I32 oldsave;
3300     GET_RE_DEBUG_FLAGS_DECL;
3301
3302     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3303     PERL_UNUSED_ARG(data);
3304
3305     /* Be paranoid... */
3306     if (prog == NULL) {
3307         Perl_croak(aTHX_ "NULL regexp parameter");
3308     }
3309
3310     DEBUG_EXECUTE_r(
3311         debug_start_match(rx, utf8_target, stringarg, strend,
3312         "Matching");
3313     );
3314
3315     startpos = stringarg;
3316
3317     /* set these early as they may be used by the HOP macros below */
3318     reginfo->strbeg = strbeg;
3319     reginfo->strend = strend;
3320     reginfo->is_utf8_target = cBOOL(utf8_target);
3321
3322     if (prog->intflags & PREGf_GPOS_SEEN) {
3323         MAGIC *mg;
3324
3325         /* set reginfo->ganch, the position where \G can match */
3326
3327         reginfo->ganch =
3328             (flags & REXEC_IGNOREPOS)
3329             ? stringarg /* use start pos rather than pos() */
3330             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3331               /* Defined pos(): */
3332             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3333             : strbeg; /* pos() not defined; use start of string */
3334
3335         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3336             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3337
3338         /* in the presence of \G, we may need to start looking earlier in
3339          * the string than the suggested start point of stringarg:
3340          * if prog->gofs is set, then that's a known, fixed minimum
3341          * offset, such as
3342          * /..\G/:   gofs = 2
3343          * /ab|c\G/: gofs = 1
3344          * or if the minimum offset isn't known, then we have to go back
3345          * to the start of the string, e.g. /w+\G/
3346          */
3347
3348         if (prog->intflags & PREGf_ANCH_GPOS) {
3349             if (prog->gofs) {
3350                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3351                 if (!startpos ||
3352                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3353                 {
3354                     DEBUG_r(Perl_re_printf( aTHX_
3355                             "fail: ganch-gofs before earliest possible start\n"));
3356                     return 0;
3357                 }
3358             }
3359             else
3360                 startpos = reginfo->ganch;
3361         }
3362         else if (prog->gofs) {
3363             startpos = HOPBACKc(startpos, prog->gofs);
3364             if (!startpos)
3365                 startpos = strbeg;
3366         }
3367         else if (prog->intflags & PREGf_GPOS_FLOAT)
3368             startpos = strbeg;
3369     }
3370
3371     minlen = prog->minlen;
3372     if ((startpos + minlen) > strend || startpos < strbeg) {
3373         DEBUG_r(Perl_re_printf( aTHX_
3374                     "Regex match can't succeed, so not even tried\n"));
3375         return 0;
3376     }
3377
3378     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3379      * which will call destuctors to reset PL_regmatch_state, free higher
3380      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3381      * regmatch_info_aux_eval */
3382
3383     oldsave = PL_savestack_ix;
3384
3385     s = startpos;
3386
3387     if ((prog->extflags & RXf_USE_INTUIT)
3388         && !(flags & REXEC_CHECKED))
3389     {
3390         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3391                                     flags, NULL);
3392         if (!s)
3393             return 0;
3394
3395         if (prog->extflags & RXf_CHECK_ALL) {
3396             /* we can match based purely on the result of INTUIT.
3397              * Set up captures etc just for $& and $-[0]
3398              * (an intuit-only match wont have $1,$2,..) */
3399             assert(!prog->nparens);
3400
3401             /* s/// doesn't like it if $& is earlier than where we asked it to
3402              * start searching (which can happen on something like /.\G/) */
3403             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3404                     && (s < stringarg))
3405             {
3406                 /* this should only be possible under \G */
3407                 assert(prog->intflags & PREGf_GPOS_SEEN);
3408                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3409                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3410                 goto phooey;
3411             }
3412
3413             /* match via INTUIT shouldn't have any captures.
3414              * Let @-, @+, $^N know */
3415             prog->lastparen = prog->lastcloseparen = 0;
3416             RXp_MATCH_UTF8_set(prog, utf8_target);
3417             prog->offs[0].start = s - strbeg;
3418             prog->offs[0].end = utf8_target
3419                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3420                 : s - strbeg + prog->minlenret;
3421             if ( !(flags & REXEC_NOT_FIRST) )
3422                 S_reg_set_capture_string(aTHX_ rx,
3423                                         strbeg, strend,
3424                                         sv, flags, utf8_target);
3425
3426             return 1;
3427         }
3428     }
3429
3430     multiline = prog->extflags & RXf_PMf_MULTILINE;
3431
3432     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3433         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3434                               "String too short [regexec_flags]...\n"));
3435         goto phooey;
3436     }
3437
3438     /* Check validity of program. */
3439     if (UCHARAT(progi->program) != REG_MAGIC) {
3440         Perl_croak(aTHX_ "corrupted regexp program");
3441     }
3442
3443     RXp_MATCH_TAINTED_off(prog);
3444     RXp_MATCH_UTF8_set(prog, utf8_target);
3445
3446     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3447     reginfo->intuit = 0;
3448     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3449     reginfo->warned = FALSE;
3450     reginfo->sv = sv;
3451     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3452     /* see how far we have to get to not match where we matched before */
3453     reginfo->till = stringarg + minend;
3454
3455     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3456         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3457            S_cleanup_regmatch_info_aux has executed (registered by
3458            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3459            magic belonging to this SV.
3460            Not newSVsv, either, as it does not COW.
3461         */
3462         reginfo->sv = newSV(0);
3463         SvSetSV_nosteal(reginfo->sv, sv);
3464         SAVEFREESV(reginfo->sv);
3465     }
3466
3467     /* reserve next 2 or 3 slots in PL_regmatch_state:
3468      * slot N+0: may currently be in use: skip it
3469      * slot N+1: use for regmatch_info_aux struct
3470      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3471      * slot N+3: ready for use by regmatch()
3472      */
3473
3474     {
3475         regmatch_state *old_regmatch_state;
3476         regmatch_slab  *old_regmatch_slab;
3477         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3478
3479         /* on first ever match, allocate first slab */
3480         if (!PL_regmatch_slab) {
3481             Newx(PL_regmatch_slab, 1, regmatch_slab);
3482             PL_regmatch_slab->prev = NULL;
3483             PL_regmatch_slab->next = NULL;
3484             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3485         }
3486
3487         old_regmatch_state = PL_regmatch_state;
3488         old_regmatch_slab  = PL_regmatch_slab;
3489
3490         for (i=0; i <= max; i++) {
3491             if (i == 1)
3492                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3493             else if (i ==2)
3494                 reginfo->info_aux_eval =
3495                 reginfo->info_aux->info_aux_eval =
3496                             &(PL_regmatch_state->u.info_aux_eval);
3497
3498             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3499                 PL_regmatch_state = S_push_slab(aTHX);
3500         }
3501
3502         /* note initial PL_regmatch_state position; at end of match we'll
3503          * pop back to there and free any higher slabs */
3504
3505         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3506         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3507         reginfo->info_aux->poscache = NULL;
3508
3509         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3510
3511         if ((prog->extflags & RXf_EVAL_SEEN))
3512             S_setup_eval_state(aTHX_ reginfo);
3513         else
3514             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3515     }
3516
3517     /* If there is a "must appear" string, look for it. */
3518
3519     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3520         /* We have to be careful. If the previous successful match
3521            was from this regex we don't want a subsequent partially
3522            successful match to clobber the old results.
3523            So when we detect this possibility we add a swap buffer
3524            to the re, and switch the buffer each match. If we fail,
3525            we switch it back; otherwise we leave it swapped.
3526         */
3527         swap = prog->offs;
3528         /* do we need a save destructor here for eval dies? */
3529         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3530         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3531             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3532             0,
3533             PTR2UV(prog),
3534             PTR2UV(swap),
3535             PTR2UV(prog->offs)
3536         ));
3537     }
3538
3539     if (prog->recurse_locinput)
3540         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3541
3542     /* Simplest case: anchored match need be tried only once, or with
3543      * MBOL, only at the beginning of each line.
3544      *
3545      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3546      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3547      * match at the start of the string then it won't match anywhere else
3548      * either; while with /.*.../, if it doesn't match at the beginning,
3549      * the earliest it could match is at the start of the next line */
3550
3551     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3552         char *end;
3553
3554         if (regtry(reginfo, &s))
3555             goto got_it;
3556
3557         if (!(prog->intflags & PREGf_ANCH_MBOL))
3558             goto phooey;
3559
3560         /* didn't match at start, try at other newline positions */
3561
3562         if (minlen)
3563             dontbother = minlen - 1;
3564         end = HOP3c(strend, -dontbother, strbeg) - 1;
3565
3566         /* skip to next newline */
3567
3568         while (s <= end) { /* note it could be possible to match at the end of the string */
3569             /* NB: newlines are the same in unicode as they are in latin */
3570             if (*s++ != '\n')
3571                 continue;
3572             if (prog->check_substr || prog->check_utf8) {
3573             /* note that with PREGf_IMPLICIT, intuit can only fail
3574              * or return the start position, so it's of limited utility.
3575              * Nevertheless, I made the decision that the potential for
3576              * quick fail was still worth it - DAPM */
3577                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3578                 if (!s)
3579                     goto phooey;
3580             }
3581             if (regtry(reginfo, &s))
3582                 goto got_it;
3583         }
3584         goto phooey;
3585     } /* end anchored search */
3586
3587     if (prog->intflags & PREGf_ANCH_GPOS)
3588     {
3589         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3590         assert(prog->intflags & PREGf_GPOS_SEEN);
3591         /* For anchored \G, the only position it can match from is
3592          * (ganch-gofs); we already set startpos to this above; if intuit
3593          * moved us on from there, we can't possibly succeed */
3594         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3595         if (s == startpos && regtry(reginfo, &s))
3596             goto got_it;
3597         goto phooey;
3598     }
3599
3600     /* Messy cases:  unanchored match. */
3601     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3602         /* we have /x+whatever/ */
3603         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3604         char ch;
3605 #ifdef DEBUGGING
3606         int did_match = 0;
3607 #endif
3608         if (utf8_target) {
3609             if (! prog->anchored_utf8) {
3610                 to_utf8_substr(prog);
3611             }
3612             ch = SvPVX_const(prog->anchored_utf8)[0];
3613             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3614                 if (*s == ch) {
3615                     DEBUG_EXECUTE_r( did_match = 1 );
3616                     if (regtry(reginfo, &s)) goto got_it;
3617                     s += UTF8SKIP(s);
3618                     while (s < strend && *s == ch)
3619                         s += UTF8SKIP(s);
3620                 }
3621             );
3622
3623         }
3624         else {
3625             if (! prog->anchored_substr) {
3626                 if (! to_byte_substr(prog)) {
3627                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3628                 }
3629             }
3630             ch = SvPVX_const(prog->anchored_substr)[0];
3631             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3632                 if (*s == ch) {
3633                     DEBUG_EXECUTE_r( did_match = 1 );
3634                     if (regtry(reginfo, &s)) goto got_it;
3635                     s++;
3636                     while (s < strend && *s == ch)
3637                         s++;
3638                 }
3639             );
3640         }
3641         DEBUG_EXECUTE_r(if (!did_match)
3642                 Perl_re_printf( aTHX_
3643                                   "Did not find anchored character...\n")
3644                );
3645     }
3646     else if (prog->anchored_substr != NULL
3647               || prog->anchored_utf8 != NULL
3648               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3649                   && prog->float_max_offset < strend - s)) {
3650         SV *must;
3651         SSize_t back_max;
3652         SSize_t back_min;
3653         char *last;
3654         char *last1;            /* Last position checked before */
3655 #ifdef DEBUGGING
3656         int did_match = 0;
3657 #endif
3658         if (prog->anchored_substr || prog->anchored_utf8) {
3659             if (utf8_target) {
3660                 if (! prog->anchored_utf8) {
3661                     to_utf8_substr(prog);
3662                 }
3663                 must = prog->anchored_utf8;
3664             }
3665             else {
3666                 if (! prog->anchored_substr) {
3667                     if (! to_byte_substr(prog)) {
3668                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3669                     }
3670                 }
3671                 must = prog->anchored_substr;
3672             }
3673             back_max = back_min = prog->anchored_offset;
3674         } else {
3675             if (utf8_target) {
3676                 if (! prog->float_utf8) {
3677                     to_utf8_substr(prog);
3678                 }
3679                 must = prog->float_utf8;
3680             }
3681             else {
3682                 if (! prog->float_substr) {
3683                     if (! to_byte_substr(prog)) {
3684                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3685                     }
3686                 }
3687                 must = prog->float_substr;
3688             }
3689             back_max = prog->float_max_offset;
3690             back_min = prog->float_min_offset;
3691         }
3692
3693         if (back_min<0) {
3694             last = strend;
3695         } else {
3696             last = HOP3c(strend,        /* Cannot start after this */
3697                   -(SSize_t)(CHR_SVLEN(must)
3698                          - (SvTAIL(must) != 0) + back_min), strbeg);
3699         }
3700         if (s > reginfo->strbeg)
3701             last1 = HOPc(s, -1);
3702         else
3703             last1 = s - 1;      /* bogus */
3704
3705         /* XXXX check_substr already used to find "s", can optimize if
3706            check_substr==must. */
3707         dontbother = 0;
3708         strend = HOPc(strend, -dontbother);
3709         while ( (s <= last) &&
3710                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3711                                   (unsigned char*)strend, must,
3712                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3713             DEBUG_EXECUTE_r( did_match = 1 );
3714             if (HOPc(s, -back_max) > last1) {
3715                 last1 = HOPc(s, -back_min);
3716                 s = HOPc(s, -back_max);
3717             }
3718             else {
3719                 char * const t = (last1 >= reginfo->strbeg)
3720                                     ? HOPc(last1, 1) : last1 + 1;
3721
3722                 last1 = HOPc(s, -back_min);
3723                 s = t;
3724             }
3725             if (utf8_target) {
3726                 while (s <= last1) {
3727                     if (regtry(reginfo, &s))
3728                         goto got_it;
3729                     if (s >= last1) {
3730                         s++; /* to break out of outer loop */
3731                         break;
3732                     }
3733                     s += UTF8SKIP(s);
3734                 }
3735             }
3736             else {
3737                 while (s <= last1) {
3738                     if (regtry(reginfo, &s))
3739                         goto got_it;
3740                     s++;
3741                 }
3742             }
3743         }
3744         DEBUG_EXECUTE_r(if (!did_match) {
3745             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3746                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3747             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3748                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3749                                ? "anchored" : "floating"),
3750                 quoted, RE_SV_TAIL(must));
3751         });
3752         goto phooey;
3753     }
3754     else if ( (c = progi->regstclass) ) {
3755         if (minlen) {
3756             const OPCODE op = OP(progi->regstclass);
3757             /* don't bother with what can't match */
3758             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3759                 strend = HOPc(strend, -(minlen - 1));
3760         }
3761         DEBUG_EXECUTE_r({
3762             SV * const prop = sv_newmortal();
3763             regprop(prog, prop, c, reginfo, NULL);
3764             {
3765                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3766                     s,strend-s,PL_dump_re_max_len);
3767                 Perl_re_printf( aTHX_
3768                     "Matching stclass %.*s against %s (%d bytes)\n",
3769                     (int)SvCUR(prop), SvPVX_const(prop),
3770                      quoted, (int)(strend - s));
3771             }
3772         });
3773         if (find_byclass(prog, c, s, strend, reginfo))
3774             goto got_it;
3775         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3776     }
3777     else {
3778         dontbother = 0;
3779         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3780             /* Trim the end. */
3781             char *last= NULL;
3782             SV* float_real;
3783             STRLEN len;
3784             const char *little;
3785
3786             if (utf8_target) {
3787                 if (! prog->float_utf8) {
3788                     to_utf8_substr(prog);
3789                 }
3790                 float_real = prog->float_utf8;
3791             }
3792             else {
3793                 if (! prog->float_substr) {
3794                     if (! to_byte_substr(prog)) {
3795                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3796                     }
3797                 }
3798                 float_real = prog->float_substr;
3799             }
3800
3801             little = SvPV_const(float_real, len);
3802             if (SvTAIL(float_real)) {
3803                     /* This means that float_real contains an artificial \n on
3804                      * the end due to the presence of something like this:
3805                      * /foo$/ where we can match both "foo" and "foo\n" at the
3806                      * end of the string.  So we have to compare the end of the
3807                      * string first against the float_real without the \n and
3808                      * then against the full float_real with the string.  We
3809                      * have to watch out for cases where the string might be
3810                      * smaller than the float_real or the float_real without
3811                      * the \n. */
3812                     char *checkpos= strend - len;
3813                     DEBUG_OPTIMISE_r(
3814                         Perl_re_printf( aTHX_
3815                             "%sChecking for float_real.%s\n",
3816                             PL_colors[4], PL_colors[5]));
3817                     if (checkpos + 1 < strbeg) {
3818                         /* can't match, even if we remove the trailing \n
3819                          * string is too short to match */
3820                         DEBUG_EXECUTE_r(
3821                             Perl_re_printf( aTHX_
3822                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3823                                 PL_colors[4], PL_colors[5]));
3824                         goto phooey;
3825                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3826                         /* can match, the end of the string matches without the
3827                          * "\n" */
3828                         last = checkpos + 1;
3829                     } else if (checkpos < strbeg) {
3830                         /* cant match, string is too short when the "\n" is
3831                          * included */
3832                         DEBUG_EXECUTE_r(
3833                             Perl_re_printf( aTHX_
3834                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3835                                 PL_colors[4], PL_colors[5]));
3836                         goto phooey;
3837                     } else if (!multiline) {
3838                         /* non multiline match, so compare with the "\n" at the
3839                          * end of the string */
3840                         if (memEQ(checkpos, little, len)) {
3841                             last= checkpos;
3842                         } else {
3843                             DEBUG_EXECUTE_r(
3844                                 Perl_re_printf( aTHX_
3845                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3846                                     PL_colors[4], PL_colors[5]));
3847                             goto phooey;
3848                         }
3849                     } else {
3850                         /* multiline match, so we have to search for a place
3851                          * where the full string is located */
3852                         goto find_last;
3853                     }
3854             } else {
3855                   find_last:
3856                     if (len)
3857                         last = rninstr(s, strend, little, little + len);
3858                     else
3859                         last = strend;  /* matching "$" */
3860             }
3861             if (!last) {
3862                 /* at one point this block contained a comment which was
3863                  * probably incorrect, which said that this was a "should not
3864                  * happen" case.  Even if it was true when it was written I am
3865                  * pretty sure it is not anymore, so I have removed the comment
3866                  * and replaced it with this one. Yves */
3867                 DEBUG_EXECUTE_r(
3868                     Perl_re_printf( aTHX_
3869                         "%sString does not contain required substring, cannot match.%s\n",
3870                         PL_colors[4], PL_colors[5]
3871                     ));
3872                 goto phooey;
3873             }
3874             dontbother = strend - last + prog->float_min_offset;
3875         }
3876         if (minlen && (dontbother < minlen))
3877             dontbother = minlen - 1;
3878         strend -= dontbother;              /* this one's always in bytes! */
3879         /* We don't know much -- general case. */
3880         if (utf8_target) {
3881             for (;;) {
3882                 if (regtry(reginfo, &s))
3883                     goto got_it;
3884                 if (s >= strend)
3885                     break;
3886                 s += UTF8SKIP(s);
3887             };
3888         }
3889         else {
3890             do {
3891                 if (regtry(reginfo, &s))
3892                     goto got_it;
3893             } while (s++ < strend);
3894         }
3895     }
3896
3897     /* Failure. */
3898     goto phooey;
3899
3900   got_it:
3901     /* s/// doesn't like it if $& is earlier than where we asked it to
3902      * start searching (which can happen on something like /.\G/) */
3903     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3904             && (prog->offs[0].start < stringarg - strbeg))
3905     {
3906         /* this should only be possible under \G */
3907         assert(prog->intflags & PREGf_GPOS_SEEN);
3908         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3909             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3910         goto phooey;
3911     }
3912
3913     DEBUG_BUFFERS_r(
3914         if (swap)
3915             Perl_re_exec_indentf( aTHX_
3916                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3917                 0,
3918                 PTR2UV(prog),
3919                 PTR2UV(swap)
3920             );
3921     );
3922     Safefree(swap);
3923
3924     /* clean up; this will trigger destructors that will free all slabs
3925      * above the current one, and cleanup the regmatch_info_aux
3926      * and regmatch_info_aux_eval sructs */
3927
3928     LEAVE_SCOPE(oldsave);
3929
3930     if (RXp_PAREN_NAMES(prog))
3931         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3932
3933     /* make sure $`, $&, $', and $digit will work later */
3934     if ( !(flags & REXEC_NOT_FIRST) )
3935         S_reg_set_capture_string(aTHX_ rx,
3936                                     strbeg, reginfo->strend,
3937                                     sv, flags, utf8_target);
3938
3939     return 1;
3940
3941   phooey:
3942     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3943                           PL_colors[4], PL_colors[5]));
3944
3945     /* clean up; this will trigger destructors that will free all slabs
3946      * above the current one, and cleanup the regmatch_info_aux
3947      * and regmatch_info_aux_eval sructs */
3948
3949     LEAVE_SCOPE(oldsave);
3950
3951     if (swap) {
3952         /* we failed :-( roll it back */
3953         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3954             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3955             0,
3956             PTR2UV(prog),
3957             PTR2UV(prog->offs),
3958             PTR2UV(swap)
3959         ));
3960         Safefree(prog->offs);
3961         prog->offs = swap;
3962     }
3963     return 0;
3964 }
3965
3966
3967 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3968  * Do inc before dec, in case old and new rex are the same */
3969 #define SET_reg_curpm(Re2)                          \
3970     if (reginfo->info_aux_eval) {                   \
3971         (void)ReREFCNT_inc(Re2);                    \
3972         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3973         PM_SETRE((PL_reg_curpm), (Re2));            \
3974     }
3975
3976
3977 /*
3978  - regtry - try match at specific point
3979  */
3980 STATIC bool                     /* 0 failure, 1 success */
3981 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3982 {
3983     CHECKPOINT lastcp;
3984     REGEXP *const rx = reginfo->prog;
3985     regexp *const prog = ReANY(rx);
3986     SSize_t result;
3987 #ifdef DEBUGGING
3988     U32 depth = 0; /* used by REGCP_SET */
3989 #endif
3990     RXi_GET_DECL(prog,progi);
3991     GET_RE_DEBUG_FLAGS_DECL;
3992
3993     PERL_ARGS_ASSERT_REGTRY;
3994
3995     reginfo->cutpoint=NULL;
3996
3997     prog->offs[0].start = *startposp - reginfo->strbeg;
3998     prog->lastparen = 0;
3999     prog->lastcloseparen = 0;
4000
4001     /* XXXX What this code is doing here?!!!  There should be no need
4002        to do this again and again, prog->lastparen should take care of
4003        this!  --ilya*/
4004
4005     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4006      * Actually, the code in regcppop() (which Ilya may be meaning by
4007      * prog->lastparen), is not needed at all by the test suite
4008      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4009      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4010      * Meanwhile, this code *is* needed for the
4011      * above-mentioned test suite tests to succeed.  The common theme
4012      * on those tests seems to be returning null fields from matches.
4013      * --jhi updated by dapm */
4014
4015     /* After encountering a variant of the issue mentioned above I think
4016      * the point Ilya was making is that if we properly unwind whenever
4017      * we set lastparen to a smaller value then we should not need to do
4018      * this every time, only when needed. So if we have tests that fail if
4019      * we remove this, then it suggests somewhere else we are improperly
4020      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4021      * places it is called, and related regcp() routines. - Yves */
4022 #if 1
4023     if (prog->nparens) {
4024         regexp_paren_pair *pp = prog->offs;
4025         I32 i;
4026         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4027             ++pp;
4028             pp->start = -1;
4029             pp->end = -1;
4030         }
4031     }
4032 #endif
4033     REGCP_SET(lastcp);
4034     result = regmatch(reginfo, *startposp, progi->program + 1);
4035     if (result != -1) {
4036         prog->offs[0].end = result;
4037         return 1;
4038     }
4039     if (reginfo->cutpoint)
4040         *startposp= reginfo->cutpoint;
4041     REGCP_UNWIND(lastcp);
4042     return 0;
4043 }
4044
4045
4046 #define sayYES goto yes
4047 #define sayNO goto no
4048 #define sayNO_SILENT goto no_silent
4049
4050 /* we dont use STMT_START/END here because it leads to
4051    "unreachable code" warnings, which are bogus, but distracting. */
4052 #define CACHEsayNO \
4053     if (ST.cache_mask) \
4054        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
4055     sayNO
4056
4057 /* this is used to determine how far from the left messages like
4058    'failed...' are printed in regexec.c. It should be set such that
4059    messages are inline with the regop output that created them.
4060 */
4061 #define REPORT_CODE_OFF 29
4062 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4063 #ifdef DEBUGGING
4064 int
4065 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4066 {
4067     va_list ap;
4068     int result;
4069     PerlIO *f= Perl_debug_log;
4070     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4071     va_start(ap, depth);
4072     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4073     result = PerlIO_vprintf(f, fmt, ap);
4074     va_end(ap);
4075     return result;
4076 }
4077 #endif /* DEBUGGING */
4078
4079
4080 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4081 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4082 #define CHRTEST_NOT_A_CP_1 -999
4083 #define CHRTEST_NOT_A_CP_2 -998
4084
4085 /* grab a new slab and return the first slot in it */
4086
4087 STATIC regmatch_state *
4088 S_push_slab(pTHX)
4089 {
4090     regmatch_slab *s = PL_regmatch_slab->next;
4091     if (!s) {
4092         Newx(s, 1, regmatch_slab);
4093         s->prev = PL_regmatch_slab;
4094         s->next = NULL;
4095         PL_regmatch_slab->next = s;
4096     }
4097     PL_regmatch_slab = s;
4098     return SLAB_FIRST(s);
4099 }
4100
4101
4102 /* push a new state then goto it */
4103
4104 #define PUSH_STATE_GOTO(state, node, input) \
4105     pushinput = input; \
4106     scan = node; \
4107     st->resume_state = state; \
4108     goto push_state;
4109
4110 /* push a new state with success backtracking, then goto it */
4111
4112 #define PUSH_YES_STATE_GOTO(state, node, input) \
4113     pushinput = input; \
4114     scan = node; \
4115     st->resume_state = state; \
4116     goto push_yes_state;
4117
4118
4119
4120
4121 /*
4122
4123 regmatch() - main matching routine
4124
4125 This is basically one big switch statement in a loop. We execute an op,
4126 set 'next' to point the next op, and continue. If we come to a point which
4127 we may need to backtrack to on failure such as (A|B|C), we push a
4128 backtrack state onto the backtrack stack. On failure, we pop the top
4129 state, and re-enter the loop at the state indicated. If there are no more
4130 states to pop, we return failure.
4131
4132 Sometimes we also need to backtrack on success; for example /A+/, where
4133 after successfully matching one A, we need to go back and try to
4134 match another one; similarly for lookahead assertions: if the assertion
4135 completes successfully, we backtrack to the state just before the assertion
4136 and then carry on.  In these cases, the pushed state is marked as
4137 'backtrack on success too'. This marking is in fact done by a chain of
4138 pointers, each pointing to the previous 'yes' state. On success, we pop to
4139 the nearest yes state, discarding any intermediate failure-only states.
4140 Sometimes a yes state is pushed just to force some cleanup code to be
4141 called at the end of a successful match or submatch; e.g. (??{$re}) uses
4142 it to free the inner regex.
4143
4144 Note that failure backtracking rewinds the cursor position, while
4145 success backtracking leaves it alone.
4146
4147 A pattern is complete when the END op is executed, while a subpattern
4148 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
4149 ops trigger the "pop to last yes state if any, otherwise return true"
4150 behaviour.
4151
4152 A common convention in this function is to use A and B to refer to the two
4153 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
4154 the subpattern to be matched possibly multiple times, while B is the entire
4155 rest of the pattern. Variable and state names reflect this convention.
4156
4157 The states in the main switch are the union of ops and failure/success of
4158 substates associated with with that op.  For example, IFMATCH is the op
4159 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
4160 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
4161 successfully matched A and IFMATCH_A_fail is a state saying that we have
4162 just failed to match A. Resume states always come in pairs. The backtrack
4163 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
4164 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
4165 on success or failure.
4166
4167 The struct that holds a backtracking state is actually a big union, with
4168 one variant for each major type of op. The variable st points to the
4169 top-most backtrack struct. To make the code clearer, within each
4170 block of code we #define ST to alias the relevant union.
4171
4172 Here's a concrete example of a (vastly oversimplified) IFMATCH
4173 implementation:
4174
4175     switch (state) {
4176     ....
4177
4178 #define ST st->u.ifmatch
4179
4180     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4181         ST.foo = ...; // some state we wish to save
4182         ...
4183         // push a yes backtrack state with a resume value of
4184         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
4185         // first node of A:
4186         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
4187         // NOTREACHED
4188
4189     case IFMATCH_A: // we have successfully executed A; now continue with B
4190         next = B;
4191         bar = ST.foo; // do something with the preserved value
4192         break;
4193
4194     case IFMATCH_A_fail: // A failed, so the assertion failed
4195         ...;   // do some housekeeping, then ...
4196         sayNO; // propagate the failure
4197
4198 #undef ST
4199
4200     ...
4201     }
4202
4203 For any old-timers reading this who are familiar with the old recursive
4204 approach, the code above is equivalent to:
4205
4206     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4207     {
4208         int foo = ...
4209         ...
4210         if (regmatch(A)) {
4211             next = B;
4212             bar = foo;
4213             break;
4214         }
4215         ...;   // do some housekeeping, then ...
4216         sayNO; // propagate the failure
4217     }
4218
4219 The topmost backtrack state, pointed to by st, is usually free. If you
4220 want to claim it, populate any ST.foo fields in it with values you wish to
4221 save, then do one of
4222
4223         PUSH_STATE_GOTO(resume_state, node, newinput);
4224         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
4225
4226 which sets that backtrack state's resume value to 'resume_state', pushes a
4227 new free entry to the top of the backtrack stack, then goes to 'node'.
4228 On backtracking, the free slot is popped, and the saved state becomes the
4229 new free state. An ST.foo field in this new top state can be temporarily
4230 accessed to retrieve values, but once the main loop is re-entered, it
4231 becomes available for reuse.
4232
4233 Note that the depth of the backtrack stack constantly increases during the
4234 left-to-right execution of the pattern, rather than going up and down with
4235 the pattern nesting. For example the stack is at its maximum at Z at the
4236 end of the pattern, rather than at X in the following:
4237
4238     /(((X)+)+)+....(Y)+....Z/
4239
4240 The only exceptions to this are lookahead/behind assertions and the cut,
4241 (?>A), which pop all the backtrack states associated with A before
4242 continuing.
4243
4244 Backtrack state structs are allocated in slabs of about 4K in size.
4245 PL_regmatch_state and st always point to the currently active state,
4246 and PL_regmatch_slab points to the slab currently containing
4247 PL_regmatch_state.  The first time regmatch() is called, the first slab is
4248 allocated, and is never freed until interpreter destruction. When the slab
4249 is full, a new one is allocated and chained to the end. At exit from
4250 regmatch(), slabs allocated since entry are freed.
4251
4252 */
4253
4254
4255 #define DEBUG_STATE_pp(pp)                                  \
4256     DEBUG_STATE_r({                                         \
4257         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
4258         Perl_re_printf( aTHX_                                           \
4259             "%*s" pp " %s%s%s%s%s\n",                       \
4260             INDENT_CHARS(depth), "",                        \
4261             PL_reg_name[st->resume_state],                  \
4262             ((st==yes_state||st==mark_state) ? "[" : ""),   \
4263             ((st==yes_state) ? "Y" : ""),                   \
4264             ((st==mark_state) ? "M" : ""),                  \
4265             ((st==yes_state||st==mark_state) ? "]" : "")    \
4266         );                                                  \
4267     });
4268
4269
4270 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4271
4272 #ifdef DEBUGGING
4273
4274 STATIC void
4275 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4276     const char *start, const char *end, const char *blurb)
4277 {
4278     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4279
4280     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4281
4282     if (!PL_colorset)
4283             reginitcolors();
4284     {
4285         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4286             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4287
4288         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4289             start, end - start, PL_dump_re_max_len);
4290
4291         Perl_re_printf( aTHX_
4292             "%s%s REx%s %s against %s\n",
4293                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4294
4295         if (utf8_target||utf8_pat)
4296             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4297                 utf8_pat ? "pattern" : "",
4298                 utf8_pat && utf8_target ? " and " : "",
4299                 utf8_target ? "string" : ""
4300             );
4301     }
4302 }
4303
4304 STATIC void
4305 S_dump_exec_pos(pTHX_ const char *locinput,
4306                       const regnode *scan,
4307                       const char *loc_regeol,
4308                       const char *loc_bostr,
4309                       const char *loc_reg_starttry,
4310                       const bool utf8_target,
4311                       const U32 depth
4312                 )
4313 {
4314     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4315     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4316     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4317     /* The part of the string before starttry has one color
4318        (pref0_len chars), between starttry and current
4319        position another one (pref_len - pref0_len chars),
4320        after the current position the third one.
4321        We assume that pref0_len <= pref_len, otherwise we
4322        decrease pref0_len.  */
4323     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4324         ? (5 + taill) - l : locinput - loc_bostr;
4325     int pref0_len;
4326
4327     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4328
4329     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4330         pref_len++;
4331     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4332     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4333         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4334               ? (5 + taill) - pref_len : loc_regeol - locinput);
4335     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4336         l--;
4337     if (pref0_len < 0)
4338         pref0_len = 0;
4339     if (pref0_len > pref_len)
4340         pref0_len = pref_len;
4341     {
4342         const int is_uni = utf8_target ? 1 : 0;
4343
4344         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4345             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4346
4347         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4348                     (locinput - pref_len + pref0_len),
4349                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4350
4351         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4352                     locinput, loc_regeol - locinput, 10, 0, 1);
4353
4354         const STRLEN tlen=len0+len1+len2;
4355         Perl_re_printf( aTHX_
4356                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4357                     (IV)(locinput - loc_bostr),
4358                     len0, s0,
4359                     len1, s1,
4360                     (docolor ? "" : "> <"),
4361                     len2, s2,
4362                     (int)(tlen > 19 ? 0 :  19 - tlen),
4363                     "",
4364                     depth);
4365     }
4366 }
4367
4368 #endif
4369
4370 /* reg_check_named_buff_matched()
4371  * Checks to see if a named buffer has matched. The data array of
4372  * buffer numbers corresponding to the buffer is expected to reside
4373  * in the regexp->data->data array in the slot stored in the ARG() of
4374  * node involved. Note that this routine doesn't actually care about the
4375  * name, that information is not preserved from compilation to execution.
4376  * Returns the index of the leftmost defined buffer with the given name
4377  * or 0 if non of the buffers matched.
4378  */
4379 STATIC I32
4380 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4381 {
4382     I32 n;
4383     RXi_GET_DECL(rex,rexi);
4384     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4385     I32 *nums=(I32*)SvPVX(sv_dat);
4386
4387     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4388
4389     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4390         if ((I32)rex->lastparen >= nums[n] &&
4391             rex->offs[nums[n]].end != -1)
4392         {
4393             return nums[n];
4394         }
4395     }
4396     return 0;
4397 }
4398
4399
4400 static bool
4401 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4402         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4403 {
4404     /* This function determines if there are one or two characters that match
4405      * the first character of the passed-in EXACTish node <text_node>, and if
4406      * so, returns them in the passed-in pointers.
4407      *
4408      * If it determines that no possible character in the target string can
4409      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4410      * the first character in <text_node> requires UTF-8 to represent, and the
4411      * target string isn't in UTF-8.)
4412      *
4413      * If there are more than two characters that could match the beginning of
4414      * <text_node>, or if more context is required to determine a match or not,
4415      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4416      *
4417      * The motiviation behind this function is to allow the caller to set up
4418      * tight loops for matching.  If <text_node> is of type EXACT, there is
4419      * only one possible character that can match its first character, and so
4420      * the situation is quite simple.  But things get much more complicated if
4421      * folding is involved.  It may be that the first character of an EXACTFish
4422      * node doesn't participate in any possible fold, e.g., punctuation, so it
4423      * can be matched only by itself.  The vast majority of characters that are
4424      * in folds match just two things, their lower and upper-case equivalents.
4425      * But not all are like that; some have multiple possible matches, or match
4426      * sequences of more than one character.  This function sorts all that out.
4427      *
4428      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4429      * loop of trying to match A*, we know we can't exit where the thing
4430      * following it isn't a B.  And something can't be a B unless it is the
4431      * beginning of B.  By putting a quick test for that beginning in a tight
4432      * loop, we can rule out things that can't possibly be B without having to
4433      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4434      * character, we can make a tight loop matching A*, using the outputs of
4435      * this function.
4436      *
4437      * If the target string to match isn't in UTF-8, and there aren't
4438      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4439      * the one or two possible octets (which are characters in this situation)
4440      * that can match.  In all cases, if there is only one character that can
4441      * match, *<c1p> and *<c2p> will be identical.
4442      *
4443      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4444      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4445      * can match the beginning of <text_node>.  They should be declared with at
4446      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4447      * undefined what these contain.)  If one or both of the buffers are
4448      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4449      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4450      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4451      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4452      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4453
4454     const bool utf8_target = reginfo->is_utf8_target;
4455
4456     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4457     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4458     bool use_chrtest_void = FALSE;
4459     const bool is_utf8_pat = reginfo->is_utf8_pat;
4460
4461     /* Used when we have both utf8 input and utf8 output, to avoid converting
4462      * to/from code points */
4463     bool utf8_has_been_setup = FALSE;
4464
4465     dVAR;
4466
4467     U8 *pat = (U8*)STRING(text_node);
4468     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4469
4470     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4471
4472         /* In an exact node, only one thing can be matched, that first
4473          * character.  If both the pat and the target are UTF-8, we can just
4474          * copy the input to the output, avoiding finding the code point of
4475          * that character */
4476         if (!is_utf8_pat) {
4477             c2 = c1 = *pat;
4478         }
4479         else if (utf8_target) {
4480             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4481             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4482             utf8_has_been_setup = TRUE;
4483         }
4484         else {
4485             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4486         }
4487     }
4488     else { /* an EXACTFish node */
4489         U8 *pat_end = pat + STR_LEN(text_node);
4490
4491         /* An EXACTFL node has at least some characters unfolded, because what
4492          * they match is not known until now.  So, now is the time to fold
4493          * the first few of them, as many as are needed to determine 'c1' and
4494          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4495          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4496          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4497          * need to fold as many characters as a single character can fold to,
4498          * so that later we can check if the first ones are such a multi-char
4499          * fold.  But, in such a pattern only locale-problematic characters
4500          * aren't folded, so we can skip this completely if the first character
4501          * in the node isn't one of the tricky ones */
4502         if (OP(text_node) == EXACTFL) {
4503
4504             if (! is_utf8_pat) {
4505                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4506                 {
4507                     folded[0] = folded[1] = 's';
4508                     pat = folded;
4509                     pat_end = folded + 2;
4510                 }
4511             }
4512             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4513                 U8 *s = pat;
4514                 U8 *d = folded;
4515                 int i;
4516
4517                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4518                     if (isASCII(*s)) {
4519                         *(d++) = (U8) toFOLD_LC(*s);
4520                         s++;
4521                     }
4522                     else {
4523                         STRLEN len;
4524                         _toFOLD_utf8_flags(s,
4525                                            pat_end,
4526                                            d,
4527                                            &len,
4528                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4529                         d += len;
4530                         s += UTF8SKIP(s);
4531                     }
4532                 }
4533
4534                 pat = folded;
4535                 pat_end = d;
4536             }
4537         }
4538
4539         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4540              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4541         {
4542             /* Multi-character folds require more context to sort out.  Also
4543              * PL_utf8_foldclosures used below doesn't handle them, so have to
4544              * be handled outside this routine */
4545             use_chrtest_void = TRUE;
4546         }
4547         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4548             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4549             if (c1 > 255) {
4550                 const unsigned int * remaining_folds_to_list;
4551                 unsigned int first_folds_to;
4552
4553                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4554                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4555                 Size_t folds_to_count = _inverse_folds(c1,
4556                                                      &first_folds_to,
4557                                                      &remaining_folds_to_list);
4558                 if (folds_to_count == 0) {
4559                     c2 = c1;    /* there is only a single character that could
4560                                    match */
4561                 }
4562                 else if (folds_to_count != 1) {
4563                     /* If there aren't exactly two folds to this (itself and
4564                      * another), it is outside the scope of this function */
4565                     use_chrtest_void = TRUE;
4566                 }
4567                 else {  /* There are two.  We already have one, get the other */
4568                     c2 = first_folds_to;
4569
4570                     /* Folds that cross the 255/256 boundary are forbidden if
4571                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4572                      * ASCIII.  The only other match to c1 is c2, and since c1
4573                      * is above 255, c2 better be as well under these
4574                      * circumstances.  If it isn't, it means the only legal
4575                      * match of c1 is itself. */
4576                     if (    c2 < 256
4577                         && (   (   OP(text_node) == EXACTFL
4578                                 && ! IN_UTF8_CTYPE_LOCALE)
4579                             || ((     OP(text_node) == EXACTFAA
4580                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4581                                 && (isASCII(c1) || isASCII(c2)))))
4582                     {
4583                         c2 = c1;
4584                     }
4585                 }
4586             }
4587             else /* Here, c1 is <= 255 */
4588                 if (utf8_target
4589                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4590                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4591                     && ((OP(text_node) != EXACTFAA
4592                         && OP(text_node) != EXACTFAA_NO_TRIE)
4593                         || ! isASCII(c1)))
4594             {
4595                 /* Here, there could be something above Latin1 in the target
4596                  * which folds to this character in the pattern.  All such
4597                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4598                  * than two characters involved in their folds, so are outside
4599                  * the scope of this function */
4600                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4601                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4602                 }
4603                 else {
4604                     use_chrtest_void = TRUE;
4605                 }
4606             }
4607             else { /* Here nothing above Latin1 can fold to the pattern
4608                       character */
4609                 switch (OP(text_node)) {
4610
4611                     case EXACTFL:   /* /l rules */
4612                         c2 = PL_fold_locale[c1];
4613                         break;
4614
4615                     case EXACTF:   /* This node only generated for non-utf8
4616                                     patterns */
4617                         assert(! is_utf8_pat);
4618                         if (! utf8_target) {    /* /d rules */
4619                             c2 = PL_fold[c1];
4620                             break;
4621                         }
4622                         /* FALLTHROUGH */
4623                         /* /u rules for all these.  This happens to work for
4624                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4625                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4626                                                 non-utf8 patterns */
4627                         assert(! is_utf8_pat);
4628                         /* FALLTHROUGH */
4629                     case EXACTFAA:
4630                     case EXACTFU_SS:
4631                     case EXACTFU:
4632                         c2 = PL_fold_latin1[c1];
4633                         break;
4634
4635                     default:
4636                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4637                         NOT_REACHED; /* NOTREACHED */
4638                 }
4639             }
4640         }
4641     }
4642
4643     /* Here have figured things out.  Set up the returns */
4644     if (use_chrtest_void) {
4645         *c2p = *c1p = CHRTEST_VOID;
4646     }
4647     else if (utf8_target) {
4648         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4649             uvchr_to_utf8(c1_utf8, c1);
4650             uvchr_to_utf8(c2_utf8, c2);
4651         }
4652
4653         /* Invariants are stored in both the utf8 and byte outputs; Use
4654          * negative numbers otherwise for the byte ones.  Make sure that the
4655          * byte ones are the same iff the utf8 ones are the same */
4656         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4657         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4658                 ? *c2_utf8
4659                 : (c1 == c2)
4660                   ? CHRTEST_NOT_A_CP_1
4661                   : CHRTEST_NOT_A_CP_2;
4662     }
4663     else if (c1 > 255) {
4664        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4665                            can represent */
4666            return FALSE;
4667        }
4668
4669        *c1p = *c2p = c2;    /* c2 is the only representable value */
4670     }
4671     else {  /* c1 is representable; see about c2 */
4672        *c1p = c1;
4673        *c2p = (c2 < 256) ? c2 : c1;
4674     }
4675
4676     return TRUE;
4677 }
4678
4679 STATIC bool
4680 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4681 {
4682     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4683      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4684
4685     PERL_ARGS_ASSERT_ISGCB;
4686
4687     switch (GCB_table[before][after]) {
4688         case GCB_BREAKABLE:
4689             return TRUE;
4690
4691         case GCB_NOBREAK:
4692             return FALSE;
4693
4694         case GCB_RI_then_RI:
4695             {
4696                 int RI_count = 1;
4697                 U8 * temp_pos = (U8 *) curpos;
4698
4699                 /* Do not break within emoji flag sequences. That is, do not
4700                  * break between regional indicator (RI) symbols if there is an
4701                  * odd number of RI characters before the break point.
4702                  *  GB12   sot (RI RI)* RI × RI
4703                  *  GB13 [^RI] (RI RI)* RI × RI */
4704
4705                 while (backup_one_GCB(strbeg,
4706                                     &temp_pos,
4707                                     utf8_target) == GCB_Regional_Indicator)
4708                 {
4709                     RI_count++;
4710                 }
4711
4712                 return RI_count % 2 != 1;
4713             }
4714
4715         case GCB_EX_then_EM:
4716
4717             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4718             {
4719                 U8 * temp_pos = (U8 *) curpos;
4720                 GCB_enum prev;
4721
4722                 do {
4723                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4724                 }
4725                 while (prev == GCB_Extend);
4726
4727                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4728             }
4729
4730         case GCB_Maybe_Emoji_NonBreak:
4731
4732             {
4733
4734             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4735               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4736               */
4737                 U8 * temp_pos = (U8 *) curpos;
4738                 GCB_enum prev;
4739
4740                 do {
4741                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4742                 }
4743                 while (prev == GCB_Extend);
4744
4745                 return prev != GCB_XPG_XX;
4746             }
4747
4748         default:
4749             break;
4750     }
4751
4752 #ifdef DEBUGGING
4753     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4754                                   before, after, GCB_table[before][after]);
4755     assert(0);
4756 #endif
4757     return TRUE;
4758 }
4759
4760 STATIC GCB_enum
4761 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4762 {
4763     GCB_enum gcb;
4764
4765     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4766
4767     if (*curpos < strbeg) {
4768         return GCB_EDGE;
4769     }
4770
4771     if (utf8_target) {
4772         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4773         U8 * prev_prev_char_pos;
4774
4775         if (! prev_char_pos) {
4776             return GCB_EDGE;
4777         }
4778
4779         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4780             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4781             *curpos = prev_char_pos;
4782             prev_char_pos = prev_prev_char_pos;
4783         }
4784         else {
4785             *curpos = (U8 *) strbeg;
4786             return GCB_EDGE;
4787         }
4788     }
4789     else {
4790         if (*curpos - 2 < strbeg) {
4791             *curpos = (U8 *) strbeg;
4792             return GCB_EDGE;
4793         }
4794         (*curpos)--;
4795         gcb = getGCB_VAL_CP(*(*curpos - 1));
4796     }
4797
4798     return gcb;
4799 }
4800
4801 /* Combining marks attach to most classes that precede them, but this defines
4802  * the exceptions (from TR14) */
4803 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4804                                      || prev == LB_Mandatory_Break      \
4805                                      || prev == LB_Carriage_Return      \
4806                                      || prev == LB_Line_Feed            \
4807                                      || prev == LB_Next_Line            \
4808                                      || prev == LB_Space                \
4809                                      || prev == LB_ZWSpace))
4810
4811 STATIC bool
4812 S_isLB(pTHX_ LB_enum before,
4813              LB_enum after,
4814              const U8 * const strbeg,
4815              const U8 * const curpos,
4816              const U8 * const strend,
4817              const bool utf8_target)
4818 {
4819     U8 * temp_pos = (U8 *) curpos;
4820     LB_enum prev = before;
4821
4822     /* Is the boundary between 'before' and 'after' line-breakable?
4823      * Most of this is just a table lookup of a generated table from Unicode
4824      * rules.  But some rules require context to decide, and so have to be
4825      * implemented in code */
4826
4827     PERL_ARGS_ASSERT_ISLB;
4828
4829     /* Rule numbers in the comments below are as of Unicode 9.0 */
4830
4831   redo:
4832     before = prev;
4833     switch (LB_table[before][after]) {
4834         case LB_BREAKABLE:
4835             return TRUE;
4836
4837         case LB_NOBREAK:
4838         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4839             return FALSE;
4840
4841         case LB_SP_foo + LB_BREAKABLE:
4842         case LB_SP_foo + LB_NOBREAK:
4843         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4844
4845             /* When we have something following a SP, we have to look at the
4846              * context in order to know what to do.
4847              *
4848              * SP SP should not reach here because LB7: Do not break before
4849              * spaces.  (For two spaces in a row there is nothing that
4850              * overrides that) */
4851             assert(after != LB_Space);
4852
4853             /* Here we have a space followed by a non-space.  Mostly this is a
4854              * case of LB18: "Break after spaces".  But there are complications
4855              * as the handling of spaces is somewhat tricky.  They are in a
4856              * number of rules, which have to be applied in priority order, but
4857              * something earlier in the string can cause a rule to be skipped
4858              * and a lower priority rule invoked.  A prime example is LB7 which
4859              * says don't break before a space.  But rule LB8 (lower priority)
4860              * says that the first break opportunity after a ZW is after any
4861              * span of spaces immediately after it.  If a ZW comes before a SP
4862              * in the input, rule LB8 applies, and not LB7.  Other such rules
4863              * involve combining marks which are rules 9 and 10, but they may
4864              * override higher priority rules if they come earlier in the
4865              * string.  Since we're doing random access into the middle of the
4866              * string, we have to look for rules that should get applied based
4867              * on both string position and priority.  Combining marks do not
4868              * attach to either ZW nor SP, so we don't have to consider them
4869              * until later.
4870              *
4871              * To check for LB8, we have to find the first non-space character
4872              * before this span of spaces */
4873             do {
4874                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4875             }
4876             while (prev == LB_Space);
4877
4878             /* LB8 Break before any character following a zero-width space,
4879              * even if one or more spaces intervene.
4880              *      ZW SP* ÷
4881              * So if we have a ZW just before this span, and to get here this
4882              * is the final space in the span. */
4883             if (prev == LB_ZWSpace) {
4884                 return TRUE;
4885             }
4886
4887             /* Here, not ZW SP+.  There are several rules that have higher
4888              * priority than LB18 and can be resolved now, as they don't depend
4889              * on anything earlier in the string (except ZW, which we have
4890              * already handled).  One of these rules is LB11 Do not break
4891              * before Word joiner, but we have specially encoded that in the
4892              * lookup table so it is caught by the single test below which
4893              * catches the other ones. */
4894             if (LB_table[LB_Space][after] - LB_SP_foo
4895                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4896             {
4897                 return FALSE;
4898             }
4899
4900             /* If we get here, we have to XXX consider combining marks. */
4901             if (prev == LB_Combining_Mark) {
4902
4903                 /* What happens with these depends on the character they
4904                  * follow.  */
4905                 do {
4906                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4907                 }
4908                 while (prev == LB_Combining_Mark);
4909
4910                 /* Most times these attach to and inherit the characteristics
4911                  * of that character, but not always, and when not, they are to
4912                  * be treated as AL by rule LB10. */
4913                 if (! LB_CM_ATTACHES_TO(prev)) {
4914                     prev = LB_Alphabetic;
4915                 }
4916             }
4917
4918             /* Here, we have the character preceding the span of spaces all set
4919              * up.  We follow LB18: "Break after spaces" unless the table shows
4920              * that is overriden */
4921             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4922
4923         case LB_CM_ZWJ_foo:
4924
4925             /* We don't know how to treat the CM except by looking at the first
4926              * non-CM character preceding it.  ZWJ is treated as CM */
4927             do {
4928                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4929             }
4930             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4931
4932             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4933              * attatches to it, then it inherits the behavior of 'prev'.  If it
4934              * doesn't attach, it is to be treated as an AL */
4935             if (! LB_CM_ATTACHES_TO(prev)) {
4936                 prev = LB_Alphabetic;
4937             }
4938
4939             goto redo;
4940
4941         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4942         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4943
4944             /* LB21a Don't break after Hebrew + Hyphen.
4945              * HL (HY | BA) × */
4946
4947             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4948                                                           == LB_Hebrew_Letter)
4949             {
4950                 return FALSE;
4951             }
4952
4953             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4954
4955         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4956         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4957
4958             /* LB25a (PR | PO) × ( OP | HY )? NU */
4959             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4960                 return FALSE;
4961             }
4962
4963             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4964                                                                 == LB_BREAKABLE;
4965
4966         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4967         case LB_SY_or_IS_then_various + LB_NOBREAK:
4968         {
4969             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4970
4971             LB_enum temp = prev;
4972             do {
4973                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4974             }
4975             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4976             if (temp == LB_Numeric) {
4977                 return FALSE;
4978             }
4979
4980             return LB_table[prev][after] - LB_SY_or_IS_then_various
4981                                                                == LB_BREAKABLE;
4982         }
4983
4984         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4985         case LB_various_then_PO_or_PR + LB_NOBREAK:
4986         {
4987             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4988
4989             LB_enum temp = prev;
4990             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4991             {
4992                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4993             }
4994             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4995                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4996             }
4997             if (temp == LB_Numeric) {
4998                 return FALSE;
4999             }
5000             return LB_various_then_PO_or_PR;
5001         }
5002
5003         case LB_RI_then_RI + LB_NOBREAK:
5004         case LB_RI_then_RI + LB_BREAKABLE:
5005             {
5006                 int RI_count = 1;
5007
5008                 /* LB30a Break between two regional indicator symbols if and
5009                  * only if there are an even number of regional indicators
5010                  * preceding the position of the break.
5011                  *
5012                  *    sot (RI RI)* RI × RI
5013                  *  [^RI] (RI RI)* RI × RI */
5014
5015                 while (backup_one_LB(strbeg,
5016                                      &temp_pos,
5017                                      utf8_target) == LB_Regional_Indicator)
5018                 {
5019                     RI_count++;
5020                 }
5021
5022                 return RI_count % 2 == 0;
5023             }
5024
5025         default:
5026             break;
5027     }
5028
5029 #ifdef DEBUGGING
5030     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5031                                   before, after, LB_table[before][after]);
5032     assert(0);
5033 #endif
5034     return TRUE;
5035 }
5036
5037 STATIC LB_enum
5038 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5039 {
5040     LB_enum lb;
5041
5042     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5043
5044     if (*curpos >= strend) {
5045         return LB_EDGE;
5046     }
5047
5048     if (utf8_target) {
5049         *curpos += UTF8SKIP(*curpos);
5050         if (*curpos >= strend) {
5051             return LB_EDGE;
5052         }
5053         lb = getLB_VAL_UTF8(*curpos, strend);
5054     }
5055     else {
5056         (*curpos)++;
5057         if (*curpos >= strend) {
5058             return LB_EDGE;
5059         }
5060         lb = getLB_VAL_CP(**curpos);
5061     }
5062
5063     return lb;
5064 }
5065
5066 STATIC LB_enum
5067 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5068 {
5069     LB_enum lb;
5070
5071     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5072
5073     if (*curpos < strbeg) {
5074         return LB_EDGE;
5075     }
5076
5077     if (utf8_target) {
5078         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5079         U8 * prev_prev_char_pos;
5080
5081         if (! prev_char_pos) {
5082             return LB_EDGE;
5083         }
5084
5085         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5086             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5087             *curpos = prev_char_pos;
5088             prev_char_pos = prev_prev_char_pos;
5089         }
5090         else {
5091             *curpos = (U8 *) strbeg;
5092             return LB_EDGE;
5093         }
5094     }
5095     else {
5096         if (*curpos - 2 < strbeg) {
5097             *curpos = (U8 *) strbeg;
5098             return LB_EDGE;
5099         }
5100         (*curpos)--;
5101         lb = getLB_VAL_CP(*(*curpos - 1));
5102     }
5103
5104     return lb;
5105 }
5106
5107 STATIC bool
5108 S_isSB(pTHX_ SB_enum before,
5109              SB_enum after,
5110              const U8 * const strbeg,
5111              const U8 * const curpos,
5112              const U8 * const strend,
5113              const bool utf8_target)
5114 {
5115     /* returns a boolean indicating if there is a Sentence Boundary Break
5116      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
5117
5118     U8 * lpos = (U8 *) curpos;
5119     bool has_para_sep = FALSE;
5120     bool has_sp = FALSE;
5121
5122     PERL_ARGS_ASSERT_ISSB;
5123
5124     /* Break at the start and end of text.
5125         SB1.  sot  ÷
5126         SB2.  ÷  eot
5127       But unstated in Unicode is don't break if the text is empty */
5128     if (before == SB_EDGE || after == SB_EDGE) {
5129         return before != after;
5130     }
5131
5132     /* SB 3: Do not break within CRLF. */
5133     if (before == SB_CR && after == SB_LF) {
5134         return FALSE;
5135     }
5136
5137     /* Break after paragraph separators.  CR and LF are considered
5138      * so because Unicode views text as like word processing text where there
5139      * are no newlines except between paragraphs, and the word processor takes
5140      * care of wrapping without there being hard line-breaks in the text *./
5141        SB4.  Sep | CR | LF  ÷ */
5142     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5143         return TRUE;
5144     }
5145
5146     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5147      * (See Section 6.2, Replacing Ignore Rules.)
5148         SB5.  X (Extend | Format)*  →  X */
5149     if (after == SB_Extend || after == SB_Format) {
5150
5151         /* Implied is that the these characters attach to everything
5152          * immediately prior to them except for those separator-type
5153          * characters.  And the rules earlier have already handled the case
5154          * when one of those immediately precedes the extend char */
5155         return FALSE;
5156     }
5157
5158     if (before == SB_Extend || before == SB_Format) {
5159         U8 * temp_pos = lpos;
5160         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5161         if (   backup != SB_EDGE
5162             && backup != SB_Sep
5163             && backup != SB_CR
5164             && backup != SB_LF)
5165         {
5166             before = backup;
5167             lpos = temp_pos;
5168         }
5169
5170         /* Here, both 'before' and 'backup' are these types; implied is that we
5171          * don't break between them */
5172         if (backup == SB_Extend || backup == SB_Format) {
5173             return FALSE;
5174         }
5175     }
5176
5177     /* Do not break after ambiguous terminators like period, if they are
5178      * immediately followed by a number or lowercase letter, if they are
5179      * between uppercase letters, if the first following letter (optionally
5180      * after certain punctuation) is lowercase, or if they are followed by
5181      * "continuation" punctuation such as comma, colon, or semicolon. For
5182      * example, a period may be an abbreviation or numeric period, and thus may
5183      * not mark the end of a sentence.
5184
5185      * SB6. ATerm  ×  Numeric */
5186     if (before == SB_ATerm && after == SB_Numeric) {
5187         return FALSE;
5188     }
5189
5190     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5191     if (before == SB_ATerm && after == SB_Upper) {
5192         U8 * temp_pos = lpos;
5193         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5194         if (backup == SB_Upper || backup == SB_Lower) {
5195             return FALSE;
5196         }
5197     }
5198
5199     /* The remaining rules that aren't the final one, all require an STerm or
5200      * an ATerm after having backed up over some Close* Sp*, and in one case an
5201      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5202      * So do that backup now, setting flags if either Sp or a paragraph
5203      * separator are found */
5204
5205     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5206         has_para_sep = TRUE;
5207         before = backup_one_SB(strbeg, &lpos, utf8_target);
5208     }
5209
5210     if (before == SB_Sp) {
5211         has_sp = TRUE;
5212         do {
5213             before = backup_one_SB(strbeg, &lpos, utf8_target);
5214         }
5215         while (before == SB_Sp);
5216     }
5217
5218     while (before == SB_Close) {
5219         before = backup_one_SB(strbeg, &lpos, utf8_target);
5220     }
5221
5222     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5223      * most cases an STerm */
5224     if (before == SB_STerm || before == SB_ATerm) {
5225
5226         /* So, here the lhs matches
5227          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5228          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5229          * The rules that apply here are:
5230          *
5231          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5232                                            | LF | STerm | ATerm) )* Lower
5233            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5234            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5235            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5236            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5237          */
5238
5239         /* And all but SB11 forbid having seen a paragraph separator */
5240         if (! has_para_sep) {
5241             if (before == SB_ATerm) {          /* SB8 */
5242                 U8 * rpos = (U8 *) curpos;
5243                 SB_enum later = after;
5244
5245                 while (    later != SB_OLetter
5246                         && later != SB_Upper
5247                         && later != SB_Lower
5248                         && later != SB_Sep
5249                         && later != SB_CR
5250                         && later != SB_LF
5251                         && later != SB_STerm
5252                         && later != SB_ATerm
5253                         && later != SB_EDGE)
5254                 {
5255                     later = advance_one_SB(&rpos, strend, utf8_target);
5256                 }
5257                 if (later == SB_Lower) {
5258                     return FALSE;
5259                 }
5260             }
5261
5262             if (   after == SB_SContinue    /* SB8a */
5263                 || after == SB_STerm
5264                 || after == SB_ATerm)
5265             {
5266                 return FALSE;
5267             }
5268
5269             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5270                 if (   after == SB_Close
5271                     || after == SB_Sp
5272                     || after == SB_Sep
5273                     || after == SB_CR
5274                     || after == SB_LF)
5275                 {
5276                     return FALSE;
5277                 }
5278             }
5279
5280             /* SB10.  This and SB9 could probably be combined some way, but khw
5281              * has decided to follow the Unicode rule book precisely for
5282              * simplified maintenance */
5283             if (   after == SB_Sp
5284                 || after == SB_Sep
5285                 || after == SB_CR
5286                 || after == SB_LF)
5287             {
5288                 return FALSE;
5289             }
5290         }
5291
5292         /* SB11.  */
5293         return TRUE;
5294     }
5295
5296     /* Otherwise, do not break.
5297     SB12.  Any  ×  Any */
5298
5299     return FALSE;
5300 }
5301
5302 STATIC SB_enum
5303 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5304 {
5305     SB_enum sb;
5306
5307     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5308
5309     if (*curpos >= strend) {
5310         return SB_EDGE;
5311     }
5312
5313     if (utf8_target) {
5314         do {
5315             *curpos += UTF8SKIP(*curpos);
5316             if (*curpos >= strend) {
5317                 return SB_EDGE;
5318             }
5319             sb = getSB_VAL_UTF8(*curpos, strend);
5320         } while (sb == SB_Extend || sb == SB_Format);
5321     }
5322     else {
5323         do {
5324             (*curpos)++;
5325             if (*curpos >= strend) {
5326                 return SB_EDGE;
5327             }
5328             sb = getSB_VAL_CP(**curpos);
5329         } while (sb == SB_Extend || sb == SB_Format);
5330     }
5331
5332     return sb;
5333 }
5334
5335 STATIC SB_enum
5336 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5337 {
5338     SB_enum sb;
5339
5340     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5341
5342     if (*curpos < strbeg) {
5343         return SB_EDGE;
5344     }
5345
5346     if (utf8_target) {
5347         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5348         if (! prev_char_pos) {
5349             return SB_EDGE;
5350         }
5351
5352         /* Back up over Extend and Format.  curpos is always just to the right
5353          * of the characater whose value we are getting */
5354         do {
5355             U8 * prev_prev_char_pos;
5356             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5357                                                                       strbeg)))
5358             {
5359                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5360                 *curpos = prev_char_pos;
5361                 prev_char_pos = prev_prev_char_pos;
5362             }
5363             else {
5364                 *curpos = (U8 *) strbeg;
5365                 return SB_EDGE;
5366             }
5367         } while (sb == SB_Extend || sb == SB_Format);
5368     }
5369     else {
5370         do {
5371             if (*curpos - 2 < strbeg) {
5372                 *curpos = (U8 *) strbeg;
5373                 return SB_EDGE;
5374             }
5375             (*curpos)--;
5376             sb = getSB_VAL_CP(*(*curpos - 1));
5377         } while (sb == SB_Extend || sb == SB_Format);
5378     }
5379
5380     return sb;
5381 }
5382
5383 STATIC bool
5384 S_isWB(pTHX_ WB_enum previous,
5385              WB_enum before,
5386              WB_enum after,
5387              const U8 * const strbeg,
5388              const U8 * const curpos,
5389              const U8 * const strend,
5390              const bool utf8_target)
5391 {
5392     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5393      *  a Unicode word break, using their published algorithm, but tailored for
5394      *  Perl by treating spans of white space as one unit.  Context may be
5395      *  needed to make this determination.  If the value for the character
5396      *  before 'before' is known, it is passed as 'previous'; otherwise that
5397      *  should be set to WB_UNKNOWN.  The other input parameters give the
5398      *  boundaries and current position in the matching of the string.  That
5399      *  is, 'curpos' marks the position where the character whose wb value is
5400      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5401
5402     U8 * before_pos = (U8 *) curpos;
5403     U8 * after_pos = (U8 *) curpos;
5404     WB_enum prev = before;
5405     WB_enum next;
5406
5407     PERL_ARGS_ASSERT_ISWB;
5408
5409     /* Rule numbers in the comments below are as of Unicode 9.0 */
5410
5411   redo:
5412     before = prev;
5413     switch (WB_table[before][after]) {
5414         case WB_BREAKABLE:
5415             return TRUE;
5416
5417         case WB_NOBREAK:
5418             return FALSE;
5419
5420         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5421             next = advance_one_WB(&after_pos, strend, utf8_target,
5422                                  FALSE /* Don't skip Extend nor Format */ );
5423             /* A space immediately preceeding an Extend or Format is attached
5424              * to by them, and hence gets separated from previous spaces.
5425              * Otherwise don't break between horizontal white space */
5426             return next == WB_Extend || next == WB_Format;
5427
5428         /* WB4 Ignore Format and Extend characters, except when they appear at
5429          * the beginning of a region of text.  This code currently isn't
5430          * general purpose, but it works as the rules are currently and likely
5431          * to be laid out.  The reason it works is that when 'they appear at
5432          * the beginning of a region of text', the rule is to break before
5433          * them, just like any other character.  Therefore, the default rule
5434          * applies and we don't have to look in more depth.  Should this ever
5435          * change, we would have to have 2 'case' statements, like in the rules
5436          * below, and backup a single character (not spacing over the extend
5437          * ones) and then see if that is one of the region-end characters and
5438          * go from there */
5439         case WB_Ex_or_FO_or_ZWJ_then_foo:
5440             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5441             goto redo;
5442
5443         case WB_DQ_then_HL + WB_BREAKABLE:
5444         case WB_DQ_then_HL + WB_NOBREAK:
5445
5446             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5447
5448             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5449                                                             == WB_Hebrew_Letter)
5450             {
5451                 return FALSE;
5452             }
5453
5454              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5455
5456         case WB_HL_then_DQ + WB_BREAKABLE:
5457         case WB_HL_then_DQ + WB_NOBREAK:
5458
5459             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5460
5461             if (advance_one_WB(&after_pos, strend, utf8_target,
5462                                        TRUE /* Do skip Extend and Format */ )
5463                                                             == WB_Hebrew_Letter)
5464             {
5465                 return FALSE;
5466             }
5467
5468             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5469
5470         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5471         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5472
5473             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5474              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5475
5476             next = advance_one_WB(&after_pos, strend, utf8_target,
5477                                        TRUE /* Do skip Extend and Format */ );
5478
5479             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5480             {
5481                 return FALSE;
5482             }
5483
5484             return WB_table[before][after]
5485                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5486
5487         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5488         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5489
5490             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5491              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5492
5493             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5494             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5495             {
5496                 return FALSE;
5497             }
5498
5499             return WB_table[before][after]
5500                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5501
5502         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5503         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5504
5505             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5506              * */
5507
5508             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5509                                                             == WB_Numeric)
5510             {
5511                 return FALSE;
5512             }
5513
5514             return WB_table[before][after]
5515                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5516
5517         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5518         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5519
5520             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5521
5522             if (advance_one_WB(&after_pos, strend, utf8_target,
5523                                        TRUE /* Do skip Extend and Format */ )
5524                                                             == WB_Numeric)
5525             {
5526                 return FALSE;
5527             }
5528
5529             return WB_table[before][after]
5530                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5531
5532         case WB_RI_then_RI + WB_NOBREAK:
5533         case WB_RI_then_RI + WB_BREAKABLE:
5534             {
5535                 int RI_count = 1;
5536
5537                 /* Do not break within emoji flag sequences. That is, do not
5538                  * break between regional indicator (RI) symbols if there is an
5539                  * odd number of RI characters before the potential break
5540                  * point.
5541                  *
5542                  * WB15   sot (RI RI)* RI × RI
5543                  * WB16 [^RI] (RI RI)* RI × RI */
5544
5545                 while (backup_one_WB(&previous,
5546                                      strbeg,
5547                                      &before_pos,
5548                                      utf8_target) == WB_Regional_Indicator)
5549                 {
5550                     RI_count++;
5551                 }
5552
5553                 return RI_count % 2 != 1;
5554             }
5555
5556         default:
5557             break;
5558     }
5559
5560 #ifdef DEBUGGING
5561     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5562                                   before, after, WB_table[before][after]);
5563     assert(0);
5564 #endif
5565     return TRUE;
5566 }
5567
5568 STATIC WB_enum
5569 S_advance_one_WB(pTHX_ U8 ** curpos,
5570                        const U8 * const strend,
5571                        const bool utf8_target,
5572                        const bool skip_Extend_Format)
5573 {
5574     WB_enum wb;
5575
5576     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5577
5578     if (*curpos >= strend) {
5579         return WB_EDGE;
5580     }
5581
5582     if (utf8_target) {
5583
5584         /* Advance over Extend and Format */
5585         do {
5586             *curpos += UTF8SKIP(*curpos);
5587             if (*curpos >= strend) {
5588                 return WB_EDGE;
5589             }
5590             wb = getWB_VAL_UTF8(*curpos, strend);
5591         } while (    skip_Extend_Format
5592                  && (wb == WB_Extend || wb == WB_Format));
5593     }
5594     else {
5595         do {
5596             (*curpos)++;
5597             if (*curpos >= strend) {
5598                 return WB_EDGE;
5599             }
5600             wb = getWB_VAL_CP(**curpos);
5601         } while (    skip_Extend_Format
5602                  && (wb == WB_Extend || wb == WB_Format));
5603     }
5604
5605     return wb;
5606 }
5607
5608 STATIC WB_enum
5609 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5610 {
5611     WB_enum wb;
5612
5613     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5614
5615     /* If we know what the previous character's break value is, don't have
5616         * to look it up */
5617     if (*previous != WB_UNKNOWN) {
5618         wb = *previous;
5619
5620         /* But we need to move backwards by one */
5621         if (utf8_target) {
5622             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5623             if (! *curpos) {
5624                 *previous = WB_EDGE;
5625                 *curpos = (U8 *) strbeg;
5626             }
5627             else {
5628                 *previous = WB_UNKNOWN;
5629             }
5630         }
5631         else {
5632             (*curpos)--;
5633             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5634         }
5635
5636         /* And we always back up over these three types */
5637         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5638             return wb;
5639         }
5640     }
5641
5642     if (*curpos < strbeg) {
5643         return WB_EDGE;
5644     }
5645
5646     if (utf8_target) {
5647         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5648         if (! prev_char_pos) {
5649             return WB_EDGE;
5650         }
5651
5652         /* Back up over Extend and Format.  curpos is always just to the right
5653          * of the characater whose value we are getting */
5654         do {
5655             U8 * prev_prev_char_pos;
5656             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5657                                                    -1,
5658                                                    strbeg)))
5659             {
5660                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5661                 *curpos = prev_char_pos;
5662                 prev_char_pos = prev_prev_char_pos;
5663             }
5664             else {
5665                 *curpos = (U8 *) strbeg;
5666                 return WB_EDGE;
5667             }
5668         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5669     }
5670     else {
5671         do {
5672             if (*curpos - 2 < strbeg) {
5673                 *curpos = (U8 *) strbeg;
5674                 return WB_EDGE;
5675             }
5676             (*curpos)--;
5677             wb = getWB_VAL_CP(*(*curpos - 1));
5678         } while (wb == WB_Extend || wb == WB_Format);
5679     }
5680
5681     return wb;
5682 }
5683
5684 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5685 (                                                           \
5686     (   ( st )                                         ) && \
5687     (   ( st )->u.eval.close_paren                     ) && \
5688     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5689 )
5690
5691 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5692 (                                                           \
5693     (   ( st )                                         ) && \
5694     (   ( st )->u.eval.close_paren                     ) && \
5695     (   ( expr )                                       ) && \
5696     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5697 )
5698
5699
5700 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5701     (st)->u.eval.close_paren = ( (expr) + 1 )
5702
5703 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5704     (st)->u.eval.close_paren = 0
5705
5706 /* returns -1 on failure, $+[0] on success */
5707 STATIC SSize_t
5708 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5709 {
5710     dVAR;
5711     const bool utf8_target = reginfo->is_utf8_target;
5712     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5713     REGEXP *rex_sv = reginfo->prog;
5714     regexp *rex = ReANY(rex_sv);
5715     RXi_GET_DECL(rex,rexi);
5716     /* the current state. This is a cached copy of PL_regmatch_state */
5717     regmatch_state *st;
5718     /* cache heavy used fields of st in registers */
5719     regnode *scan;
5720     regnode *next;
5721     U32 n = 0;  /* general value; init to avoid compiler warning */
5722     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5723     SSize_t endref = 0; /* offset of end of backref when ln is start */
5724     char *locinput = startpos;
5725     char *pushinput; /* where to continue after a PUSH */
5726     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5727
5728     bool result = 0;        /* return value of S_regmatch */
5729     U32 depth = 0;            /* depth of backtrack stack */
5730     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5731     const U32 max_nochange_depth =
5732         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5733         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5734     regmatch_state *yes_state = NULL; /* state to pop to on success of
5735                                                             subpattern */
5736     /* mark_state piggy backs on the yes_state logic so that when we unwind
5737        the stack on success we can update the mark_state as we go */
5738     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5739     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5740     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5741     U32 state_num;
5742     bool no_final = 0;      /* prevent failure from backtracking? */
5743     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5744     char *startpoint = locinput;
5745     SV *popmark = NULL;     /* are we looking for a mark? */
5746     SV *sv_commit = NULL;   /* last mark name seen in failure */
5747     SV *sv_yes_mark = NULL; /* last mark name we have seen
5748                                during a successful match */
5749     U32 lastopen = 0;       /* last open we saw */
5750     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5751     SV* const oreplsv = GvSVn(PL_replgv);
5752     /* these three flags are set by various ops to signal information to
5753      * the very next op. They have a useful lifetime of exactly one loop
5754      * iteration, and are not preserved or restored by state pushes/pops
5755      */
5756     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5757     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5758     int logical = 0;        /* the following EVAL is:
5759                                 0: (?{...})
5760                                 1: (?(?{...})X|Y)
5761                                 2: (??{...})
5762                                or the following IFMATCH/UNLESSM is:
5763                                 false: plain (?=foo)
5764                                 true:  used as a condition: (?(?=foo))
5765                             */
5766     PAD* last_pad = NULL;
5767     dMULTICALL;
5768     U8 gimme = G_SCALAR;
5769     CV *caller_cv = NULL;       /* who called us */
5770     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5771     U32 maxopenparen = 0;       /* max '(' index seen so far */
5772     int to_complement;  /* Invert the result? */
5773     _char_class_number classnum;
5774     bool is_utf8_pat = reginfo->is_utf8_pat;
5775     bool match = FALSE;
5776     I32 orig_savestack_ix = PL_savestack_ix;
5777     U8 * script_run_begin = NULL;
5778
5779 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5780 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5781 #  define SOLARIS_BAD_OPTIMIZER
5782     const U32 *pl_charclass_dup = PL_charclass;
5783 #  define PL_charclass pl_charclass_dup
5784 #endif
5785
5786 #ifdef DEBUGGING
5787     GET_RE_DEBUG_FLAGS_DECL;
5788 #endif
5789
5790     /* protect against undef(*^R) */
5791     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5792
5793     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5794     multicall_oldcatch = 0;
5795     PERL_UNUSED_VAR(multicall_cop);
5796
5797     PERL_ARGS_ASSERT_REGMATCH;
5798
5799     st = PL_regmatch_state;
5800
5801     /* Note that nextchr is a byte even in UTF */
5802     SET_nextchr;
5803     scan = prog;
5804
5805     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5806             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5807             Perl_re_printf( aTHX_ "regmatch start\n" );
5808     }));
5809
5810     while (scan != NULL) {
5811         next = scan + NEXT_OFF(scan);
5812         if (next == scan)
5813             next = NULL;
5814         state_num = OP(scan);
5815
5816       reenter_switch:
5817         DEBUG_EXECUTE_r(
5818             if (state_num <= REGNODE_MAX) {
5819                 SV * const prop = sv_newmortal();
5820                 regnode *rnext = regnext(scan);
5821
5822                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5823                 regprop(rex, prop, scan, reginfo, NULL);
5824                 Perl_re_printf( aTHX_
5825                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5826                     INDENT_CHARS(depth), "",
5827                     (IV)(scan - rexi->program),
5828                     SvPVX_const(prop),
5829                     (PL_regkind[OP(scan)] == END || !rnext) ?
5830                         0 : (IV)(rnext - rexi->program));
5831             }
5832         );
5833
5834         to_complement = 0;
5835
5836         SET_nextchr;
5837         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5838
5839         switch (state_num) {
5840         case SBOL: /*  /^../ and /\A../  */
5841             if (locinput == reginfo->strbeg)
5842                 break;
5843             sayNO;
5844
5845         case MBOL: /*  /^../m  */
5846             if (locinput == reginfo->strbeg ||
5847                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5848             {
5849                 break;
5850             }
5851             sayNO;
5852
5853         case GPOS: /*  \G  */
5854             if (locinput == reginfo->ganch)
5855                 break;
5856             sayNO;
5857
5858         case KEEPS: /*   \K  */
5859             /* update the startpoint */
5860             st->u.keeper.val = rex->offs[0].start;
5861             rex->offs[0].start = locinput - reginfo->strbeg;
5862             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5863             NOT_REACHED; /* NOTREACHED */
5864
5865         case KEEPS_next_fail:
5866             /* rollback the start point change */
5867             rex->offs[0].start = st->u.keeper.val;
5868             sayNO_SILENT;
5869             NOT_REACHED; /* NOTREACHED */
5870
5871         case MEOL: /* /..$/m  */
5872             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5873                 sayNO;
5874             break;
5875
5876         case SEOL: /* /..$/  */
5877             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5878                 sayNO;
5879             if (reginfo->strend - locinput > 1)
5880                 sayNO;
5881             break;
5882
5883         case EOS: /*  \z  */
5884             if (!NEXTCHR_IS_EOS)
5885                 sayNO;
5886             break;
5887
5888         case SANY: /*  /./s  */
5889             if (NEXTCHR_IS_EOS)
5890                 sayNO;
5891             goto increment_locinput;
5892
5893         case REG_ANY: /*  /./  */
5894             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5895                 sayNO;
5896             goto increment_locinput;
5897
5898
5899 #undef  ST
5900 #define ST st->u.trie
5901         case TRIEC: /* (ab|cd) with known charclass */
5902             /* In this case the charclass data is available inline so
5903                we can fail fast without a lot of extra overhead.
5904              */
5905             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5906                 DEBUG_EXECUTE_r(
5907                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5908                               depth, PL_colors[4], PL_colors[5])
5909                 );
5910                 sayNO_SILENT;
5911                 NOT_REACHED; /* NOTREACHED */
5912             }
5913             /* FALLTHROUGH */
5914         case TRIE:  /* (ab|cd)  */
5915             /* the basic plan of execution of the trie is:
5916              * At the beginning, run though all the states, and
5917              * find the longest-matching word. Also remember the position
5918              * of the shortest matching word. For example, this pattern:
5919              *    1  2 3 4    5
5920              *    ab|a|x|abcd|abc
5921              * when matched against the string "abcde", will generate
5922              * accept states for all words except 3, with the longest
5923              * matching word being 4, and the shortest being 2 (with
5924              * the position being after char 1 of the string).
5925              *
5926              * Then for each matching word, in word order (i.e. 1,2,4,5),
5927              * we run the remainder of the pattern; on each try setting
5928              * the current position to the character following the word,
5929              * returning to try the next word on failure.
5930              *
5931              * We avoid having to build a list of words at runtime by
5932              * using a compile-time structure, wordinfo[].prev, which
5933              * gives, for each word, the previous accepting word (if any).
5934              * In the case above it would contain the mappings 1->2, 2->0,
5935              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5936              * the longest word (4 above), a list of all words, by
5937              * following the list of prev pointers; this gives us the
5938              * unordered list 4,5,1,2. Then given the current word we have
5939              * just tried, we can go through the list and find the
5940              * next-biggest word to try (so if we just failed on word 2,
5941              * the next in the list is 4).
5942              *
5943              * Since at runtime we don't record the matching position in
5944              * the string for each word, we have to work that out for
5945              * each word we're about to process. The wordinfo table holds
5946              * the character length of each word; given that we recorded
5947              * at the start: the position of the shortest word and its
5948              * length in chars, we just need to move the pointer the
5949              * difference between the two char lengths. Depending on
5950              * Unicode status and folding, that's cheap or expensive.
5951              *
5952              * This algorithm is optimised for the case where are only a
5953              * small number of accept states, i.e. 0,1, or maybe 2.
5954              * With lots of accepts states, and having to try all of them,
5955              * it becomes quadratic on number of accept states to find all
5956              * the next words.
5957              */
5958
5959             {
5960                 /* what type of TRIE am I? (utf8 makes this contextual) */
5961                 DECL_TRIE_TYPE(scan);
5962
5963                 /* what trie are we using right now */
5964                 reg_trie_data * const trie
5965                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5966                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5967                 U32 state = trie->startstate;
5968
5969                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5970                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5971                     if (utf8_target
5972                         && ! NEXTCHR_IS_EOS
5973                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5974                         && scan->flags == EXACTL)
5975                     {
5976                         /* We only output for EXACTL, as we let the folder
5977                          * output this message for EXACTFLU8 to avoid
5978                          * duplication */
5979                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5980                                                                reginfo->strend);
5981                     }
5982                 }
5983                 if (   trie->bitmap
5984                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5985                 {
5986                     if (trie->states[ state ].wordnum) {
5987                          DEBUG_EXECUTE_r(
5988                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5989                                           depth, PL_colors[4], PL_colors[5])
5990                         );
5991                         if (!trie->jump)
5992                             break;
5993                     } else {
5994                         DEBUG_EXECUTE_r(
5995                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5996                                           depth, PL_colors[4], PL_colors[5])
5997                         );
5998                         sayNO_SILENT;
5999                    }
6000                 }
6001
6002             {
6003                 U8 *uc = ( U8* )locinput;
6004
6005                 STRLEN len = 0;
6006                 STRLEN foldlen = 0;
6007                 U8 *uscan = (U8*)NULL;
6008                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6009                 U32 charcount = 0; /* how many input chars we have matched */
6010                 U32 accepted = 0; /* have we seen any accepting states? */
6011
6012                 ST.jump = trie->jump;
6013                 ST.me = scan;
6014                 ST.firstpos = NULL;
6015                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6016                 ST.nextword = 0;
6017
6018                 /* fully traverse the TRIE; note the position of the
6019                    shortest accept state and the wordnum of the longest
6020                    accept state */
6021
6022                 while ( state && uc <= (U8*)(reginfo->strend) ) {
6023                     U32 base = trie->states[ state ].trans.base;
6024                     UV uvc = 0;
6025                     U16 charid = 0;
6026                     U16 wordnum;
6027                     wordnum = trie->states[ state ].wordnum;
6028
6029                     if (wordnum) { /* it's an accept state */
6030                         if (!accepted) {
6031                             accepted = 1;
6032                             /* record first match position */
6033                             if (ST.longfold) {
6034                                 ST.firstpos = (U8*)locinput;
6035                                 ST.firstchars = 0;
6036                             }
6037                             else {
6038                                 ST.firstpos = uc;
6039                                 ST.firstchars = charcount;
6040                             }
6041                         }
6042                         if (!ST.nextword || wordnum < ST.nextword)
6043                             ST.nextword = wordnum;
6044                         ST.topword = wordnum;
6045                     }
6046
6047                     DEBUG_TRIE_EXECUTE_r({
6048                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6049                                 /* HERE */
6050                                 PerlIO_printf( Perl_debug_log,
6051                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6052                                     INDENT_CHARS(depth), "", PL_colors[4],
6053                                     (UV)state, (accepted ? 'Y' : 'N'));
6054                     });
6055
6056                     /* read a char and goto next state */
6057                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6058                         I32 offset;
6059                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6060                                              (U8 *) reginfo->strend, uscan,
6061                                              len, uvc, charid, foldlen,
6062                                              foldbuf, uniflags);
6063                         charcount++;
6064                         if (foldlen>0)
6065                             ST.longfold = TRUE;
6066                         if (charid &&
6067                              ( ((offset =
6068                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6069
6070                              && ((U32)offset < trie->lasttrans)
6071                              && trie->trans[offset].check == state)
6072                         {
6073                             state = trie->trans[offset].next;
6074                         }
6075                         else {
6076                             state = 0;
6077                         }
6078                         uc += len;
6079
6080                     }
6081                     else {
6082                         state = 0;
6083                     }
6084                     DEBUG_TRIE_EXECUTE_r(
6085                         Perl_re_printf( aTHX_
6086                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6087                             charid, uvc, (UV)state, PL_colors[5] );
6088                     );
6089                 }
6090                 if (!accepted)
6091                    sayNO;
6092
6093                 /* calculate total number of accept states */
6094                 {
6095                     U16 w = ST.topword;
6096                     accepted = 0;
6097                     while (w) {
6098                         w = trie->wordinfo[w].prev;
6099                         accepted++;
6100                     }
6101                     ST.accepted = accepted;
6102                 }
6103
6104                 DEBUG_EXECUTE_r(
6105                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6106                         depth,
6107                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6108                 );
6109                 goto trie_first_try; /* jump into the fail handler */
6110             }}
6111             NOT_REACHED; /* NOTREACHED */
6112
6113         case TRIE_next_fail: /* we failed - try next alternative */
6114         {
6115             U8 *uc;
6116             if ( ST.jump ) {
6117                 /* undo any captures done in the tail part of a branch,
6118                  * e.g.
6119                  *    /(?:X(.)(.)|Y(.)).../
6120                  * where the trie just matches X then calls out to do the
6121                  * rest of the branch */
6122                 REGCP_UNWIND(ST.cp);
6123                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6124             }
6125             if (!--ST.accepted) {
6126                 DEBUG_EXECUTE_r({
6127                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6128                         depth,
6129                         PL_colors[4],
6130                         PL_colors[5] );
6131                 });
6132                 sayNO_SILENT;
6133             }
6134             {
6135                 /* Find next-highest word to process.  Note that this code
6136                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6137                 U16 min = 0;
6138                 U16 word;
6139                 U16 const nextword = ST.nextword;
6140                 reg_trie_wordinfo * const wordinfo
6141                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6142                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6143                     if (word > nextword && (!min || word < min))
6144                         min = word;
6145                 }
6146                 ST.nextword = min;
6147             }
6148
6149           trie_first_try:
6150             if (do_cutgroup) {
6151                 do_cutgroup = 0;
6152                 no_final = 0;
6153             }
6154
6155             if ( ST.jump ) {
6156                 ST.lastparen = rex->lastparen;
6157                 ST.lastcloseparen = rex->lastcloseparen;
6158                 REGCP_SET(ST.cp);
6159             }
6160
6161             /* find start char of end of current word */
6162             {
6163                 U32 chars; /* how many chars to skip */
6164                 reg_trie_data * const trie
6165                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6166
6167                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6168                             >=  ST.firstchars);
6169                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6170                             - ST.firstchars;
6171                 uc = ST.firstpos;
6172
6173                 if (ST.longfold) {
6174                     /* the hard option - fold each char in turn and find
6175                      * its folded length (which may be different */
6176                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6177                     STRLEN foldlen;
6178                     STRLEN len;
6179                     UV uvc;
6180                     U8 *uscan;
6181
6182                     while (chars) {
6183                         if (utf8_target) {
6184                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6185                                                     uniflags);
6186                             uc += len;
6187                         }
6188                         else {
6189                             uvc = *uc;
6190                             uc++;
6191                         }
6192                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6193                         uscan = foldbuf;
6194                         while (foldlen) {
6195                             if (!--chars)
6196                                 break;
6197                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6198                                                  uniflags);
6199                             uscan += len;
6200                             foldlen -= len;
6201                         }
6202                     }
6203                 }
6204                 else {
6205                     if (utf8_target)
6206                         while (chars--)
6207                             uc += UTF8SKIP(uc);
6208                     else
6209                         uc += chars;
6210                 }
6211             }
6212
6213             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6214                             ? ST.jump[ST.nextword]
6215                             : NEXT_OFF(ST.me));
6216
6217             DEBUG_EXECUTE_r({
6218                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6219                     depth,
6220                     PL_colors[4],
6221                     ST.nextword,
6222                     PL_colors[5]
6223                     );
6224             });
6225
6226             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6227                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6228                 NOT_REACHED; /* NOTREACHED */
6229             }
6230             /* only one choice left - just continue */
6231             DEBUG_EXECUTE_r({
6232                 AV *const trie_words
6233                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6234                 SV ** const tmp = trie_words
6235                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6236                 SV *sv= tmp ? sv_newmortal() : NULL;
6237
6238                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6239                     depth, PL_colors[4],
6240                     ST.nextword,
6241                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6242                             PL_colors[0], PL_colors[1],
6243                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6244                         )
6245                     : "not compiled under -Dr",
6246                     PL_colors[5] );
6247             });
6248
6249             locinput = (char*)uc;
6250             continue; /* execute rest of RE */
6251             /* NOTREACHED */
6252         }
6253 #undef  ST
6254
6255         case EXACTL:             /*  /abc/l       */
6256             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6257
6258             /* Complete checking would involve going through every character
6259              * matched by the string to see if any is above latin1.  But the
6260              * comparision otherwise might very well be a fast assembly
6261              * language routine, and I (khw) don't think slowing things down
6262              * just to check for this warning is worth it.  So this just checks
6263              * the first character */
6264             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6265                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6266             }
6267             /* FALLTHROUGH */
6268         case EXACT: {            /*  /abc/        */
6269             char *s = STRING(scan);
6270             ln = STR_LEN(scan);
6271             if (utf8_target != is_utf8_pat) {
6272                 /* The target and the pattern have differing utf8ness. */
6273                 char *l = locinput;
6274                 const char * const e = s + ln;
6275
6276                 if (utf8_target) {
6277                     /* The target is utf8, the pattern is not utf8.
6278                      * Above-Latin1 code points can't match the pattern;
6279                      * invariants match exactly, and the other Latin1 ones need
6280                      * to be downgraded to a single byte in order to do the
6281                      * comparison.  (If we could be confident that the target
6282                      * is not malformed, this could be refactored to have fewer
6283                      * tests by just assuming that if the first bytes match, it
6284                      * is an invariant, but there are tests in the test suite
6285                      * dealing with (??{...}) which violate this) */
6286                     while (s < e) {
6287                         if (l >= reginfo->strend
6288                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6289                         {
6290                             sayNO;
6291                         }
6292                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6293                             if (*l != *s) {
6294                                 sayNO;
6295                             }
6296                             l++;
6297                         }
6298                         else {
6299                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6300                             {
6301                                 sayNO;
6302                             }
6303                             l += 2;
6304                         }
6305                         s++;
6306                     }
6307                 }
6308                 else {
6309                     /* The target is not utf8, the pattern is utf8. */
6310                     while (s < e) {
6311                         if (l >= reginfo->strend
6312                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6313                         {
6314                             sayNO;
6315                         }
6316                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6317                             if (*s != *l) {
6318                                 sayNO;
6319                             }
6320                             s++;
6321                         }
6322                         else {
6323                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6324                             {
6325                                 sayNO;
6326                             }
6327                             s += 2;
6328                         }
6329                         l++;
6330                     }
6331                 }
6332                 locinput = l;
6333             }
6334             else {
6335                 /* The target and the pattern have the same utf8ness. */
6336                 /* Inline the first character, for speed. */
6337                 if (reginfo->strend - locinput < ln
6338                     || UCHARAT(s) != nextchr
6339                     || (ln > 1 && memNE(s, locinput, ln)))
6340                 {
6341                     sayNO;
6342                 }
6343                 locinput += ln;
6344             }
6345             break;
6346             }
6347
6348         case EXACTFL: {          /*  /abc/il      */
6349             re_fold_t folder;
6350             const U8 * fold_array;
6351             const char * s;
6352             U32 fold_utf8_flags;
6353
6354             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6355             folder = foldEQ_locale;
6356             fold_array = PL_fold_locale;
6357             fold_utf8_flags = FOLDEQ_LOCALE;
6358             goto do_exactf;
6359
6360         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6361                                       is effectively /u; hence to match, target
6362                                       must be UTF-8. */
6363             if (! utf8_target) {
6364                 sayNO;
6365             }
6366             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6367                                              | FOLDEQ_S1_FOLDS_SANE;
6368             folder = foldEQ_latin1;
6369             fold_array = PL_fold_latin1;
6370             goto do_exactf;
6371
6372         case EXACTFU_SS:         /*  /\x{df}/iu   */
6373         case EXACTFU:            /*  /abc/iu      */
6374             folder = foldEQ_latin1;
6375             fold_array = PL_fold_latin1;
6376             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6377             goto do_exactf;
6378
6379         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6380                                    patterns */
6381             assert(! is_utf8_pat);
6382             /* FALLTHROUGH */
6383         case EXACTFAA:            /*  /abc/iaa     */
6384             folder = foldEQ_latin1;
6385             fold_array = PL_fold_latin1;
6386             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6387             goto do_exactf;
6388
6389         case EXACTF:             /*  /abc/i    This node only generated for
6390                                                non-utf8 patterns */
6391             assert(! is_utf8_pat);
6392             folder = foldEQ;
6393             fold_array = PL_fold;
6394             fold_utf8_flags = 0;
6395
6396           do_exactf:
6397             s = STRING(scan);
6398             ln = STR_LEN(scan);
6399
6400             if (utf8_target
6401                 || is_utf8_pat
6402                 || state_num == EXACTFU_SS
6403                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6404             {
6405               /* Either target or the pattern are utf8, or has the issue where
6406                * the fold lengths may differ. */
6407                 const char * const l = locinput;
6408                 char *e = reginfo->strend;
6409
6410                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6411                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6412                 {
6413                     sayNO;
6414                 }
6415                 locinput = e;
6416                 break;
6417             }
6418
6419             /* Neither the target nor the pattern are utf8 */
6420             if (UCHARAT(s) != nextchr
6421                 && !NEXTCHR_IS_EOS
6422                 && UCHARAT(s) != fold_array[nextchr])
6423             {
6424                 sayNO;
6425             }
6426             if (reginfo->strend - locinput < ln)
6427                 sayNO;
6428             if (ln > 1 && ! folder(s, locinput, ln))
6429                 sayNO;
6430             locinput += ln;
6431             break;
6432         }
6433
6434         case NBOUNDL: /*  /\B/l  */
6435             to_complement = 1;
6436             /* FALLTHROUGH */
6437
6438         case BOUNDL:  /*  /\b/l  */
6439         {
6440             bool b1, b2;
6441             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6442
6443             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6444                 if (! IN_UTF8_CTYPE_LOCALE) {
6445                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6446                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6447                 }
6448                 goto boundu;
6449             }
6450
6451             if (utf8_target) {
6452                 if (locinput == reginfo->strbeg)
6453                     b1 = isWORDCHAR_LC('\n');
6454                 else {
6455                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6456                                                         (U8*)(reginfo->strbeg)),
6457                                                  (U8*)(reginfo->strend));
6458                 }
6459                 b2 = (NEXTCHR_IS_EOS)
6460                     ? isWORDCHAR_LC('\n')
6461                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6462                                               (U8*) reginfo->strend);
6463             }
6464             else { /* Here the string isn't utf8 */
6465                 b1 = (locinput == reginfo->strbeg)
6466                      ? isWORDCHAR_LC('\n')
6467                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6468                 b2 = (NEXTCHR_IS_EOS)
6469                     ? isWORDCHAR_LC('\n')
6470                     : isWORDCHAR_LC(nextchr);
6471             }
6472             if (to_complement ^ (b1 == b2)) {
6473                 sayNO;
6474             }
6475             break;
6476         }
6477
6478         case NBOUND:  /*  /\B/   */
6479             to_complement = 1;
6480             /* FALLTHROUGH */
6481
6482         case BOUND:   /*  /\b/   */
6483             if (utf8_target) {
6484                 goto bound_utf8;
6485             }
6486             goto bound_ascii_match_only;
6487
6488         case NBOUNDA: /*  /\B/a  */
6489             to_complement = 1;
6490             /* FALLTHROUGH */
6491
6492         case BOUNDA:  /*  /\b/a  */
6493         {
6494             bool b1, b2;
6495
6496           bound_ascii_match_only:
6497             /* Here the string isn't utf8, or is utf8 and only ascii characters
6498              * are to match \w.  In the latter case looking at the byte just
6499              * prior to the current one may be just the final byte of a
6500              * multi-byte character.  This is ok.  There are two cases:
6501              * 1) it is a single byte character, and then the test is doing
6502              *    just what it's supposed to.
6503              * 2) it is a multi-byte character, in which case the final byte is
6504              *    never mistakable for ASCII, and so the test will say it is
6505              *    not a word character, which is the correct answer. */
6506             b1 = (locinput == reginfo->strbeg)
6507                  ? isWORDCHAR_A('\n')
6508                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6509             b2 = (NEXTCHR_IS_EOS)
6510                 ? isWORDCHAR_A('\n')
6511                 : isWORDCHAR_A(nextchr);
6512             if (to_complement ^ (b1 == b2)) {
6513                 sayNO;
6514             }
6515             break;
6516         }
6517
6518         case NBOUNDU: /*  /\B/u  */
6519             to_complement = 1;
6520             /* FALLTHROUGH */
6521
6522         case BOUNDU:  /*  /\b/u  */
6523
6524           boundu:
6525             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6526                 match = FALSE;
6527             }
6528             else if (utf8_target) {
6529               bound_utf8:
6530                 switch((bound_type) FLAGS(scan)) {
6531                     case TRADITIONAL_BOUND:
6532                     {
6533                         bool b1, b2;
6534                         b1 = (locinput == reginfo->strbeg)
6535                              ? 0 /* isWORDCHAR_L1('\n') */
6536                              : isWORDCHAR_utf8_safe(
6537                                                reghop3((U8*)locinput,
6538                                                        -1,
6539                                                        (U8*)(reginfo->strbeg)),
6540                                                     (U8*) reginfo->strend);
6541                         b2 = (NEXTCHR_IS_EOS)
6542                             ? 0 /* isWORDCHAR_L1('\n') */
6543                             : isWORDCHAR_utf8_safe((U8*)locinput,
6544                                                    (U8*) reginfo->strend);
6545                         match = cBOOL(b1 != b2);
6546                         break;
6547                     }
6548                     case GCB_BOUND:
6549                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6550                             match = TRUE; /* GCB always matches at begin and
6551                                              end */
6552                         }
6553                         else {
6554                             /* Find the gcb values of previous and current
6555                              * chars, then see if is a break point */
6556                             match = isGCB(getGCB_VAL_UTF8(
6557                                                 reghop3((U8*)locinput,
6558                                                         -1,
6559                                                         (U8*)(reginfo->strbeg)),
6560                                                 (U8*) reginfo->strend),
6561                                           getGCB_VAL_UTF8((U8*) locinput,
6562                                                         (U8*) reginfo->strend),
6563                                           (U8*) reginfo->strbeg,
6564                                           (U8*) locinput,
6565                                           utf8_target);
6566                         }
6567                         break;
6568
6569                     case LB_BOUND:
6570                         if (locinput == reginfo->strbeg) {
6571                             match = FALSE;
6572                         }
6573                         else if (NEXTCHR_IS_EOS) {
6574                             match = TRUE;
6575                         }
6576                         else {
6577                             match = isLB(getLB_VAL_UTF8(
6578                                                 reghop3((U8*)locinput,
6579                                                         -1,
6580                                                         (U8*)(reginfo->strbeg)),
6581                                                 (U8*) reginfo->strend),
6582                                           getLB_VAL_UTF8((U8*) locinput,
6583                                                         (U8*) reginfo->strend),
6584                                           (U8*) reginfo->strbeg,
6585                                           (U8*) locinput,
6586                                           (U8*) reginfo->strend,
6587                                           utf8_target);
6588                         }
6589                         break;
6590
6591                     case SB_BOUND: /* Always matches at begin and end */
6592                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6593                             match = TRUE;
6594                         }
6595                         else {
6596                             match = isSB(getSB_VAL_UTF8(
6597                                                 reghop3((U8*)locinput,
6598                                                         -1,
6599                                                         (U8*)(reginfo->strbeg)),
6600                                                 (U8*) reginfo->strend),
6601                                           getSB_VAL_UTF8((U8*) locinput,
6602                                                         (U8*) reginfo->strend),
6603                                           (U8*) reginfo->strbeg,
6604                                           (U8*) locinput,
6605                                           (U8*) reginfo->strend,
6606                                           utf8_target);
6607                         }
6608                         break;
6609
6610                     case WB_BOUND:
6611                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6612                             match = TRUE;
6613                         }
6614                         else {
6615                             match = isWB(WB_UNKNOWN,
6616                                          getWB_VAL_UTF8(
6617                                                 reghop3((U8*)locinput,
6618                                                         -1,
6619                                                         (U8*)(reginfo->strbeg)),
6620                                                 (U8*) reginfo->strend),
6621                                           getWB_VAL_UTF8((U8*) locinput,
6622                                                         (U8*) reginfo->strend),
6623                                           (U8*) reginfo->strbeg,
6624                                           (U8*) locinput,
6625                                           (U8*) reginfo->strend,
6626                                           utf8_target);
6627                         }
6628                         break;
6629                 }
6630             }
6631             else {  /* Not utf8 target */
6632                 switch((bound_type) FLAGS(scan)) {
6633                     case TRADITIONAL_BOUND:
6634                     {
6635                         bool b1, b2;
6636                         b1 = (locinput == reginfo->strbeg)
6637                             ? 0 /* isWORDCHAR_L1('\n') */
6638                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6639                         b2 = (NEXTCHR_IS_EOS)
6640                             ? 0 /* isWORDCHAR_L1('\n') */
6641                             : isWORDCHAR_L1(nextchr);
6642                         match = cBOOL(b1 != b2);
6643                         break;
6644                     }
6645
6646                     case GCB_BOUND:
6647                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6648                             match = TRUE; /* GCB always matches at begin and
6649                                              end */
6650                         }
6651                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6652                                    range */
6653                             match =    UCHARAT(locinput - 1) != '\r'
6654                                     || UCHARAT(locinput) != '\n';
6655                         }
6656                         break;
6657
6658                     case LB_BOUND:
6659                         if (locinput == reginfo->strbeg) {
6660                             match = FALSE;
6661                         }
6662                         else if (NEXTCHR_IS_EOS) {
6663                             match = TRUE;
6664                         }
6665                         else {
6666                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6667                                          getLB_VAL_CP(UCHARAT(locinput)),
6668                                          (U8*) reginfo->strbeg,
6669                                          (U8*) locinput,
6670                                          (U8*) reginfo->strend,
6671                                          utf8_target);
6672                         }
6673                         break;
6674
6675                     case SB_BOUND: /* Always matches at begin and end */
6676                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6677                             match = TRUE;
6678                         }
6679                         else {
6680                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6681                                          getSB_VAL_CP(UCHARAT(locinput)),
6682                                          (U8*) reginfo->strbeg,
6683                                          (U8*) locinput,
6684                                          (U8*) reginfo->strend,
6685                                          utf8_target);
6686                         }
6687                         break;
6688
6689                     case WB_BOUND:
6690                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6691                             match = TRUE;
6692                         }
6693                         else {
6694                             match = isWB(WB_UNKNOWN,
6695                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6696                                          getWB_VAL_CP(UCHARAT(locinput)),
6697                                          (U8*) reginfo->strbeg,
6698                                          (U8*) locinput,
6699                                          (U8*) reginfo->strend,
6700                                          utf8_target);
6701                         }
6702                         break;
6703                 }
6704             }
6705
6706             if (to_complement ^ ! match) {
6707                 sayNO;
6708             }
6709             break;
6710
6711         case ANYOFPOSIXL:
6712         case ANYOFL:  /*  /[abc]/l      */
6713             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6714
6715             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6716             {
6717               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6718             }
6719             /* FALLTHROUGH */
6720         case ANYOFD:  /*   /[abc]/d       */
6721         case ANYOF:  /*   /[abc]/       */
6722             if (NEXTCHR_IS_EOS)
6723                 sayNO;
6724             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6725                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6726                                                                    utf8_target))
6727                     sayNO;
6728                 locinput += UTF8SKIP(locinput);
6729             }
6730             else {
6731                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6732                     sayNO;
6733                 locinput++;
6734             }
6735             break;
6736
6737         case ANYOFM:
6738             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
6739                 sayNO;
6740             }
6741             locinput++;
6742             break;
6743
6744         case ASCII:
6745             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6746                 sayNO;
6747             }
6748
6749             locinput++;     /* ASCII is always single byte */
6750             break;
6751
6752         case NASCII:
6753             if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6754                 sayNO;
6755             }
6756
6757             goto increment_locinput;
6758             break;
6759
6760         /* The argument (FLAGS) to all the POSIX node types is the class number
6761          * */
6762
6763         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6764             to_complement = 1;
6765             /* FALLTHROUGH */
6766
6767         case POSIXL:    /* \w or [:punct:] etc. under /l */
6768             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6769             if (NEXTCHR_IS_EOS)
6770                 sayNO;
6771
6772             /* Use isFOO_lc() for characters within Latin1.  (Note that
6773              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6774              * wouldn't be invariant) */
6775             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6776                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6777                     sayNO;
6778                 }
6779
6780                 locinput++;
6781                 break;
6782             }
6783
6784             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6785                 /* An above Latin-1 code point, or malformed */
6786                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6787                                                        reginfo->strend);
6788                 goto utf8_posix_above_latin1;
6789             }
6790
6791             /* Here is a UTF-8 variant code point below 256 and the target is
6792              * UTF-8 */
6793             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6794                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6795                                             *(locinput + 1))))))
6796             {
6797                 sayNO;
6798             }
6799
6800             goto increment_locinput;
6801
6802         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6803             to_complement = 1;
6804             /* FALLTHROUGH */
6805
6806         case POSIXD:    /* \w or [:punct:] etc. under /d */
6807             if (utf8_target) {
6808                 goto utf8_posix;
6809             }
6810             goto posixa;
6811
6812         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6813
6814             if (NEXTCHR_IS_EOS) {
6815                 sayNO;
6816             }
6817
6818             /* All UTF-8 variants match */
6819             if (! UTF8_IS_INVARIANT(nextchr)) {
6820                 goto increment_locinput;
6821             }
6822
6823             to_complement = 1;
6824             goto join_nposixa;
6825
6826         case POSIXA:    /* \w or [:punct:] etc. under /a */
6827
6828           posixa:
6829             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6830              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6831              * character is a single byte */
6832
6833             if (NEXTCHR_IS_EOS) {
6834                 sayNO;
6835             }
6836
6837           join_nposixa:
6838
6839             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6840                                                                 FLAGS(scan)))))
6841             {
6842                 sayNO;
6843             }
6844
6845             /* Here we are either not in utf8, or we matched a utf8-invariant,
6846              * so the next char is the next byte */
6847             locinput++;
6848             break;
6849
6850         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6851             to_complement = 1;
6852             /* FALLTHROUGH */
6853
6854         case POSIXU:    /* \w or [:punct:] etc. under /u */
6855           utf8_posix:
6856             if (NEXTCHR_IS_EOS) {
6857                 sayNO;
6858             }
6859
6860             /* Use _generic_isCC() for characters within Latin1.  (Note that
6861              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6862              * wouldn't be invariant) */
6863             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6864                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6865                                                            FLAGS(scan)))))
6866                 {
6867                     sayNO;
6868                 }
6869                 locinput++;
6870             }
6871             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6872                 if (! (to_complement
6873                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6874                                                                *(locinput + 1)),
6875                                              FLAGS(scan)))))
6876                 {
6877                     sayNO;
6878                 }
6879                 locinput += 2;
6880             }
6881             else {  /* Handle above Latin-1 code points */
6882               utf8_posix_above_latin1:
6883                 classnum = (_char_class_number) FLAGS(scan);
6884                 switch (classnum) {
6885                     default:
6886                         if (! (to_complement
6887                            ^ cBOOL(_invlist_contains_cp(
6888                                       PL_XPosix_ptrs[classnum],
6889                                       utf8_to_uvchr_buf((U8 *) locinput,
6890                                                         (U8 *) reginfo->strend,
6891                                                         NULL)))))
6892                         {
6893                             sayNO;
6894                         }
6895                         break;
6896                     case _CC_ENUM_SPACE:
6897                         if (! (to_complement
6898                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6899                         {
6900                             sayNO;
6901                         }
6902                         break;
6903                     case _CC_ENUM_BLANK:
6904                         if (! (to_complement
6905                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6906                         {
6907                             sayNO;
6908                         }
6909                         break;
6910                     case _CC_ENUM_XDIGIT:
6911                         if (! (to_complement
6912                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6913                         {
6914                             sayNO;
6915                         }
6916                         break;
6917                     case _CC_ENUM_VERTSPACE:
6918                         if (! (to_complement
6919                                         ^ cBOOL(is_VERTWS_high(locinput))))
6920                         {
6921                             sayNO;
6922                         }
6923                         break;
6924                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6925                     case _CC_ENUM_ASCII:
6926                         if (! to_complement) {
6927                             sayNO;
6928                         }
6929                         break;
6930                 }
6931                 locinput += UTF8SKIP(locinput);
6932             }
6933             break;
6934
6935         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6936                        a Unicode extended Grapheme Cluster */
6937             if (NEXTCHR_IS_EOS)
6938                 sayNO;
6939             if  (! utf8_target) {
6940
6941                 /* Match either CR LF  or '.', as all the other possibilities
6942                  * require utf8 */
6943                 locinput++;         /* Match the . or CR */
6944                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6945                                        match the LF */
6946                     && locinput < reginfo->strend
6947                     && UCHARAT(locinput) == '\n')
6948                 {
6949                     locinput++;
6950                 }
6951             }
6952             else {
6953
6954                 /* Get the gcb type for the current character */
6955                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6956                                                        (U8*) reginfo->strend);
6957
6958                 /* Then scan through the input until we get to the first
6959                  * character whose type is supposed to be a gcb with the
6960                  * current character.  (There is always a break at the
6961                  * end-of-input) */
6962                 locinput += UTF8SKIP(locinput);
6963                 while (locinput < reginfo->strend) {
6964                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6965                                                          (U8*) reginfo->strend);
6966                     if (isGCB(prev_gcb, cur_gcb,
6967                               (U8*) reginfo->strbeg, (U8*) locinput,
6968                               utf8_target))
6969                     {
6970                         break;
6971                     }
6972
6973                     prev_gcb = cur_gcb;
6974                     locinput += UTF8SKIP(locinput);
6975                 }
6976
6977
6978             }
6979             break;
6980
6981         case NREFFL:  /*  /\g{name}/il  */
6982         {   /* The capture buffer cases.  The ones beginning with N for the
6983                named buffers just convert to the equivalent numbered and
6984                pretend they were called as the corresponding numbered buffer
6985                op.  */
6986             /* don't initialize these in the declaration, it makes C++
6987                unhappy */
6988             const char *s;
6989             char type;
6990             re_fold_t folder;
6991             const U8 *fold_array;
6992             UV utf8_fold_flags;
6993
6994             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6995             folder = foldEQ_locale;
6996             fold_array = PL_fold_locale;
6997             type = REFFL;
6998             utf8_fold_flags = FOLDEQ_LOCALE;
6999             goto do_nref;
7000
7001         case NREFFA:  /*  /\g{name}/iaa  */
7002             folder = foldEQ_latin1;
7003             fold_array = PL_fold_latin1;
7004             type = REFFA;
7005             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7006             goto do_nref;
7007
7008         case NREFFU:  /*  /\g{name}/iu  */
7009             folder = foldEQ_latin1;
7010             fold_array = PL_fold_latin1;
7011             type = REFFU;
7012             utf8_fold_flags = 0;
7013             goto do_nref;
7014
7015         case NREFF:  /*  /\g{name}/i  */
7016             folder = foldEQ;
7017             fold_array = PL_fold;
7018             type = REFF;
7019             utf8_fold_flags = 0;
7020             goto do_nref;
7021
7022         case NREF:  /*  /\g{name}/   */
7023             type = REF;
7024             folder = NULL;
7025             fold_array = NULL;
7026             utf8_fold_flags = 0;
7027           do_nref:
7028
7029             /* For the named back references, find the corresponding buffer
7030              * number */
7031             n = reg_check_named_buff_matched(rex,scan);
7032
7033             if ( ! n ) {
7034                 sayNO;
7035             }
7036             goto do_nref_ref_common;
7037
7038         case REFFL:  /*  /\1/il  */
7039             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7040             folder = foldEQ_locale;
7041             fold_array = PL_fold_locale;
7042             utf8_fold_flags = FOLDEQ_LOCALE;
7043             goto do_ref;
7044
7045         case REFFA:  /*  /\1/iaa  */
7046             folder = foldEQ_latin1;
7047             fold_array = PL_fold_latin1;
7048             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7049             goto do_ref;
7050
7051         case REFFU:  /*  /\1/iu  */
7052             folder = foldEQ_latin1;
7053             fold_array = PL_fold_latin1;
7054             utf8_fold_flags = 0;
7055             goto do_ref;
7056
7057         case REFF:  /*  /\1/i  */
7058             folder = foldEQ;
7059             fold_array = PL_fold;
7060             utf8_fold_flags = 0;
7061             goto do_ref;
7062
7063         case REF:  /*  /\1/    */
7064             folder = NULL;
7065             fold_array = NULL;
7066             utf8_fold_flags = 0;
7067
7068           do_ref:
7069             type = OP(scan);
7070             n = ARG(scan);  /* which paren pair */
7071
7072           do_nref_ref_common:
7073             ln = rex->offs[n].start;
7074             endref = rex->offs[n].end;
7075             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7076             if (rex->lastparen < n || ln == -1 || endref == -1)
7077                 sayNO;                  /* Do not match unless seen CLOSEn. */
7078             if (ln == endref)
7079                 break;
7080
7081             s = reginfo->strbeg + ln;
7082             if (type != REF     /* REF can do byte comparison */
7083                 && (utf8_target || type == REFFU || type == REFFL))
7084             {
7085                 char * limit = reginfo->strend;
7086
7087                 /* This call case insensitively compares the entire buffer
7088                     * at s, with the current input starting at locinput, but
7089                     * not going off the end given by reginfo->strend, and
7090                     * returns in <limit> upon success, how much of the
7091                     * current input was matched */
7092                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7093                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7094                 {
7095                     sayNO;
7096                 }
7097                 locinput = limit;
7098                 break;
7099             }
7100
7101             /* Not utf8:  Inline the first character, for speed. */
7102             if (!NEXTCHR_IS_EOS &&
7103                 UCHARAT(s) != nextchr &&
7104                 (type == REF ||
7105                  UCHARAT(s) != fold_array[nextchr]))
7106                 sayNO;
7107             ln = endref - ln;
7108             if (locinput + ln > reginfo->strend)
7109                 sayNO;
7110             if (ln > 1 && (type == REF
7111                            ? memNE(s, locinput, ln)
7112                            : ! folder(s, locinput, ln)))
7113                 sayNO;
7114             locinput += ln;
7115             break;
7116         }
7117
7118         case NOTHING: /* null op; e.g. the 'nothing' following
7119                        * the '*' in m{(a+|b)*}' */
7120             break;
7121         case TAIL: /* placeholder while compiling (A|B|C) */
7122             break;
7123
7124 #undef  ST
7125 #define ST st->u.eval
7126 #define CUR_EVAL cur_eval->u.eval
7127
7128         {
7129             SV *ret;
7130             REGEXP *re_sv;
7131             regexp *re;
7132             regexp_internal *rei;
7133             regnode *startpoint;
7134             U32 arg;
7135
7136         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7137             arg= (U32)ARG(scan);
7138             if (cur_eval && cur_eval->locinput == locinput) {
7139                 if ( ++nochange_depth > max_nochange_depth )
7140                     Perl_croak(aTHX_
7141                         "Pattern subroutine nesting without pos change"
7142                         " exceeded limit in regex");
7143             } else {
7144                 nochange_depth = 0;
7145             }
7146             re_sv = rex_sv;
7147             re = rex;
7148             rei = rexi;
7149             startpoint = scan + ARG2L(scan);
7150             EVAL_CLOSE_PAREN_SET( st, arg );
7151             /* Detect infinite recursion
7152              *
7153              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7154              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7155              * So we track the position in the string we are at each time
7156              * we recurse and if we try to enter the same routine twice from
7157              * the same position we throw an error.
7158              */
7159             if ( rex->recurse_locinput[arg] == locinput ) {
7160                 /* FIXME: we should show the regop that is failing as part
7161                  * of the error message. */
7162                 Perl_croak(aTHX_ "Infinite recursion in regex");
7163             } else {
7164                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7165                 rex->recurse_locinput[arg]= locinput;
7166
7167                 DEBUG_r({
7168                     GET_RE_DEBUG_FLAGS_DECL;
7169                     DEBUG_STACK_r({
7170                         Perl_re_exec_indentf( aTHX_
7171                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7172                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7173                         );
7174                     });
7175                 });
7176             }
7177
7178             /* Save all the positions seen so far. */
7179             ST.cp = regcppush(rex, 0, maxopenparen);
7180             REGCP_SET(ST.lastcp);
7181
7182             /* and then jump to the code we share with EVAL */
7183             goto eval_recurse_doit;
7184             /* NOTREACHED */
7185
7186         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7187             if (cur_eval && cur_eval->locinput==locinput) {
7188                 if ( ++nochange_depth > max_nochange_depth )
7189                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7190             } else {
7191                 nochange_depth = 0;
7192             }
7193             {
7194                 /* execute the code in the {...} */
7195
7196                 dSP;
7197                 IV before;
7198                 OP * const oop = PL_op;
7199                 COP * const ocurcop = PL_curcop;
7200                 OP *nop;
7201                 CV *newcv;
7202
7203                 /* save *all* paren positions */
7204                 regcppush(rex, 0, maxopenparen);
7205                 REGCP_SET(ST.lastcp);
7206
7207                 if (!caller_cv)
7208                     caller_cv = find_runcv(NULL);
7209
7210                 n = ARG(scan);
7211
7212                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7213                     newcv = (ReANY(
7214                                     (REGEXP*)(rexi->data->data[n])
7215                             ))->qr_anoncv;
7216                     nop = (OP*)rexi->data->data[n+1];
7217                 }
7218                 else if (rexi->data->what[n] == 'l') { /* literal code */
7219                     newcv = caller_cv;
7220                     nop = (OP*)rexi->data->data[n];
7221                     assert(CvDEPTH(newcv));
7222                 }
7223                 else {
7224                     /* literal with own CV */
7225                     assert(rexi->data->what[n] == 'L');
7226                     newcv = rex->qr_anoncv;
7227                     nop = (OP*)rexi->data->data[n];
7228                 }
7229
7230                 /* Some notes about MULTICALL and the context and save stacks.
7231                  *
7232                  * In something like
7233                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7234                  * since codeblocks don't introduce a new scope (so that
7235                  * local() etc accumulate), at the end of a successful
7236                  * match there will be a SAVEt_CLEARSV on the savestack
7237                  * for each of $x, $y, $z. If the three code blocks above
7238                  * happen to have come from different CVs (e.g. via
7239                  * embedded qr//s), then we must ensure that during any
7240                  * savestack unwinding, PL_comppad always points to the
7241                  * right pad at each moment. We achieve this by
7242                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7243                  * there is a change of pad.
7244                  * In theory whenever we call a code block, we should
7245                  * push a CXt_SUB context, then pop it on return from
7246                  * that code block. This causes a bit of an issue in that
7247                  * normally popping a context also clears the savestack
7248                  * back to cx->blk_oldsaveix, but here we specifically
7249                  * don't want to clear the save stack on exit from the
7250                  * code block.
7251                  * Also for efficiency we don't want to keep pushing and
7252                  * popping the single SUB context as we backtrack etc.
7253                  * So instead, we push a single context the first time
7254                  * we need, it, then hang onto it until the end of this
7255                  * function. Whenever we encounter a new code block, we
7256                  * update the CV etc if that's changed. During the times
7257                  * in this function where we're not executing a code
7258                  * block, having the SUB context still there is a bit
7259                  * naughty - but we hope that no-one notices.
7260                  * When the SUB context is initially pushed, we fake up
7261                  * cx->blk_oldsaveix to be as if we'd pushed this context
7262                  * on first entry to S_regmatch rather than at some random
7263                  * point during the regexe execution. That way if we
7264                  * croak, popping the context stack will ensure that
7265                  * *everything* SAVEd by this function is undone and then
7266                  * the context popped, rather than e.g., popping the
7267                  * context (and restoring the original PL_comppad) then
7268                  * popping more of the savestack and restoring a bad
7269                  * PL_comppad.
7270                  */
7271
7272                 /* If this is the first EVAL, push a MULTICALL. On
7273                  * subsequent calls, if we're executing a different CV, or
7274                  * if PL_comppad has got messed up from backtracking
7275                  * through SAVECOMPPADs, then refresh the context.
7276                  */
7277                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7278                 {
7279                     U8 flags = (CXp_SUB_RE |
7280                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7281                     SAVECOMPPAD();
7282                     if (last_pushed_cv) {
7283                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7284                     }
7285                     else {
7286                         PUSH_MULTICALL_FLAGS(newcv, flags);
7287                     }
7288                     /* see notes above */
7289                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7290
7291                     last_pushed_cv = newcv;
7292                 }
7293                 else {
7294                     /* these assignments are just to silence compiler
7295                      * warnings */
7296                     multicall_cop = NULL;
7297                 }
7298                 last_pad = PL_comppad;
7299
7300                 /* the initial nextstate you would normally execute
7301                  * at the start of an eval (which would cause error
7302                  * messages to come from the eval), may be optimised
7303                  * away from the execution path in the regex code blocks;
7304                  * so manually set PL_curcop to it initially */
7305                 {
7306                     OP *o = cUNOPx(nop)->op_first;
7307                     assert(o->op_type == OP_NULL);
7308                     if (o->op_targ == OP_SCOPE) {
7309                         o = cUNOPo->op_first;
7310                     }
7311                     else {
7312                         assert(o->op_targ == OP_LEAVE);
7313                         o = cUNOPo->op_first;
7314                         assert(o->op_type == OP_ENTER);
7315                         o = OpSIBLING(o);
7316                     }
7317
7318                     if (o->op_type != OP_STUB) {
7319                         assert(    o->op_type == OP_NEXTSTATE
7320                                 || o->op_type == OP_DBSTATE
7321                                 || (o->op_type == OP_NULL
7322                                     &&  (  o->op_targ == OP_NEXTSTATE
7323                                         || o->op_targ == OP_DBSTATE
7324                                         )
7325                                     )
7326                         );
7327                         PL_curcop = (COP*)o;
7328                     }
7329                 }
7330                 nop = nop->op_next;
7331
7332                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7333                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7334
7335                 rex->offs[0].end = locinput - reginfo->strbeg;
7336                 if (reginfo->info_aux_eval->pos_magic)
7337                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7338                                   reginfo->sv, reginfo->strbeg,
7339                                   locinput - reginfo->strbeg);
7340
7341                 if (sv_yes_mark) {
7342                     SV *sv_mrk = get_sv("REGMARK", 1);
7343                     sv_setsv(sv_mrk, sv_yes_mark);
7344                 }
7345
7346                 /* we don't use MULTICALL here as we want to call the
7347                  * first op of the block of interest, rather than the
7348                  * first op of the sub. Also, we don't want to free
7349                  * the savestack frame */
7350                 before = (IV)(SP-PL_stack_base);
7351                 PL_op = nop;
7352                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7353                 SPAGAIN;
7354                 if ((IV)(SP-PL_stack_base) == before)
7355                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7356                 else {
7357                     ret = POPs;
7358                     PUTBACK;
7359                 }
7360
7361                 /* before restoring everything, evaluate the returned
7362                  * value, so that 'uninit' warnings don't use the wrong
7363                  * PL_op or pad. Also need to process any magic vars
7364                  * (e.g. $1) *before* parentheses are restored */
7365
7366                 PL_op = NULL;
7367
7368                 re_sv = NULL;
7369                 if (logical == 0)        /*   (?{})/   */
7370                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7371                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7372                     sw = cBOOL(SvTRUE_NN(ret));
7373                     logical = 0;
7374                 }
7375                 else {                   /*  /(??{})  */
7376                     /*  if its overloaded, let the regex compiler handle
7377                      *  it; otherwise extract regex, or stringify  */
7378                     if (SvGMAGICAL(ret))
7379                         ret = sv_mortalcopy(ret);
7380                     if (!SvAMAGIC(ret)) {
7381                         SV *sv = ret;
7382                         if (SvROK(sv))
7383                             sv = SvRV(sv);
7384                         if (SvTYPE(sv) == SVt_REGEXP)
7385                             re_sv = (REGEXP*) sv;
7386                         else if (SvSMAGICAL(ret)) {
7387                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7388                             if (mg)
7389                                 re_sv = (REGEXP *) mg->mg_obj;
7390                         }
7391
7392                         /* force any undef warnings here */
7393                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7394                             ret = sv_mortalcopy(ret);
7395                             (void) SvPV_force_nolen(ret);
7396                         }
7397                     }
7398
7399                 }
7400
7401                 /* *** Note that at this point we don't restore
7402                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7403                  * be used again soon. This is safe as long as nothing
7404                  * in the regexp code uses the pad ! */
7405                 PL_op = oop;
7406                 PL_curcop = ocurcop;
7407                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7408                 PL_curpm_under = PL_curpm;
7409                 PL_curpm = PL_reg_curpm;
7410
7411                 if (logical != 2) {
7412                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7413                     /* NOTREACHED */
7414                 }
7415             }
7416
7417                 /* only /(??{})/  from now on */
7418                 logical = 0;
7419                 {
7420                     /* extract RE object from returned value; compiling if
7421                      * necessary */
7422
7423                     if (re_sv) {
7424                         re_sv = reg_temp_copy(NULL, re_sv);
7425                     }
7426                     else {
7427                         U32 pm_flags = 0;
7428
7429                         if (SvUTF8(ret) && IN_BYTES) {
7430                             /* In use 'bytes': make a copy of the octet
7431                              * sequence, but without the flag on */
7432                             STRLEN len;
7433                             const char *const p = SvPV(ret, len);
7434                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7435                         }
7436                         if (rex->intflags & PREGf_USE_RE_EVAL)
7437                             pm_flags |= PMf_USE_RE_EVAL;
7438
7439                         /* if we got here, it should be an engine which
7440                          * supports compiling code blocks and stuff */
7441                         assert(rex->engine && rex->engine->op_comp);
7442                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7443                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7444                                     rex->engine, NULL, NULL,
7445                                     /* copy /msixn etc to inner pattern */
7446                                     ARG2L(scan),
7447                                     pm_flags);
7448
7449                         if (!(SvFLAGS(ret)
7450                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7451                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7452                             /* This isn't a first class regexp. Instead, it's
7453                                caching a regexp onto an existing, Perl visible
7454                                scalar.  */
7455                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7456                         }
7457                     }
7458                     SAVEFREESV(re_sv);
7459                     re = ReANY(re_sv);
7460                 }
7461                 RXp_MATCH_COPIED_off(re);
7462                 re->subbeg = rex->subbeg;
7463                 re->sublen = rex->sublen;
7464                 re->suboffset = rex->suboffset;
7465                 re->subcoffset = rex->subcoffset;
7466                 re->lastparen = 0;
7467                 re->lastcloseparen = 0;
7468                 rei = RXi_GET(re);
7469                 DEBUG_EXECUTE_r(
7470                     debug_start_match(re_sv, utf8_target, locinput,
7471                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7472                 );
7473                 startpoint = rei->program + 1;
7474                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7475                                              * close_paren only for GOSUB */
7476                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7477                 /* Save all the seen positions so far. */
7478                 ST.cp = regcppush(rex, 0, maxopenparen);
7479                 REGCP_SET(ST.lastcp);
7480                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7481                 maxopenparen = 0;
7482                 /* run the pattern returned from (??{...}) */
7483
7484               eval_recurse_doit: /* Share code with GOSUB below this line
7485                             * At this point we expect the stack context to be
7486                             * set up correctly */
7487
7488                 /* invalidate the S-L poscache. We're now executing a
7489                  * different set of WHILEM ops (and their associated
7490                  * indexes) against the same string, so the bits in the
7491                  * cache are meaningless. Setting maxiter to zero forces
7492                  * the cache to be invalidated and zeroed before reuse.
7493                  * XXX This is too dramatic a measure. Ideally we should
7494                  * save the old cache and restore when running the outer
7495                  * pattern again */
7496                 reginfo->poscache_maxiter = 0;
7497
7498                 /* the new regexp might have a different is_utf8_pat than we do */
7499                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7500
7501                 ST.prev_rex = rex_sv;
7502                 ST.prev_curlyx = cur_curlyx;
7503                 rex_sv = re_sv;
7504                 SET_reg_curpm(rex_sv);
7505                 rex = re;
7506                 rexi = rei;
7507                 cur_curlyx = NULL;
7508                 ST.B = next;
7509                 ST.prev_eval = cur_eval;
7510                 cur_eval = st;
7511                 /* now continue from first node in postoned RE */
7512                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7513                 NOT_REACHED; /* NOTREACHED */
7514         }
7515
7516         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7517             /* note: this is called twice; first after popping B, then A */
7518             DEBUG_STACK_r({
7519                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7520                     depth, cur_eval, ST.prev_eval);
7521             });
7522
7523 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7524             if ( cur_eval && CUR_EVAL.close_paren ) {\
7525                 DEBUG_STACK_r({ \
7526                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7527                         depth,    \
7528                         CUR_EVAL.close_paren - 1,\
7529                         cur_eval, \
7530                         VAL);     \
7531                 });               \
7532                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7533             }
7534
7535             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7536
7537             rex_sv = ST.prev_rex;
7538             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7539             SET_reg_curpm(rex_sv);
7540             rex = ReANY(rex_sv);
7541             rexi = RXi_GET(rex);
7542             {
7543                 /* preserve $^R across LEAVE's. See Bug 121070. */
7544                 SV *save_sv= GvSV(PL_replgv);
7545                 SvREFCNT_inc(save_sv);
7546                 regcpblow(ST.cp); /* LEAVE in disguise */
7547                 sv_setsv(GvSV(PL_replgv), save_sv);
7548                 SvREFCNT_dec(save_sv);
7549             }
7550             cur_eval = ST.prev_eval;
7551             cur_curlyx = ST.prev_curlyx;
7552
7553             /* Invalidate cache. See "invalidate" comment above. */
7554             reginfo->poscache_maxiter = 0;
7555             if ( nochange_depth )
7556                 nochange_depth--;
7557
7558             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7559             sayYES;
7560
7561
7562         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7563             REGCP_UNWIND(ST.lastcp);
7564             sayNO;
7565
7566         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7567             /* note: this is called twice; first after popping B, then A */
7568             DEBUG_STACK_r({
7569                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7570                     depth, cur_eval, ST.prev_eval);
7571             });
7572
7573             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7574
7575             rex_sv = ST.prev_rex;
7576             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7577             SET_reg_curpm(rex_sv);
7578             rex = ReANY(rex_sv);
7579             rexi = RXi_GET(rex);
7580
7581             REGCP_UNWIND(ST.lastcp);
7582             regcppop(rex, &maxopenparen);
7583             cur_eval = ST.prev_eval;
7584             cur_curlyx = ST.prev_curlyx;
7585
7586             /* Invalidate cache. See "invalidate" comment above. */
7587             reginfo->poscache_maxiter = 0;
7588             if ( nochange_depth )
7589                 nochange_depth--;
7590
7591             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7592             sayNO_SILENT;
7593 #undef ST
7594
7595         case OPEN: /*  (  */
7596             n = ARG(scan);  /* which paren pair */
7597             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7598             if (n > maxopenparen)
7599                 maxopenparen = n;
7600             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7601                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7602                 depth,
7603                 PTR2UV(rex),
7604                 PTR2UV(rex->offs),
7605                 (UV)n,
7606                 (IV)rex->offs[n].start_tmp,
7607                 (UV)maxopenparen
7608             ));
7609             lastopen = n;
7610             break;
7611
7612         case SROPEN: /*  (*SCRIPT_RUN:  */
7613             script_run_begin = (U8 *) locinput;
7614             break;
7615
7616
7617         case CLOSE:  /*  )  */
7618             n = ARG(scan);  /* which paren pair */
7619             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7620                              locinput - reginfo->strbeg);
7621             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7622                 goto fake_end;
7623
7624             break;
7625
7626         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7627
7628             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7629             {
7630                 sayNO;
7631             }
7632
7633             break;
7634
7635
7636         case ACCEPT:  /*  (*ACCEPT)  */
7637             if (scan->flags)
7638                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7639             if (ARG2L(scan)){
7640                 regnode *cursor;
7641                 for (cursor=scan;
7642                      cursor && OP(cursor)!=END;
7643                      cursor=regnext(cursor))
7644                 {
7645                     if ( OP(cursor)==CLOSE ){
7646                         n = ARG(cursor);
7647                         if ( n <= lastopen ) {
7648                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7649                                              locinput - reginfo->strbeg);
7650                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7651                                 break;
7652                         }
7653                     }
7654                 }
7655             }
7656             goto fake_end;
7657             /* NOTREACHED */
7658
7659         case GROUPP:  /*  (?(1))  */
7660             n = ARG(scan);  /* which paren pair */
7661             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7662             break;
7663
7664         case NGROUPP:  /*  (?(<name>))  */
7665             /* reg_check_named_buff_matched returns 0 for no match */
7666             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7667             break;
7668
7669         case INSUBP:   /*  (?(R))  */
7670             n = ARG(scan);
7671             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7672              * of SCAN is already set up as matches a eval.close_paren */
7673             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7674             break;
7675
7676         case DEFINEP:  /*  (?(DEFINE))  */
7677             sw = 0;
7678             break;
7679
7680         case IFTHEN:   /*  (?(cond)A|B)  */
7681             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7682             if (sw)
7683                 next = NEXTOPER(NEXTOPER(scan));
7684             else {
7685                 next = scan + ARG(scan);
7686                 if (OP(next) == IFTHEN) /* Fake one. */
7687                     next = NEXTOPER(NEXTOPER(next));
7688             }
7689             break;
7690
7691         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7692             logical = scan->flags;
7693             break;
7694
7695 /*******************************************************************
7696
7697 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7698 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7699 STAR/PLUS/CURLY/CURLYN are used instead.)
7700
7701 A*B is compiled as <CURLYX><A><WHILEM><B>
7702
7703 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7704 state, which contains the current count, initialised to -1. It also sets
7705 cur_curlyx to point to this state, with any previous value saved in the
7706 state block.
7707
7708 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7709 since the pattern may possibly match zero times (i.e. it's a while {} loop
7710 rather than a do {} while loop).
7711
7712 Each entry to WHILEM represents a successful match of A. The count in the
7713 CURLYX block is incremented, another WHILEM state is pushed, and execution
7714 passes to A or B depending on greediness and the current count.
7715
7716 For example, if matching against the string a1a2a3b (where the aN are
7717 substrings that match /A/), then the match progresses as follows: (the
7718 pushed states are interspersed with the bits of strings matched so far):
7719
7720     <CURLYX cnt=-1>
7721     <CURLYX cnt=0><WHILEM>
7722     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7723     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7724     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7725     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7726
7727 (Contrast this with something like CURLYM, which maintains only a single
7728 backtrack state:
7729
7730     <CURLYM cnt=0> a1
7731     a1 <CURLYM cnt=1> a2
7732     a1 a2 <CURLYM cnt=2> a3
7733     a1 a2 a3 <CURLYM cnt=3> b
7734 )
7735
7736 Each WHILEM state block marks a point to backtrack to upon partial failure
7737 of A or B, and also contains some minor state data related to that
7738 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7739 overall state, such as the count, and pointers to the A and B ops.
7740
7741 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7742 must always point to the *current* CURLYX block, the rules are:
7743
7744 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7745 and set cur_curlyx to point the new block.
7746
7747 When popping the CURLYX block after a successful or unsuccessful match,
7748 restore the previous cur_curlyx.
7749
7750 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7751 to the outer one saved in the CURLYX block.
7752
7753 When popping the WHILEM block after a successful or unsuccessful B match,
7754 restore the previous cur_curlyx.
7755
7756 Here's an example for the pattern (AI* BI)*BO
7757 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7758
7759 cur_
7760 curlyx backtrack stack
7761 ------ ---------------
7762 NULL
7763 CO     <CO prev=NULL> <WO>
7764 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7765 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7766 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7767
7768 At this point the pattern succeeds, and we work back down the stack to
7769 clean up, restoring as we go:
7770
7771 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7772 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7773 CO     <CO prev=NULL> <WO>
7774 NULL
7775
7776 *******************************************************************/
7777
7778 #define ST st->u.curlyx
7779
7780         case CURLYX:    /* start of /A*B/  (for complex A) */
7781         {
7782             /* No need to save/restore up to this paren */
7783             I32 parenfloor = scan->flags;
7784
7785             assert(next); /* keep Coverity happy */
7786             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7787                 next += ARG(next);
7788
7789             /* XXXX Probably it is better to teach regpush to support
7790                parenfloor > maxopenparen ... */
7791             if (parenfloor > (I32)rex->lastparen)
7792                 parenfloor = rex->lastparen; /* Pessimization... */
7793
7794             ST.prev_curlyx= cur_curlyx;
7795             cur_curlyx = st;
7796             ST.cp = PL_savestack_ix;
7797
7798             /* these fields contain the state of the current curly.
7799              * they are accessed by subsequent WHILEMs */
7800             ST.parenfloor = parenfloor;
7801             ST.me = scan;
7802             ST.B = next;
7803             ST.minmod = minmod;
7804             minmod = 0;
7805             ST.count = -1;      /* this will be updated by WHILEM */
7806             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7807
7808             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7809             NOT_REACHED; /* NOTREACHED */
7810         }
7811
7812         case CURLYX_end: /* just finished matching all of A*B */
7813             cur_curlyx = ST.prev_curlyx;
7814             sayYES;
7815             NOT_REACHED; /* NOTREACHED */
7816
7817         case CURLYX_end_fail: /* just failed to match all of A*B */
7818             regcpblow(ST.cp);
7819             cur_curlyx = ST.prev_curlyx;
7820             sayNO;
7821             NOT_REACHED; /* NOTREACHED */
7822
7823
7824 #undef ST
7825 #define ST st->u.whilem
7826
7827         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7828         {
7829             /* see the discussion above about CURLYX/WHILEM */
7830             I32 n;
7831             int min, max;
7832             regnode *A;
7833
7834             assert(cur_curlyx); /* keep Coverity happy */
7835
7836             min = ARG1(cur_curlyx->u.curlyx.me);
7837             max = ARG2(cur_curlyx->u.curlyx.me);
7838             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7839             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7840             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7841             ST.cache_offset = 0;
7842             ST.cache_mask = 0;
7843
7844
7845             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7846                   depth, (long)n, min, max)
7847             );
7848
7849             /* First just match a string of min A's. */
7850
7851             if (n < min) {
7852                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7853                 cur_curlyx->u.curlyx.lastloc = locinput;
7854                 REGCP_SET(ST.lastcp);
7855
7856                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7857                 NOT_REACHED; /* NOTREACHED */
7858             }
7859
7860             /* If degenerate A matches "", assume A done. */
7861
7862             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7863                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7864                    depth)
7865                 );
7866                 goto do_whilem_B_max;
7867             }
7868
7869             /* super-linear cache processing.
7870              *
7871              * The idea here is that for certain types of CURLYX/WHILEM -
7872              * principally those whose upper bound is infinity (and
7873              * excluding regexes that have things like \1 and other very
7874              * non-regular expresssiony things), then if a pattern like
7875              * /....A*.../ fails and we backtrack to the WHILEM, then we
7876              * make a note that this particular WHILEM op was at string
7877              * position 47 (say) when the rest of pattern failed. Then, if
7878              * we ever find ourselves back at that WHILEM, and at string
7879              * position 47 again, we can just fail immediately rather than
7880              * running the rest of the pattern again.
7881              *
7882              * This is very handy when patterns start to go
7883              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7884              * with a combinatorial explosion of backtracking.
7885              *
7886              * The cache is implemented as a bit array, with one bit per
7887              * string byte position per WHILEM op (up to 16) - so its
7888              * between 0.25 and 2x the string size.
7889              *
7890              * To avoid allocating a poscache buffer every time, we do an
7891              * initially countdown; only after we have  executed a WHILEM
7892              * op (string-length x #WHILEMs) times do we allocate the
7893              * cache.
7894              *
7895              * The top 4 bits of scan->flags byte say how many different
7896              * relevant CURLLYX/WHILEM op pairs there are, while the
7897              * bottom 4-bits is the identifying index number of this
7898              * WHILEM.
7899              */
7900
7901             if (scan->flags) {
7902
7903                 if (!reginfo->poscache_maxiter) {
7904                     /* start the countdown: Postpone detection until we
7905                      * know the match is not *that* much linear. */
7906                     reginfo->poscache_maxiter
7907                         =    (reginfo->strend - reginfo->strbeg + 1)
7908                            * (scan->flags>>4);
7909                     /* possible overflow for long strings and many CURLYX's */
7910                     if (reginfo->poscache_maxiter < 0)
7911                         reginfo->poscache_maxiter = I32_MAX;
7912                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7913                 }
7914
7915                 if (reginfo->poscache_iter-- == 0) {
7916                     /* initialise cache */
7917                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7918                     regmatch_info_aux *const aux = reginfo->info_aux;
7919                     if (aux->poscache) {
7920                         if ((SSize_t)reginfo->poscache_size < size) {
7921                             Renew(aux->poscache, size, char);
7922                             reginfo->poscache_size = size;
7923                         }
7924                         Zero(aux->poscache, size, char);
7925                     }
7926                     else {
7927                         reginfo->poscache_size = size;
7928                         Newxz(aux->poscache, size, char);
7929                     }
7930                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7931       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
7932                               PL_colors[4], PL_colors[5])
7933                     );
7934                 }
7935
7936                 if (reginfo->poscache_iter < 0) {
7937                     /* have we already failed at this position? */
7938                     SSize_t offset, mask;
7939
7940                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7941                     offset  = (scan->flags & 0xf) - 1
7942                                 +   (locinput - reginfo->strbeg)
7943                                   * (scan->flags>>4);
7944                     mask    = 1 << (offset % 8);
7945                     offset /= 8;
7946                     if (reginfo->info_aux->poscache[offset] & mask) {
7947                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
7948                             depth)
7949                         );
7950                         cur_curlyx->u.curlyx.count--;
7951                         sayNO; /* cache records failure */
7952                     }
7953                     ST.cache_offset = offset;
7954                     ST.cache_mask   = mask;
7955                 }
7956             }
7957
7958             /* Prefer B over A for minimal matching. */
7959
7960             if (cur_curlyx->u.curlyx.minmod) {
7961                 ST.save_curlyx = cur_curlyx;
7962                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7963                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7964                                     locinput);
7965                 NOT_REACHED; /* NOTREACHED */
7966             }
7967
7968             /* Prefer A over B for maximal matching. */
7969
7970             if (n < max) { /* More greed allowed? */
7971                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7972                             maxopenparen);
7973                 cur_curlyx->u.curlyx.lastloc = locinput;
7974                 REGCP_SET(ST.lastcp);
7975                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7976                 NOT_REACHED; /* NOTREACHED */
7977             }
7978             goto do_whilem_B_max;
7979         }
7980         NOT_REACHED; /* NOTREACHED */
7981
7982         case WHILEM_B_min: /* just matched B in a minimal match */
7983         case WHILEM_B_max: /* just matched B in a maximal match */
7984             cur_curlyx = ST.save_curlyx;
7985             sayYES;
7986             NOT_REACHED; /* NOTREACHED */
7987
7988         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7989             cur_curlyx = ST.save_curlyx;
7990             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7991             cur_curlyx->u.curlyx.count--;
7992             CACHEsayNO;
7993             NOT_REACHED; /* NOTREACHED */
7994
7995         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7996             /* FALLTHROUGH */
7997         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7998             REGCP_UNWIND(ST.lastcp);
7999             regcppop(rex, &maxopenparen);
8000             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8001             cur_curlyx->u.curlyx.count--;
8002             CACHEsayNO;
8003             NOT_REACHED; /* NOTREACHED */
8004
8005         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8006             REGCP_UNWIND(ST.lastcp);
8007             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8008             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8009                 depth)
8010             );
8011           do_whilem_B_max:
8012             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8013                 && ckWARN(WARN_REGEXP)
8014                 && !reginfo->warned)
8015             {
8016                 reginfo->warned = TRUE;
8017                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8018                      "Complex regular subexpression recursion limit (%d) "
8019                      "exceeded",
8020                      REG_INFTY - 1);
8021             }
8022
8023             /* now try B */
8024             ST.save_curlyx = cur_curlyx;
8025             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8026             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8027                                 locinput);
8028             NOT_REACHED; /* NOTREACHED */
8029
8030         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8031             cur_curlyx = ST.save_curlyx;
8032
8033             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8034                 /* Maximum greed exceeded */
8035                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8036                     && ckWARN(WARN_REGEXP)
8037                     && !reginfo->warned)
8038                 {
8039                     reginfo->warned     = TRUE;
8040                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8041                         "Complex regular subexpression recursion "
8042                         "limit (%d) exceeded",
8043                         REG_INFTY - 1);
8044                 }
8045                 cur_curlyx->u.curlyx.count--;
8046                 CACHEsayNO;
8047             }
8048
8049             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8050             );
8051             /* Try grabbing another A and see if it helps. */
8052             cur_curlyx->u.curlyx.lastloc = locinput;
8053             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8054                             maxopenparen);
8055             REGCP_SET(ST.lastcp);
8056             PUSH_STATE_GOTO(WHILEM_A_min,
8057                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8058                 locinput);
8059             NOT_REACHED; /* NOTREACHED */
8060
8061 #undef  ST
8062 #define ST st->u.branch
8063
8064         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8065             next = scan + ARG(scan);
8066             if (next == scan)
8067                 next = NULL;
8068             scan = NEXTOPER(scan);
8069             /* FALLTHROUGH */
8070
8071         case BRANCH:        /*  /(...|A|...)/ */
8072             scan = NEXTOPER(scan); /* scan now points to inner node */
8073             ST.lastparen = rex->lastparen;
8074             ST.lastcloseparen = rex->lastcloseparen;
8075             ST.next_branch = next;
8076             REGCP_SET(ST.cp);
8077
8078             /* Now go into the branch */
8079             if (has_cutgroup) {
8080                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
8081             } else {
8082                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
8083             }
8084             NOT_REACHED; /* NOTREACHED */
8085
8086         case CUTGROUP:  /*  /(*THEN)/  */
8087             sv_yes_mark = st->u.mark.mark_name = scan->flags
8088                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8089                 : NULL;
8090             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
8091             NOT_REACHED; /* NOTREACHED */
8092
8093         case CUTGROUP_next_fail:
8094             do_cutgroup = 1;
8095             no_final = 1;
8096             if (st->u.mark.mark_name)
8097                 sv_commit = st->u.mark.mark_name;
8098             sayNO;
8099             NOT_REACHED; /* NOTREACHED */
8100
8101         case BRANCH_next:
8102             sayYES;
8103             NOT_REACHED; /* NOTREACHED */
8104
8105         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8106             if (do_cutgroup) {
8107                 do_cutgroup = 0;
8108                 no_final = 0;
8109             }
8110             REGCP_UNWIND(ST.cp);
8111             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8112             scan = ST.next_branch;
8113             /* no more branches? */
8114             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8115                 DEBUG_EXECUTE_r({
8116                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8117                         depth,
8118                         PL_colors[4],
8119                         PL_colors[5] );
8120                 });
8121                 sayNO_SILENT;
8122             }
8123             continue; /* execute next BRANCH[J] op */
8124             /* NOTREACHED */
8125
8126         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8127             minmod = 1;
8128             break;
8129
8130 #undef  ST
8131 #define ST st->u.curlym
8132
8133         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8134
8135             /* This is an optimisation of CURLYX that enables us to push
8136              * only a single backtracking state, no matter how many matches
8137              * there are in {m,n}. It relies on the pattern being constant
8138              * length, with no parens to influence future backrefs
8139              */
8140
8141             ST.me = scan;
8142             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8143
8144             ST.lastparen      = rex->lastparen;
8145             ST.lastcloseparen = rex->lastcloseparen;
8146
8147             /* if paren positive, emulate an OPEN/CLOSE around A */
8148             if (ST.me->flags) {
8149                 U32 paren = ST.me->flags;
8150                 if (paren > maxopenparen)
8151                     maxopenparen = paren;
8152                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8153             }
8154             ST.A = scan;
8155             ST.B = next;
8156             ST.alen = 0;
8157             ST.count = 0;
8158             ST.minmod = minmod;
8159             minmod = 0;
8160             ST.c1 = CHRTEST_UNINIT;
8161             REGCP_SET(ST.cp);
8162
8163             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8164                 goto curlym_do_B;
8165
8166           curlym_do_A: /* execute the A in /A{m,n}B/  */
8167             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
8168             NOT_REACHED; /* NOTREACHED */
8169
8170         case CURLYM_A: /* we've just matched an A */
8171             ST.count++;
8172             /* after first match, determine A's length: u.curlym.alen */
8173             if (ST.count == 1) {
8174                 if (reginfo->is_utf8_target) {
8175                     char *s = st->locinput;
8176                     while (s < locinput) {
8177                         ST.alen++;
8178                         s += UTF8SKIP(s);
8179                     }
8180                 }
8181                 else {
8182                     ST.alen = locinput - st->locinput;
8183                 }
8184                 if (ST.alen == 0)
8185                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8186             }
8187             DEBUG_EXECUTE_r(
8188                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8189                           depth, (IV) ST.count, (IV)ST.alen)
8190             );
8191
8192             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8193                 goto fake_end;
8194
8195             {
8196                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8197                 if ( max == REG_INFTY || ST.count < max )
8198                     goto curlym_do_A; /* try to match another A */
8199             }
8200             goto curlym_do_B; /* try to match B */
8201
8202         case CURLYM_A_fail: /* just failed to match an A */
8203             REGCP_UNWIND(ST.cp);
8204
8205
8206             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8207                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8208                 sayNO;
8209
8210           curlym_do_B: /* execute the B in /A{m,n}B/  */
8211             if (ST.c1 == CHRTEST_UNINIT) {
8212                 /* calculate c1 and c2 for possible match of 1st char
8213                  * following curly */
8214                 ST.c1 = ST.c2 = CHRTEST_VOID;
8215                 assert(ST.B);
8216                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8217                     regnode *text_node = ST.B;
8218                     if (! HAS_TEXT(text_node))
8219                         FIND_NEXT_IMPT(text_node);
8220                     /* this used to be
8221
8222                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8223
8224                         But the former is redundant in light of the latter.
8225
8226                         if this changes back then the macro for
8227                         IS_TEXT and friends need to change.
8228                      */
8229                     if (PL_regkind[OP(text_node)] == EXACT) {
8230                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8231                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8232                            reginfo))
8233                         {
8234                             sayNO;
8235                         }
8236                     }
8237                 }
8238             }
8239
8240             DEBUG_EXECUTE_r(
8241                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8242                     depth, (IV)ST.count)
8243                 );
8244             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8245                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8246                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8247                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8248                     {
8249                         /* simulate B failing */
8250                         DEBUG_OPTIMISE_r(
8251                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8252                                 depth,
8253                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8254                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8255                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8256                         );
8257                         state_num = CURLYM_B_fail;
8258                         goto reenter_switch;
8259                     }
8260                 }
8261                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8262                     /* simulate B failing */
8263                     DEBUG_OPTIMISE_r(
8264                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8265                             depth,
8266                             (int) nextchr, ST.c1, ST.c2)
8267                     );
8268                     state_num = CURLYM_B_fail;
8269                     goto reenter_switch;
8270                 }
8271             }
8272
8273             if (ST.me->flags) {
8274                 /* emulate CLOSE: mark current A as captured */
8275                 U32 paren = (U32)ST.me->flags;
8276                 if (ST.count) {
8277                     CLOSE_CAPTURE(paren,
8278                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
8279                         locinput - reginfo->strbeg);
8280                 }
8281                 else
8282                     rex->offs[paren].end = -1;
8283
8284                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8285                 {
8286                     if (ST.count)
8287                         goto fake_end;
8288                     else
8289                         sayNO;
8290                 }
8291             }
8292
8293             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8294             NOT_REACHED; /* NOTREACHED */
8295
8296         case CURLYM_B_fail: /* just failed to match a B */
8297             REGCP_UNWIND(ST.cp);
8298             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8299             if (ST.minmod) {
8300                 I32 max = ARG2(ST.me);
8301                 if (max != REG_INFTY && ST.count == max)
8302                     sayNO;
8303                 goto curlym_do_A; /* try to match a further A */
8304             }
8305             /* backtrack one A */
8306             if (ST.count == ARG1(ST.me) /* min */)
8307                 sayNO;
8308             ST.count--;
8309             SET_locinput(HOPc(locinput, -ST.alen));
8310             goto curlym_do_B; /* try to match B */
8311
8312 #undef ST
8313 #define ST st->u.curly
8314
8315 #define CURLY_SETPAREN(paren, success) \
8316     if (paren) { \
8317         if (success) { \
8318             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8319                                  locinput - reginfo->strbeg); \
8320         } \
8321         else { \
8322             rex->offs[paren].end = -1; \
8323             rex->lastparen      = ST.lastparen; \
8324             rex->lastcloseparen = ST.lastcloseparen; \
8325         } \
8326     }
8327
8328         case STAR:              /*  /A*B/ where A is width 1 char */
8329             ST.paren = 0;
8330             ST.min = 0;
8331             ST.max = REG_INFTY;
8332             scan = NEXTOPER(scan);
8333             goto repeat;
8334
8335         case PLUS:              /*  /A+B/ where A is width 1 char */
8336             ST.paren = 0;
8337             ST.min = 1;
8338             ST.max = REG_INFTY;
8339             scan = NEXTOPER(scan);
8340             goto repeat;
8341
8342         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8343             ST.paren = scan->flags;     /* Which paren to set */
8344             ST.lastparen      = rex->lastparen;
8345             ST.lastcloseparen = rex->lastcloseparen;
8346             if (ST.paren > maxopenparen)
8347                 maxopenparen = ST.paren;
8348             ST.min = ARG1(scan);  /* min to match */
8349             ST.max = ARG2(scan);  /* max to match */
8350             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8351
8352             /* handle the single-char capture called as a GOSUB etc */
8353             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8354             {
8355                 char *li = locinput;
8356                 if (!regrepeat(rex, &li, scan, reginfo, 1))
8357                     sayNO;
8358                 SET_locinput(li);
8359                 goto fake_end;
8360             }
8361
8362             goto repeat;
8363
8364         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8365             ST.paren = 0;
8366             ST.min = ARG1(scan);  /* min to match */
8367             ST.max = ARG2(scan);  /* max to match */
8368             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8369           repeat:
8370             /*
8371             * Lookahead to avoid useless match attempts
8372             * when we know what character comes next.
8373             *
8374             * Used to only do .*x and .*?x, but now it allows
8375             * for )'s, ('s and (?{ ... })'s to be in the way
8376             * of the quantifier and the EXACT-like node.  -- japhy
8377             */
8378
8379             assert(ST.min <= ST.max);
8380             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8381                 ST.c1 = ST.c2 = CHRTEST_VOID;
8382             }
8383             else {
8384                 regnode *text_node = next;
8385
8386                 if (! HAS_TEXT(text_node))
8387                     FIND_NEXT_IMPT(text_node);
8388
8389                 if (! HAS_TEXT(text_node))
8390                     ST.c1 = ST.c2 = CHRTEST_VOID;
8391                 else {
8392                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8393                         ST.c1 = ST.c2 = CHRTEST_VOID;
8394                     }
8395                     else {
8396
8397                     /*  Currently we only get here when
8398
8399                         PL_rekind[OP(text_node)] == EXACT
8400
8401                         if this changes back then the macro for IS_TEXT and
8402                         friends need to change. */
8403                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8404                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8405                            reginfo))
8406                         {
8407                             sayNO;
8408                         }
8409                     }
8410                 }
8411             }
8412
8413             ST.A = scan;
8414             ST.B = next;
8415             if (minmod) {
8416                 char *li = locinput;
8417                 minmod = 0;
8418                 if (ST.min &&
8419                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8420                             < ST.min)
8421                     sayNO;
8422                 SET_locinput(li);
8423                 ST.count = ST.min;
8424                 REGCP_SET(ST.cp);
8425                 if (ST.c1 == CHRTEST_VOID)
8426                     goto curly_try_B_min;
8427
8428                 ST.oldloc = locinput;
8429
8430                 /* set ST.maxpos to the furthest point along the
8431                  * string that could possibly match */
8432                 if  (ST.max == REG_INFTY) {
8433                     ST.maxpos = reginfo->strend - 1;
8434                     if (utf8_target)
8435                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8436                             ST.maxpos--;
8437                 }
8438                 else if (utf8_target) {
8439                     int m = ST.max - ST.min;
8440                     for (ST.maxpos = locinput;
8441                          m >0 && ST.maxpos < reginfo->strend; m--)
8442                         ST.maxpos += UTF8SKIP(ST.maxpos);
8443                 }
8444                 else {
8445                     ST.maxpos = locinput + ST.max - ST.min;
8446                     if (ST.maxpos >= reginfo->strend)
8447                         ST.maxpos = reginfo->strend - 1;
8448                 }
8449                 goto curly_try_B_min_known;
8450
8451             }
8452             else {
8453                 /* avoid taking address of locinput, so it can remain
8454                  * a register var */
8455                 char *li = locinput;
8456                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8457                 if (ST.count < ST.min)
8458                     sayNO;
8459                 SET_locinput(li);
8460                 if ((ST.count > ST.min)
8461                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8462                 {
8463                     /* A{m,n} must come at the end of the string, there's
8464                      * no point in backing off ... */
8465                     ST.min = ST.count;
8466                     /* ...except that $ and \Z can match before *and* after
8467                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8468                        We may back off by one in this case. */
8469                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8470                         ST.min--;
8471                 }
8472                 REGCP_SET(ST.cp);
8473                 goto curly_try_B_max;
8474             }
8475             NOT_REACHED; /* NOTREACHED */
8476
8477         case CURLY_B_min_fail:
8478             /* failed to find B in a non-greedy match.
8479              * Handles both cases where c1,c2 valid or not */
8480
8481             REGCP_UNWIND(ST.cp);
8482             if (ST.paren) {
8483                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8484             }
8485
8486             if (ST.c1 == CHRTEST_VOID) {
8487                 /* failed -- move forward one */
8488                 char *li = locinput;
8489                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8490                     sayNO;
8491                 }
8492                 locinput = li;
8493                 ST.count++;
8494                 if (!(   ST.count <= ST.max
8495                         /* count overflow ? */
8496                      || (ST.max == REG_INFTY && ST.count > 0))
8497                 )
8498                     sayNO;
8499             }
8500             else {
8501                 int n;
8502                 /* Couldn't or didn't -- move forward. */
8503                 ST.oldloc = locinput;
8504                 if (utf8_target)
8505                     locinput += UTF8SKIP(locinput);
8506                 else
8507                     locinput++;
8508                 ST.count++;
8509
8510               curly_try_B_min_known:
8511                 /* find the next place where 'B' could work, then call B */
8512                 if (utf8_target) {
8513                     n = (ST.oldloc == locinput) ? 0 : 1;
8514                     if (ST.c1 == ST.c2) {
8515                         /* set n to utf8_distance(oldloc, locinput) */
8516                         while (locinput <= ST.maxpos
8517                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8518                         {
8519                             locinput += UTF8SKIP(locinput);
8520                             n++;
8521                         }
8522                     }
8523                     else {
8524                         /* set n to utf8_distance(oldloc, locinput) */
8525                         while (locinput <= ST.maxpos
8526                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8527                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8528                         {
8529                             locinput += UTF8SKIP(locinput);
8530                             n++;
8531                         }
8532                     }
8533                 }
8534                 else {  /* Not utf8_target */
8535                     if (ST.c1 == ST.c2) {
8536                         locinput = (char *) memchr(locinput,
8537                                                    ST.c1,
8538                                                    ST.maxpos + 1 - locinput);
8539                         if (! locinput) {
8540                             locinput = ST.maxpos + 1;
8541                         }
8542                     }
8543                     else {
8544                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8545
8546                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8547                             while (   locinput <= ST.maxpos
8548                                    && UCHARAT(locinput) != ST.c1
8549                                    && UCHARAT(locinput) != ST.c2)
8550                             {
8551                                 locinput++;
8552                             }
8553                         }
8554                         else {
8555                             /* If c1 and c2 only differ by a single bit, we can
8556                              * avoid a conditional each time through the loop,
8557                              * at the expense of a little preliminary setup and
8558                              * an extra mask each iteration.  By masking out
8559                              * that bit, we match exactly two characters, c1
8560                              * and c2, and so we don't have to test for both.
8561                              * On both ASCII and EBCDIC platforms, most of the
8562                              * ASCII-range and Latin1-range folded equivalents
8563                              * differ only in a single bit, so this is actually
8564                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8565                              * 0x61). */
8566                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8567                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8568                             while (   locinput <= ST.maxpos
8569                                    && (UCHARAT(locinput) & c1_c2_mask)
8570                                                                 != c1_masked)
8571                             {
8572                                 locinput++;
8573                             }
8574                         }
8575                     }
8576                     n = locinput - ST.oldloc;
8577                 }
8578                 if (locinput > ST.maxpos)
8579                     sayNO;
8580                 if (n) {
8581                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8582                      * at b; check that everything between oldloc and
8583                      * locinput matches */
8584                     char *li = ST.oldloc;
8585                     ST.count += n;
8586                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8587                         sayNO;
8588                     assert(n == REG_INFTY || locinput == li);
8589                 }
8590             }
8591
8592           curly_try_B_min:
8593             CURLY_SETPAREN(ST.paren, ST.count);
8594             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8595             NOT_REACHED; /* NOTREACHED */
8596
8597
8598           curly_try_B_max:
8599             /* a successful greedy match: now try to match B */
8600             {
8601                 bool could_match = locinput < reginfo->strend;
8602
8603                 /* If it could work, try it. */
8604                 if (ST.c1 != CHRTEST_VOID && could_match) {
8605                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8606                     {
8607                         could_match = memEQ(locinput,
8608                                             ST.c1_utf8,
8609                                             UTF8SKIP(locinput))
8610                                     || memEQ(locinput,
8611                                              ST.c2_utf8,
8612                                              UTF8SKIP(locinput));
8613                     }
8614                     else {
8615                         could_match = UCHARAT(locinput) == ST.c1
8616                                       || UCHARAT(locinput) == ST.c2;
8617                     }
8618                 }
8619                 if (ST.c1 == CHRTEST_VOID || could_match) {
8620                     CURLY_SETPAREN(ST.paren, ST.count);
8621                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8622                     NOT_REACHED; /* NOTREACHED */
8623                 }
8624             }
8625             /* FALLTHROUGH */
8626
8627         case CURLY_B_max_fail:
8628             /* failed to find B in a greedy match */
8629
8630             REGCP_UNWIND(ST.cp);
8631             if (ST.paren) {
8632                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8633             }
8634             /*  back up. */
8635             if (--ST.count < ST.min)
8636                 sayNO;
8637             locinput = HOPc(locinput, -1);
8638             goto curly_try_B_max;
8639
8640 #undef ST
8641
8642         case END: /*  last op of main pattern  */
8643           fake_end:
8644             if (cur_eval) {
8645                 /* we've just finished A in /(??{A})B/; now continue with B */
8646                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8647                 st->u.eval.prev_rex = rex_sv;           /* inner */
8648
8649                 /* Save *all* the positions. */
8650                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8651                 rex_sv = CUR_EVAL.prev_rex;
8652                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8653                 SET_reg_curpm(rex_sv);
8654                 rex = ReANY(rex_sv);
8655                 rexi = RXi_GET(rex);
8656
8657                 st->u.eval.prev_curlyx = cur_curlyx;
8658                 cur_curlyx = CUR_EVAL.prev_curlyx;
8659
8660                 REGCP_SET(st->u.eval.lastcp);
8661
8662                 /* Restore parens of the outer rex without popping the
8663                  * savestack */
8664                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8665
8666                 st->u.eval.prev_eval = cur_eval;
8667                 cur_eval = CUR_EVAL.prev_eval;
8668                 DEBUG_EXECUTE_r(
8669                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8670                                       depth, cur_eval););
8671                 if ( nochange_depth )
8672                     nochange_depth--;
8673
8674                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8675
8676                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8677                                     locinput); /* match B */
8678             }
8679
8680             if (locinput < reginfo->till) {
8681                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8682                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8683                                       PL_colors[4],
8684                                       (long)(locinput - startpos),
8685                                       (long)(reginfo->till - startpos),
8686                                       PL_colors[5]));
8687
8688                 sayNO_SILENT;           /* Cannot match: too short. */
8689             }
8690             sayYES;                     /* Success! */
8691
8692         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8693             DEBUG_EXECUTE_r(
8694             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8695                 depth, PL_colors[4], PL_colors[5]));
8696             sayYES;                     /* Success! */
8697
8698 #undef  ST
8699 #define ST st->u.ifmatch
8700
8701         {
8702             char *newstart;
8703
8704         case SUSPEND:   /* (?>A) */
8705             ST.wanted = 1;
8706             newstart = locinput;
8707             goto do_ifmatch;
8708
8709         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8710             ST.wanted = 0;
8711             goto ifmatch_trivial_fail_test;
8712
8713         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8714             ST.wanted = 1;
8715           ifmatch_trivial_fail_test:
8716             if (scan->flags) {
8717                 char * const s = HOPBACKc(locinput, scan->flags);
8718                 if (!s) {
8719                     /* trivial fail */
8720                     if (logical) {
8721                         logical = 0;
8722                         sw = 1 - cBOOL(ST.wanted);
8723                     }
8724                     else if (ST.wanted)
8725                         sayNO;
8726                     next = scan + ARG(scan);
8727                     if (next == scan)
8728                         next = NULL;
8729                     break;
8730                 }
8731                 newstart = s;
8732             }
8733             else
8734                 newstart = locinput;
8735
8736           do_ifmatch:
8737             ST.me = scan;
8738             ST.logical = logical;
8739             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8740
8741             /* execute body of (?...A) */
8742             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8743             NOT_REACHED; /* NOTREACHED */
8744         }
8745
8746         case IFMATCH_A_fail: /* body of (?...A) failed */
8747             ST.wanted = !ST.wanted;
8748             /* FALLTHROUGH */
8749
8750         case IFMATCH_A: /* body of (?...A) succeeded */
8751             if (ST.logical) {
8752                 sw = cBOOL(ST.wanted);
8753             }
8754             else if (!ST.wanted)
8755                 sayNO;
8756
8757             if (OP(ST.me) != SUSPEND) {
8758                 /* restore old position except for (?>...) */
8759                 locinput = st->locinput;
8760             }
8761             scan = ST.me + ARG(ST.me);
8762             if (scan == ST.me)
8763                 scan = NULL;
8764             continue; /* execute B */
8765
8766 #undef ST
8767
8768         case LONGJMP: /*  alternative with many branches compiles to
8769                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8770             next = scan + ARG(scan);
8771             if (next == scan)
8772                 next = NULL;
8773             break;
8774
8775         case COMMIT:  /*  (*COMMIT)  */
8776             reginfo->cutpoint = reginfo->strend;
8777             /* FALLTHROUGH */
8778
8779         case PRUNE:   /*  (*PRUNE)   */
8780             if (scan->flags)
8781                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8782             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8783             NOT_REACHED; /* NOTREACHED */
8784
8785         case COMMIT_next_fail:
8786             no_final = 1;
8787             /* FALLTHROUGH */
8788             sayNO;
8789             NOT_REACHED; /* NOTREACHED */
8790
8791         case OPFAIL:   /* (*FAIL)  */
8792             if (scan->flags)
8793                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8794             if (logical) {
8795                 /* deal with (?(?!)X|Y) properly,
8796                  * make sure we trigger the no branch
8797                  * of the trailing IFTHEN structure*/
8798                 sw= 0;
8799                 break;
8800             } else {
8801                 sayNO;
8802             }
8803             NOT_REACHED; /* NOTREACHED */
8804
8805 #define ST st->u.mark
8806         case MARKPOINT: /*  (*MARK:foo)  */
8807             ST.prev_mark = mark_state;
8808             ST.mark_name = sv_commit = sv_yes_mark
8809                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8810             mark_state = st;
8811             ST.mark_loc = locinput;
8812             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8813             NOT_REACHED; /* NOTREACHED */
8814
8815         case MARKPOINT_next:
8816             mark_state = ST.prev_mark;
8817             sayYES;
8818             NOT_REACHED; /* NOTREACHED */
8819
8820         case MARKPOINT_next_fail:
8821             if (popmark && sv_eq(ST.mark_name,popmark))
8822             {
8823                 if (ST.mark_loc > startpoint)
8824                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8825                 popmark = NULL; /* we found our mark */
8826                 sv_commit = ST.mark_name;
8827
8828                 DEBUG_EXECUTE_r({
8829                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8830                             depth,
8831                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8832                 });
8833             }
8834             mark_state = ST.prev_mark;
8835             sv_yes_mark = mark_state ?
8836                 mark_state->u.mark.mark_name : NULL;
8837             sayNO;
8838             NOT_REACHED; /* NOTREACHED */
8839
8840         case SKIP:  /*  (*SKIP)  */
8841             if (!scan->flags) {
8842                 /* (*SKIP) : if we fail we cut here*/
8843                 ST.mark_name = NULL;
8844                 ST.mark_loc = locinput;
8845                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8846             } else {
8847                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8848                    otherwise do nothing.  Meaning we need to scan
8849                  */
8850                 regmatch_state *cur = mark_state;
8851                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8852
8853                 while (cur) {
8854                     if ( sv_eq( cur->u.mark.mark_name,
8855                                 find ) )
8856                     {
8857                         ST.mark_name = find;
8858                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8859                     }
8860                     cur = cur->u.mark.prev_mark;
8861                 }
8862             }
8863             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8864             break;
8865
8866         case SKIP_next_fail:
8867             if (ST.mark_name) {
8868                 /* (*CUT:NAME) - Set up to search for the name as we
8869                    collapse the stack*/
8870                 popmark = ST.mark_name;
8871             } else {
8872                 /* (*CUT) - No name, we cut here.*/
8873                 if (ST.mark_loc > startpoint)
8874                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8875                 /* but we set sv_commit to latest mark_name if there
8876                    is one so they can test to see how things lead to this
8877                    cut */
8878                 if (mark_state)
8879                     sv_commit=mark_state->u.mark.mark_name;
8880             }
8881             no_final = 1;
8882             sayNO;
8883             NOT_REACHED; /* NOTREACHED */
8884 #undef ST
8885
8886         case LNBREAK: /* \R */
8887             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8888                 locinput += n;
8889             } else
8890                 sayNO;
8891             break;
8892
8893         default:
8894             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8895                           PTR2UV(scan), OP(scan));
8896             Perl_croak(aTHX_ "regexp memory corruption");
8897
8898         /* this is a point to jump to in order to increment
8899          * locinput by one character */
8900           increment_locinput:
8901             assert(!NEXTCHR_IS_EOS);
8902             if (utf8_target) {
8903                 locinput += PL_utf8skip[nextchr];
8904                 /* locinput is allowed to go 1 char off the end (signifying
8905                  * EOS), but not 2+ */
8906                 if (locinput > reginfo->strend)
8907                     sayNO;
8908             }
8909             else
8910                 locinput++;
8911             break;
8912
8913         } /* end switch */
8914
8915         /* switch break jumps here */
8916         scan = next; /* prepare to execute the next op and ... */
8917         continue;    /* ... jump back to the top, reusing st */
8918         /* NOTREACHED */
8919
8920       push_yes_state:
8921         /* push a state that backtracks on success */
8922         st->u.yes.prev_yes_state = yes_state;
8923         yes_state = st;
8924         /* FALLTHROUGH */
8925       push_state:
8926         /* push a new regex state, then continue at scan  */
8927         {
8928             regmatch_state *newst;
8929
8930             DEBUG_STACK_r({
8931                 regmatch_state *cur = st;
8932                 regmatch_state *curyes = yes_state;
8933                 U32 i;
8934                 regmatch_slab *slab = PL_regmatch_slab;
8935                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8936                     if (cur < SLAB_FIRST(slab)) {
8937                         slab = slab->prev;
8938                         cur = SLAB_LAST(slab);
8939                     }
8940                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8941                         depth,
8942                         i ? "    " : "push",
8943                         depth - i, PL_reg_name[cur->resume_state],
8944                         (curyes == cur) ? "yes" : ""
8945                     );
8946                     if (curyes == cur)
8947                         curyes = cur->u.yes.prev_yes_state;
8948                 }
8949             } else
8950                 DEBUG_STATE_pp("push")
8951             );
8952             depth++;
8953             st->locinput = locinput;
8954             newst = st+1;
8955             if (newst >  SLAB_LAST(PL_regmatch_slab))
8956                 newst = S_push_slab(aTHX);
8957             PL_regmatch_state = newst;
8958
8959             locinput = pushinput;
8960             st = newst;
8961             continue;
8962             /* NOTREACHED */
8963         }
8964     }
8965 #ifdef SOLARIS_BAD_OPTIMIZER
8966 #  undef PL_charclass
8967 #endif
8968
8969     /*
8970     * We get here only if there's trouble -- normally "case END" is
8971     * the terminating point.
8972     */
8973     Perl_croak(aTHX_ "corrupted regexp pointers");
8974     NOT_REACHED; /* NOTREACHED */
8975
8976   yes:
8977     if (yes_state) {
8978         /* we have successfully completed a subexpression, but we must now
8979          * pop to the state marked by yes_state and continue from there */
8980         assert(st != yes_state);
8981 #ifdef DEBUGGING
8982         while (st != yes_state) {
8983             st--;
8984             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8985                 PL_regmatch_slab = PL_regmatch_slab->prev;
8986                 st = SLAB_LAST(PL_regmatch_slab);
8987             }
8988             DEBUG_STATE_r({
8989                 if (no_final) {
8990                     DEBUG_STATE_pp("pop (no final)");
8991                 } else {
8992                     DEBUG_STATE_pp("pop (yes)");
8993                 }
8994             });
8995             depth--;
8996         }
8997 #else
8998         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8999             || yes_state > SLAB_LAST(PL_regmatch_slab))
9000         {
9001             /* not in this slab, pop slab */
9002             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9003             PL_regmatch_slab = PL_regmatch_slab->prev;
9004             st = SLAB_LAST(PL_regmatch_slab);
9005         }
9006         depth -= (st - yes_state);
9007 #endif
9008         st = yes_state;
9009         yes_state = st->u.yes.prev_yes_state;
9010         PL_regmatch_state = st;
9011
9012         if (no_final)
9013             locinput= st->locinput;
9014         state_num = st->resume_state + no_final;
9015         goto reenter_switch;
9016     }
9017
9018     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9019                           PL_colors[4], PL_colors[5]));
9020
9021     if (reginfo->info_aux_eval) {
9022         /* each successfully executed (?{...}) block does the equivalent of
9023          *   local $^R = do {...}
9024          * When popping the save stack, all these locals would be undone;
9025          * bypass this by setting the outermost saved $^R to the latest
9026          * value */
9027         /* I dont know if this is needed or works properly now.
9028          * see code related to PL_replgv elsewhere in this file.
9029          * Yves
9030          */
9031         if (oreplsv != GvSV(PL_replgv))
9032             sv_setsv(oreplsv, GvSV(PL_replgv));
9033     }
9034     result = 1;
9035     goto final_exit;
9036
9037   no:
9038     DEBUG_EXECUTE_r(
9039         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9040             depth,
9041             PL_colors[4], PL_colors[5])
9042         );
9043
9044   no_silent:
9045     if (no_final) {
9046         if (yes_state) {
9047             goto yes;
9048         } else {
9049             goto final_exit;
9050         }
9051     }
9052     if (depth) {
9053         /* there's a previous state to backtrack to */
9054         st--;
9055         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9056             PL_regmatch_slab = PL_regmatch_slab->prev;
9057             st = SLAB_LAST(PL_regmatch_slab);
9058         }
9059         PL_regmatch_state = st;
9060         locinput= st->locinput;
9061
9062         DEBUG_STATE_pp("pop");
9063         depth--;
9064         if (yes_state == st)
9065             yes_state = st->u.yes.prev_yes_state;
9066
9067         state_num = st->resume_state + 1; /* failure = success + 1 */
9068         PERL_ASYNC_CHECK();
9069         goto reenter_switch;
9070     }
9071     result = 0;
9072
9073   final_exit:
9074     if (rex->intflags & PREGf_VERBARG_SEEN) {
9075         SV *sv_err = get_sv("REGERROR", 1);
9076         SV *sv_mrk = get_sv("REGMARK", 1);
9077         if (result) {
9078             sv_commit = &PL_sv_no;
9079             if (!sv_yes_mark)
9080                 sv_yes_mark = &PL_sv_yes;
9081         } else {
9082             if (!sv_commit)
9083                 sv_commit = &PL_sv_yes;
9084             sv_yes_mark = &PL_sv_no;
9085         }
9086         assert(sv_err);
9087         assert(sv_mrk);
9088         sv_setsv(sv_err, sv_commit);
9089         sv_setsv(sv_mrk, sv_yes_mark);
9090     }
9091
9092
9093     if (last_pushed_cv) {
9094         dSP;
9095         /* see "Some notes about MULTICALL" above */
9096         POP_MULTICALL;
9097         PERL_UNUSED_VAR(SP);
9098     }
9099     else
9100         LEAVE_SCOPE(orig_savestack_ix);
9101
9102     assert(!result ||  locinput - reginfo->strbeg >= 0);
9103     return result ?  locinput - reginfo->strbeg : -1;
9104 }
9105
9106 /*
9107  - regrepeat - repeatedly match something simple, report how many
9108  *
9109  * What 'simple' means is a node which can be the operand of a quantifier like
9110  * '+', or {1,3}
9111  *
9112  * startposp - pointer a pointer to the start position.  This is updated
9113  *             to point to the byte following the highest successful
9114  *             match.
9115  * p         - the regnode to be repeatedly matched against.
9116  * reginfo   - struct holding match state, such as strend
9117  * max       - maximum number of things to match.
9118  * depth     - (for debugging) backtracking depth.
9119  */
9120 STATIC I32
9121 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9122             regmatch_info *const reginfo, I32 max _pDEPTH)
9123 {
9124     char *scan;     /* Pointer to current position in target string */
9125     I32 c;
9126     char *loceol = reginfo->strend;   /* local version */
9127     I32 hardcount = 0;  /* How many matches so far */
9128     bool utf8_target = reginfo->is_utf8_target;
9129     unsigned int to_complement = 0;  /* Invert the result? */
9130     UV utf8_flags;
9131     _char_class_number classnum;
9132
9133     PERL_ARGS_ASSERT_REGREPEAT;
9134
9135     scan = *startposp;
9136     if (max == REG_INFTY)
9137         max = I32_MAX;
9138     else if (! utf8_target && loceol - scan > max)
9139         loceol = scan + max;
9140
9141     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
9142      * to the maximum of how far we should go in it (leaving it set to the real
9143      * end, if the maximum permissible would take us beyond that).  This allows
9144      * us to make the loop exit condition that we haven't gone past <loceol> to
9145      * also mean that we haven't exceeded the max permissible count, saving a
9146      * test each time through the loop.  But it assumes that the OP matches a
9147      * single byte, which is true for most of the OPs below when applied to a
9148      * non-UTF-8 target.  Those relatively few OPs that don't have this
9149      * characteristic will have to compensate.
9150      *
9151      * There is no adjustment for UTF-8 targets, as the number of bytes per
9152      * character varies.  OPs will have to test both that the count is less
9153      * than the max permissible (using <hardcount> to keep track), and that we
9154      * are still within the bounds of the string (using <loceol>.  A few OPs
9155      * match a single byte no matter what the encoding.  They can omit the max
9156      * test if, for the UTF-8 case, they do the adjustment that was skipped
9157      * above.
9158      *
9159      * Thus, the code above sets things up for the common case; and exceptional
9160      * cases need extra work; the common case is to make sure <scan> doesn't
9161      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
9162      * count doesn't exceed the maximum permissible */
9163
9164     switch (OP(p)) {
9165     case REG_ANY:
9166         if (utf8_target) {
9167             while (scan < loceol && hardcount < max && *scan != '\n') {
9168                 scan += UTF8SKIP(scan);
9169                 hardcount++;
9170             }
9171         } else {
9172             scan = (char *) memchr(scan, '\n', loceol - scan);
9173             if (! scan) {
9174                 scan = loceol;
9175             }
9176         }
9177         break;
9178     case SANY:
9179         if (utf8_target) {
9180             while (scan < loceol && hardcount < max) {
9181                 scan += UTF8SKIP(scan);
9182                 hardcount++;
9183             }
9184         }
9185         else
9186             scan = loceol;
9187         break;
9188     case EXACTL:
9189         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9190         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9191             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9192         }
9193         /* FALLTHROUGH */
9194     case EXACT:
9195         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9196
9197         c = (U8)*STRING(p);
9198
9199         /* Can use a simple find if the pattern char to match on is invariant
9200          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9201          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9202          * true iff it doesn't matter if the argument is in UTF-8 or not */
9203         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9204             if (utf8_target && loceol - scan > max) {
9205                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9206                  * since here, to match at all, 1 char == 1 byte */
9207                 loceol = scan + max;
9208             }
9209             scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9210         }
9211         else if (reginfo->is_utf8_pat) {
9212             if (utf8_target) {
9213                 STRLEN scan_char_len;
9214
9215                 /* When both target and pattern are UTF-8, we have to do
9216                  * string EQ */
9217                 while (hardcount < max
9218                        && scan < loceol
9219                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9220                        && memEQ(scan, STRING(p), scan_char_len))
9221                 {
9222                     scan += scan_char_len;
9223                     hardcount++;
9224                 }
9225             }
9226             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9227
9228                 /* Target isn't utf8; convert the character in the UTF-8
9229                  * pattern to non-UTF8, and do a simple find */
9230                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9231                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9232             } /* else pattern char is above Latin1, can't possibly match the
9233                  non-UTF-8 target */
9234         }
9235         else {
9236
9237             /* Here, the string must be utf8; pattern isn't, and <c> is
9238              * different in utf8 than not, so can't compare them directly.
9239              * Outside the loop, find the two utf8 bytes that represent c, and
9240              * then look for those in sequence in the utf8 string */
9241             U8 high = UTF8_TWO_BYTE_HI(c);
9242             U8 low = UTF8_TWO_BYTE_LO(c);
9243
9244             while (hardcount < max
9245                     && scan + 1 < loceol
9246                     && UCHARAT(scan) == high
9247                     && UCHARAT(scan + 1) == low)
9248             {
9249                 scan += 2;
9250                 hardcount++;
9251             }
9252         }
9253         break;
9254
9255     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9256         assert(! reginfo->is_utf8_pat);
9257         /* FALLTHROUGH */
9258     case EXACTFAA:
9259         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9260         goto do_exactf;
9261
9262     case EXACTFL:
9263         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9264         utf8_flags = FOLDEQ_LOCALE;
9265         goto do_exactf;
9266
9267     case EXACTF:   /* This node only generated for non-utf8 patterns */
9268         assert(! reginfo->is_utf8_pat);
9269         utf8_flags = 0;
9270         goto do_exactf;
9271
9272     case EXACTFLU8:
9273         if (! utf8_target) {
9274             break;
9275         }
9276         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9277                                     | FOLDEQ_S2_FOLDS_SANE;
9278         goto do_exactf;
9279
9280     case EXACTFU_SS:
9281     case EXACTFU:
9282         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9283
9284       do_exactf: {
9285         int c1, c2;
9286         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9287
9288         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9289
9290         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9291                                         reginfo))
9292         {
9293             if (c1 == CHRTEST_VOID) {
9294                 /* Use full Unicode fold matching */
9295                 char *tmpeol = reginfo->strend;
9296                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9297                 while (hardcount < max
9298                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9299                                              STRING(p), NULL, pat_len,
9300                                              reginfo->is_utf8_pat, utf8_flags))
9301                 {
9302                     scan = tmpeol;
9303                     tmpeol = reginfo->strend;
9304                     hardcount++;
9305                 }
9306             }
9307             else if (utf8_target) {
9308                 if (c1 == c2) {
9309                     while (scan < loceol
9310                            && hardcount < max
9311                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9312                     {
9313                         scan += UTF8SKIP(scan);
9314                         hardcount++;
9315                     }
9316                 }
9317                 else {
9318                     while (scan < loceol
9319                            && hardcount < max
9320                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9321                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9322                     {
9323                         scan += UTF8SKIP(scan);
9324                         hardcount++;
9325                     }
9326                 }
9327             }
9328             else if (c1 == c2) {
9329                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c1);
9330             }
9331             else {
9332                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9333                  * a conditional each time through the loop if the characters
9334                  * differ only in a single bit, as is the usual situation */
9335                 U8 c1_c2_bits_differing = c1 ^ c2;
9336
9337                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9338                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9339
9340                     scan = (char *) find_span_end_mask((U8 *) scan,
9341                                                        (U8 *) loceol,
9342                                                        c1 & c1_c2_mask,
9343                                                        c1_c2_mask);
9344                 }
9345                 else {
9346                     while (    scan < loceol
9347                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9348                     {
9349                         scan++;
9350                     }
9351                 }
9352             }
9353         }
9354         break;
9355     }
9356     case ANYOFPOSIXL:
9357     case ANYOFL:
9358         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9359
9360         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9361             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9362         }
9363         /* FALLTHROUGH */
9364     case ANYOFD:
9365     case ANYOF:
9366         if (utf8_target) {
9367             while (hardcount < max
9368                    && scan < loceol
9369                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9370             {
9371                 scan += UTF8SKIP(scan);
9372                 hardcount++;
9373             }
9374         }
9375         else if (ANYOF_FLAGS(p)) {
9376             while (scan < loceol
9377                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9378                 scan++;
9379         }
9380         else {
9381             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9382                 scan++;
9383         }
9384         break;
9385
9386     case ANYOFM:
9387         if (utf8_target && loceol - scan > max) {
9388
9389             /* We didn't adjust <loceol> at the beginning of this routine
9390              * because is UTF-8, but it is actually ok to do so, since here, to
9391              * match, 1 char == 1 byte. */
9392             loceol = scan + max;
9393         }
9394
9395         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9396         break;
9397
9398     case ASCII:
9399         if (utf8_target && loceol - scan > max) {
9400             loceol = scan + max;
9401         }
9402
9403         scan = find_next_non_ascii(scan, loceol, utf8_target);
9404         break;
9405
9406     case NASCII:
9407         if (utf8_target) {
9408             while (     hardcount < max
9409                    &&   scan < loceol
9410                    && ! isASCII_utf8_safe(scan, loceol))
9411             {
9412                 scan += UTF8SKIP(scan);
9413                 hardcount++;
9414             }
9415         }
9416         else {
9417             scan = find_next_ascii(scan, loceol, utf8_target);
9418         }
9419         break;
9420
9421     /* The argument (FLAGS) to all the POSIX node types is the class number */
9422
9423     case NPOSIXL:
9424         to_complement = 1;
9425         /* FALLTHROUGH */
9426
9427     case POSIXL:
9428         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9429         if (! utf8_target) {
9430             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9431                                                                    *scan)))
9432             {
9433                 scan++;
9434             }
9435         } else {
9436             while (hardcount < max && scan < loceol
9437                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9438                                                                   (U8 *) scan,
9439                                                                   (U8 *) loceol)))
9440             {
9441                 scan += UTF8SKIP(scan);
9442                 hardcount++;
9443             }
9444         }
9445         break;
9446
9447     case POSIXD:
9448         if (utf8_target) {
9449             goto utf8_posix;
9450         }
9451         /* FALLTHROUGH */
9452
9453     case POSIXA:
9454         if (utf8_target && loceol - scan > max) {
9455
9456             /* We didn't adjust <loceol> at the beginning of this routine
9457              * because is UTF-8, but it is actually ok to do so, since here, to
9458              * match, 1 char == 1 byte. */
9459             loceol = scan + max;
9460         }
9461         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9462             scan++;
9463         }
9464         break;
9465
9466     case NPOSIXD:
9467         if (utf8_target) {
9468             to_complement = 1;
9469             goto utf8_posix;
9470         }
9471         /* FALLTHROUGH */
9472
9473     case NPOSIXA:
9474         if (! utf8_target) {
9475             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9476                 scan++;
9477             }
9478         }
9479         else {
9480
9481             /* The complement of something that matches only ASCII matches all
9482              * non-ASCII, plus everything in ASCII that isn't in the class. */
9483             while (hardcount < max && scan < loceol
9484                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9485                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9486             {
9487                 scan += UTF8SKIP(scan);
9488                 hardcount++;
9489             }
9490         }
9491         break;
9492
9493     case NPOSIXU:
9494         to_complement = 1;
9495         /* FALLTHROUGH */
9496
9497     case POSIXU:
9498         if (! utf8_target) {
9499             while (scan < loceol && to_complement
9500                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9501             {
9502                 scan++;
9503             }
9504         }
9505         else {
9506           utf8_posix:
9507             classnum = (_char_class_number) FLAGS(p);
9508             switch (classnum) {
9509                 default:
9510                     while (   hardcount < max && scan < loceol
9511                            && to_complement ^ cBOOL(_invlist_contains_cp(
9512                                               PL_XPosix_ptrs[classnum],
9513                                               utf8_to_uvchr_buf((U8 *) scan,
9514                                                                 (U8 *) loceol,
9515                                                                 NULL))))
9516                     {
9517                         scan += UTF8SKIP(scan);
9518                         hardcount++;
9519                     }
9520                     break;
9521
9522                     /* For the classes below, the knowledge of how to handle
9523                      * every code point is compiled in to Perl via a macro.
9524                      * This code is written for making the loops as tight as
9525                      * possible.  It could be refactored to save space instead.
9526                      * */
9527
9528                 case _CC_ENUM_SPACE:
9529                     while (hardcount < max
9530                            && scan < loceol
9531                            && (to_complement
9532                                ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9533                     {
9534                         scan += UTF8SKIP(scan);
9535                         hardcount++;
9536                     }
9537                     break;
9538                 case _CC_ENUM_BLANK:
9539                     while (hardcount < max
9540                            && scan < loceol
9541                            && (to_complement
9542                                 ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9543                     {
9544                         scan += UTF8SKIP(scan);
9545                         hardcount++;
9546                     }
9547                     break;
9548                 case _CC_ENUM_XDIGIT:
9549                     while (hardcount < max
9550                            && scan < loceol
9551                            && (to_complement
9552                                ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9553                     {
9554                         scan += UTF8SKIP(scan);
9555                         hardcount++;
9556                     }
9557                     break;
9558                 case _CC_ENUM_VERTSPACE:
9559                     while (hardcount < max
9560                            && scan < loceol
9561                            && (to_complement
9562                                ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9563                     {
9564                         scan += UTF8SKIP(scan);
9565                         hardcount++;
9566                     }
9567                     break;
9568                 case _CC_ENUM_CNTRL:
9569                     while (hardcount < max
9570                            && scan < loceol
9571                            && (to_complement
9572                                ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9573                     {
9574                         scan += UTF8SKIP(scan);
9575                         hardcount++;
9576                     }
9577                     break;
9578             }
9579         }
9580         break;
9581
9582     case LNBREAK:
9583         if (utf8_target) {
9584             while (hardcount < max && scan < loceol &&
9585                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9586                 scan += c;
9587                 hardcount++;
9588             }
9589         } else {
9590             /* LNBREAK can match one or two latin chars, which is ok, but we
9591              * have to use hardcount in this situation, and throw away the
9592              * adjustment to <loceol> done before the switch statement */
9593             loceol = reginfo->strend;
9594             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9595                 scan+=c;
9596                 hardcount++;
9597             }
9598         }
9599         break;
9600
9601     case BOUNDL:
9602     case NBOUNDL:
9603         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9604         /* FALLTHROUGH */
9605     case BOUND:
9606     case BOUNDA:
9607     case BOUNDU:
9608     case EOS:
9609     case GPOS:
9610     case KEEPS:
9611     case NBOUND:
9612     case NBOUNDA:
9613     case NBOUNDU:
9614     case OPFAIL:
9615     case SBOL:
9616     case SEOL:
9617         /* These are all 0 width, so match right here or not at all. */
9618         break;
9619
9620     default:
9621         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9622         NOT_REACHED; /* NOTREACHED */
9623
9624     }
9625
9626     if (hardcount)
9627         c = hardcount;
9628     else
9629         c = scan - *startposp;
9630     *startposp = scan;
9631
9632     DEBUG_r({
9633         GET_RE_DEBUG_FLAGS_DECL;
9634         DEBUG_EXECUTE_r({
9635             SV * const prop = sv_newmortal();
9636             regprop(prog, prop, p, reginfo, NULL);
9637             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9638                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9639         });
9640     });
9641
9642     return(c);
9643 }
9644
9645
9646 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9647 /*
9648 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9649 create a copy so that changes the caller makes won't change the shared one.
9650 If <altsvp> is non-null, will return NULL in it, for back-compat.
9651  */
9652 SV *
9653 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9654 {
9655     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9656
9657     if (altsvp) {
9658         *altsvp = NULL;
9659     }
9660
9661     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9662 }
9663
9664 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9665
9666 /*
9667  - reginclass - determine if a character falls into a character class
9668
9669   n is the ANYOF-type regnode
9670   p is the target string
9671   p_end points to one byte beyond the end of the target string
9672   utf8_target tells whether p is in UTF-8.
9673
9674   Returns true if matched; false otherwise.
9675
9676   Note that this can be a synthetic start class, a combination of various
9677   nodes, so things you think might be mutually exclusive, such as locale,
9678   aren't.  It can match both locale and non-locale
9679
9680  */
9681
9682 STATIC bool
9683 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9684 {
9685     dVAR;
9686     const char flags = ANYOF_FLAGS(n);
9687     bool match = FALSE;
9688     UV c = *p;
9689
9690     PERL_ARGS_ASSERT_REGINCLASS;
9691
9692     /* If c is not already the code point, get it.  Note that
9693      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9694     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9695         STRLEN c_len = 0;
9696         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9697         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9698         if (c_len == (STRLEN)-1) {
9699             _force_out_malformed_utf8_message(p, p_end,
9700                                               utf8n_flags,
9701                                               1 /* 1 means die */ );
9702             NOT_REACHED; /* NOTREACHED */
9703         }
9704         if (     c > 255
9705             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9706             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9707         {
9708             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9709         }
9710     }
9711
9712     /* If this character is potentially in the bitmap, check it */
9713     if (c < NUM_ANYOF_CODE_POINTS) {
9714         if (ANYOF_BITMAP_TEST(n, c))
9715             match = TRUE;
9716         else if ((flags
9717                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9718                   && OP(n) == ANYOFD
9719                   && ! utf8_target
9720                   && ! isASCII(c))
9721         {
9722             match = TRUE;
9723         }
9724         else if (flags & ANYOF_LOCALE_FLAGS) {
9725             if ((flags & ANYOFL_FOLD)
9726                 && c < 256
9727                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9728             {
9729                 match = TRUE;
9730             }
9731             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9732                      && c < 256
9733             ) {
9734
9735                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9736                  * if the class includes the Posix character class given by
9737                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9738                  * complemented Posix class given by int(bit/2).  So we loop
9739                  * through the bits, each time changing whether we complement
9740                  * the result or not.  Suppose for the sake of illustration
9741                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9742                  * is set, it means there is a match for this ANYOF node if the
9743                  * character is in the class given by the expression (0 / 2 = 0
9744                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9745                  * and since 'to_complement' is 0, the result will stay TRUE,
9746                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9747                  * bit 1 is 1.  That means there is a match if the character
9748                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9749                  * but will on bit 1.  On the second iteration 'to_complement'
9750                  * will be 1, so the exclusive or will reverse things, so we
9751                  * are testing for \W.  On the third iteration, 'to_complement'
9752                  * will be 0, and we would be testing for \s; the fourth
9753                  * iteration would test for \S, etc.
9754                  *
9755                  * Note that this code assumes that all the classes are closed
9756                  * under folding.  For example, if a character matches \w, then
9757                  * its fold does too; and vice versa.  This should be true for
9758                  * any well-behaved locale for all the currently defined Posix
9759                  * classes, except for :lower: and :upper:, which are handled
9760                  * by the pseudo-class :cased: which matches if either of the
9761                  * other two does.  To get rid of this assumption, an outer
9762                  * loop could be used below to iterate over both the source
9763                  * character, and its fold (if different) */
9764
9765                 int count = 0;
9766                 int to_complement = 0;
9767
9768                 while (count < ANYOF_MAX) {
9769                     if (ANYOF_POSIXL_TEST(n, count)
9770                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9771                     {
9772                         match = TRUE;
9773                         break;
9774                     }
9775                     count++;
9776                     to_complement ^= 1;
9777                 }
9778             }
9779         }
9780     }
9781
9782
9783     /* If the bitmap didn't (or couldn't) match, and something outside the
9784      * bitmap could match, try that. */
9785     if (!match) {
9786         if (c >= NUM_ANYOF_CODE_POINTS
9787             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9788         {
9789             match = TRUE;       /* Everything above the bitmap matches */
9790         }
9791             /* Here doesn't match everything above the bitmap.  If there is
9792              * some information available beyond the bitmap, we may find a
9793              * match in it.  If so, this is most likely because the code point
9794              * is outside the bitmap range.  But rarely, it could be because of
9795              * some other reason.  If so, various flags are set to indicate
9796              * this possibility.  On ANYOFD nodes, there may be matches that
9797              * happen only when the target string is UTF-8; or for other node
9798              * types, because runtime lookup is needed, regardless of the
9799              * UTF-8ness of the target string.  Finally, under /il, there may
9800              * be some matches only possible if the locale is a UTF-8 one. */
9801         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9802                  && (   c >= NUM_ANYOF_CODE_POINTS
9803                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9804                          && (   UNLIKELY(OP(n) != ANYOFD)
9805                              || (utf8_target && ! isASCII_uni(c)
9806 #                               if NUM_ANYOF_CODE_POINTS > 256
9807                                                                  && c < 256
9808 #                               endif
9809                                 )))
9810                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9811                          && IN_UTF8_CTYPE_LOCALE)))
9812         {
9813             SV* only_utf8_locale = NULL;
9814             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9815                                                        &only_utf8_locale, NULL);
9816             if (sw) {
9817                 U8 utf8_buffer[2];
9818                 U8 * utf8_p;
9819                 if (utf8_target) {
9820                     utf8_p = (U8 *) p;
9821                 } else { /* Convert to utf8 */
9822                     utf8_p = utf8_buffer;
9823                     append_utf8_from_native_byte(*p, &utf8_p);
9824                     utf8_p = utf8_buffer;
9825                 }
9826
9827                 if (swash_fetch(sw, utf8_p, TRUE)) {
9828                     match = TRUE;
9829                 }
9830             }
9831             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9832                 match = _invlist_contains_cp(only_utf8_locale, c);
9833             }
9834         }
9835
9836         if (UNICODE_IS_SUPER(c)
9837             && (flags
9838                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9839             && OP(n) != ANYOFD
9840             && ckWARN_d(WARN_NON_UNICODE))
9841         {
9842             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9843                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9844         }
9845     }
9846
9847 #if ANYOF_INVERT != 1
9848     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9849      * use it, don't */
9850 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9851 #endif
9852
9853     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9854     return (flags & ANYOF_INVERT) ^ match;
9855 }
9856
9857 STATIC U8 *
9858 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9859 {
9860     /* return the position 'off' UTF-8 characters away from 's', forward if
9861      * 'off' >= 0, backwards if negative.  But don't go outside of position
9862      * 'lim', which better be < s  if off < 0 */
9863
9864     PERL_ARGS_ASSERT_REGHOP3;
9865
9866     if (off >= 0) {
9867         while (off-- && s < lim) {
9868             /* XXX could check well-formedness here */
9869             U8 *new_s = s + UTF8SKIP(s);
9870             if (new_s > lim) /* lim may be in the middle of a long character */
9871                 return s;
9872             s = new_s;
9873         }
9874     }
9875     else {
9876         while (off++ && s > lim) {
9877             s--;
9878             if (UTF8_IS_CONTINUED(*s)) {
9879                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9880                     s--;
9881                 if (! UTF8_IS_START(*s)) {
9882                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9883                 }
9884             }
9885             /* XXX could check well-formedness here */
9886         }
9887     }
9888     return s;
9889 }
9890
9891 STATIC U8 *
9892 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9893 {
9894     PERL_ARGS_ASSERT_REGHOP4;
9895
9896     if (off >= 0) {
9897         while (off-- && s < rlim) {
9898             /* XXX could check well-formedness here */
9899             s += UTF8SKIP(s);
9900         }
9901     }
9902     else {
9903         while (off++ && s > llim) {
9904             s--;
9905             if (UTF8_IS_CONTINUED(*s)) {
9906                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9907                     s--;
9908                 if (! UTF8_IS_START(*s)) {
9909                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9910                 }
9911             }
9912             /* XXX could check well-formedness here */
9913         }
9914     }
9915     return s;
9916 }
9917
9918 /* like reghop3, but returns NULL on overrun, rather than returning last
9919  * char pos */
9920
9921 STATIC U8 *
9922 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9923 {
9924     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9925
9926     if (off >= 0) {
9927         while (off-- && s < lim) {
9928             /* XXX could check well-formedness here */
9929             s += UTF8SKIP(s);
9930         }
9931         if (off >= 0)
9932             return NULL;
9933     }
9934     else {
9935         while (off++ && s > lim) {
9936             s--;
9937             if (UTF8_IS_CONTINUED(*s)) {
9938                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9939                     s--;
9940                 if (! UTF8_IS_START(*s)) {
9941                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9942                 }
9943             }
9944             /* XXX could check well-formedness here */
9945         }
9946         if (off <= 0)
9947             return NULL;
9948     }
9949     return s;
9950 }
9951
9952
9953 /* when executing a regex that may have (?{}), extra stuff needs setting
9954    up that will be visible to the called code, even before the current
9955    match has finished. In particular:
9956
9957    * $_ is localised to the SV currently being matched;
9958    * pos($_) is created if necessary, ready to be updated on each call-out
9959      to code;
9960    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9961      isn't set until the current pattern is successfully finished), so that
9962      $1 etc of the match-so-far can be seen;
9963    * save the old values of subbeg etc of the current regex, and  set then
9964      to the current string (again, this is normally only done at the end
9965      of execution)
9966 */
9967
9968 static void
9969 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9970 {
9971     MAGIC *mg;
9972     regexp *const rex = ReANY(reginfo->prog);
9973     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9974
9975     eval_state->rex = rex;
9976
9977     if (reginfo->sv) {
9978         /* Make $_ available to executed code. */
9979         if (reginfo->sv != DEFSV) {
9980             SAVE_DEFSV;
9981             DEFSV_set(reginfo->sv);
9982         }
9983
9984         if (!(mg = mg_find_mglob(reginfo->sv))) {
9985             /* prepare for quick setting of pos */
9986             mg = sv_magicext_mglob(reginfo->sv);
9987             mg->mg_len = -1;
9988         }
9989         eval_state->pos_magic = mg;
9990         eval_state->pos       = mg->mg_len;
9991         eval_state->pos_flags = mg->mg_flags;
9992     }
9993     else
9994         eval_state->pos_magic = NULL;
9995
9996     if (!PL_reg_curpm) {
9997         /* PL_reg_curpm is a fake PMOP that we can attach the current
9998          * regex to and point PL_curpm at, so that $1 et al are visible
9999          * within a /(?{})/. It's just allocated once per interpreter the
10000          * first time its needed */
10001         Newxz(PL_reg_curpm, 1, PMOP);
10002 #ifdef USE_ITHREADS
10003         {
10004             SV* const repointer = &PL_sv_undef;
10005             /* this regexp is also owned by the new PL_reg_curpm, which
10006                will try to free it.  */
10007             av_push(PL_regex_padav, repointer);
10008             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10009             PL_regex_pad = AvARRAY(PL_regex_padav);
10010         }
10011 #endif
10012     }
10013     SET_reg_curpm(reginfo->prog);
10014     eval_state->curpm = PL_curpm;
10015     PL_curpm_under = PL_curpm;
10016     PL_curpm = PL_reg_curpm;
10017     if (RXp_MATCH_COPIED(rex)) {
10018         /*  Here is a serious problem: we cannot rewrite subbeg,
10019             since it may be needed if this match fails.  Thus
10020             $` inside (?{}) could fail... */
10021         eval_state->subbeg     = rex->subbeg;
10022         eval_state->sublen     = rex->sublen;
10023         eval_state->suboffset  = rex->suboffset;
10024         eval_state->subcoffset = rex->subcoffset;
10025 #ifdef PERL_ANY_COW
10026         eval_state->saved_copy = rex->saved_copy;
10027 #endif
10028         RXp_MATCH_COPIED_off(rex);
10029     }
10030     else
10031         eval_state->subbeg = NULL;
10032     rex->subbeg = (char *)reginfo->strbeg;
10033     rex->suboffset = 0;
10034     rex->subcoffset = 0;
10035     rex->sublen = reginfo->strend - reginfo->strbeg;
10036 }
10037
10038
10039 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10040
10041 static void
10042 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10043 {
10044     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10045     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10046     regmatch_slab *s;
10047
10048     Safefree(aux->poscache);
10049
10050     if (eval_state) {
10051
10052         /* undo the effects of S_setup_eval_state() */
10053
10054         if (eval_state->subbeg) {
10055             regexp * const rex = eval_state->rex;
10056             rex->subbeg     = eval_state->subbeg;
10057             rex->sublen     = eval_state->sublen;
10058             rex->suboffset  = eval_state->suboffset;
10059             rex->subcoffset = eval_state->subcoffset;
10060 #ifdef PERL_ANY_COW
10061             rex->saved_copy = eval_state->saved_copy;
10062 #endif
10063             RXp_MATCH_COPIED_on(rex);
10064         }
10065         if (eval_state->pos_magic)
10066         {
10067             eval_state->pos_magic->mg_len = eval_state->pos;
10068             eval_state->pos_magic->mg_flags =
10069                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10070                | (eval_state->pos_flags & MGf_BYTES);
10071         }
10072
10073         PL_curpm = eval_state->curpm;
10074     }
10075
10076     PL_regmatch_state = aux->old_regmatch_state;
10077     PL_regmatch_slab  = aux->old_regmatch_slab;
10078
10079     /* free all slabs above current one - this must be the last action
10080      * of this function, as aux and eval_state are allocated within
10081      * slabs and may be freed here */
10082
10083     s = PL_regmatch_slab->next;
10084     if (s) {
10085         PL_regmatch_slab->next = NULL;
10086         while (s) {
10087             regmatch_slab * const osl = s;
10088             s = s->next;
10089             Safefree(osl);
10090         }
10091     }
10092 }
10093
10094
10095 STATIC void
10096 S_to_utf8_substr(pTHX_ regexp *prog)
10097 {
10098     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10099      * on the converted value */
10100
10101     int i = 1;
10102
10103     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10104
10105     do {
10106         if (prog->substrs->data[i].substr
10107             && !prog->substrs->data[i].utf8_substr) {
10108             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10109             prog->substrs->data[i].utf8_substr = sv;
10110             sv_utf8_upgrade(sv);
10111             if (SvVALID(prog->substrs->data[i].substr)) {
10112                 if (SvTAIL(prog->substrs->data[i].substr)) {
10113                     /* Trim the trailing \n that fbm_compile added last
10114                        time.  */
10115                     SvCUR_set(sv, SvCUR(sv) - 1);
10116                     /* Whilst this makes the SV technically "invalid" (as its
10117                        buffer is no longer followed by "\0") when fbm_compile()
10118                        adds the "\n" back, a "\0" is restored.  */
10119                     fbm_compile(sv, FBMcf_TAIL);
10120                 } else
10121                     fbm_compile(sv, 0);
10122             }
10123             if (prog->substrs->data[i].substr == prog->check_substr)
10124                 prog->check_utf8 = sv;
10125         }
10126     } while (i--);
10127 }
10128
10129 STATIC bool
10130 S_to_byte_substr(pTHX_ regexp *prog)
10131 {
10132     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10133      * on the converted value; returns FALSE if can't be converted. */
10134
10135     int i = 1;
10136
10137     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10138
10139     do {
10140         if (prog->substrs->data[i].utf8_substr
10141             && !prog->substrs->data[i].substr) {
10142             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10143             if (! sv_utf8_downgrade(sv, TRUE)) {
10144                 return FALSE;
10145             }
10146             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10147                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10148                     /* Trim the trailing \n that fbm_compile added last
10149                         time.  */
10150                     SvCUR_set(sv, SvCUR(sv) - 1);
10151                     fbm_compile(sv, FBMcf_TAIL);
10152                 } else
10153                     fbm_compile(sv, 0);
10154             }
10155             prog->substrs->data[i].substr = sv;
10156             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10157                 prog->check_substr = sv;
10158         }
10159     } while (i--);
10160
10161     return TRUE;
10162 }
10163
10164 #ifndef PERL_IN_XSUB_RE
10165
10166 bool
10167 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10168 {
10169     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10170      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10171      * the larger string bounded by 'strbeg' and 'strend'.
10172      *
10173      * 'cp' needs to be assigned (if not a future version of the Unicode
10174      * Standard could make it something that combines with adjacent characters,
10175      * so code using it would then break), and there has to be a GCB break
10176      * before and after the character. */
10177
10178     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10179     const U8 * prev_cp_start;
10180
10181     PERL_ARGS_ASSERT__IS_GRAPHEME;
10182
10183     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10184         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10185     {
10186         /* These are considered graphemes */
10187         return TRUE;
10188     }
10189
10190     /* Otherwise, unassigned code points are forbidden */
10191     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10192                                     _invlist_search(PL_Assigned_invlist, cp))))
10193     {
10194         return FALSE;
10195     }
10196
10197     cp_gcb_val = getGCB_VAL_CP(cp);
10198
10199     /* Find the GCB value of the previous code point in the input */
10200     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10201     if (UNLIKELY(prev_cp_start == s)) {
10202         prev_cp_gcb_val = GCB_EDGE;
10203     }
10204     else {
10205         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10206     }
10207
10208     /* And check that is a grapheme boundary */
10209     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10210                 TRUE /* is UTF-8 encoded */ ))
10211     {
10212         return FALSE;
10213     }
10214
10215     /* Similarly verify there is a break between the current character and the
10216      * following one */
10217     s += UTF8SKIP(s);
10218     if (s >= strend) {
10219         next_cp_gcb_val = GCB_EDGE;
10220     }
10221     else {
10222         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10223     }
10224
10225     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10226 }
10227
10228 /*
10229 =head1 Unicode Support
10230
10231 =for apidoc isSCRIPT_RUN
10232
10233 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10234 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10235 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10236 two degenerate cases given below, this function returns TRUE iff all code
10237 points in it come from any combination of three "scripts" given by the Unicode
10238 "Script Extensions" property: Common, Inherited, and possibly one other.
10239 Additionally all decimal digits must come from the same consecutive sequence of
10240 10.
10241
10242 For example, if all the characters in the sequence are Greek, or Common, or
10243 Inherited, this function will return TRUE, provided any decimal digits in it
10244 are the ASCII digits "0".."9".  For scripts (unlike Greek) that have their own
10245 digits defined this will accept either digits from that set or from 0..9, but
10246 not a combination of the two.  Some scripts, such as Arabic, have more than one
10247 set of digits.  All digits must come from the same set for this function to
10248 return TRUE.
10249
10250 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10251 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10252 C<SCX_INVALID> if the function returns FALSE.
10253
10254 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10255 will be C<SCX_INVALID>.
10256
10257 If the sequence contains a single code point which is unassigned to a character
10258 in the version of Unicode being used, the function will return TRUE, and the
10259 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10260 in the input sequence will result in the function treating the input as not
10261 being a script run.
10262
10263 The returned script will be C<SCX_Inherited> iff all the code points in it are
10264 from the Inherited script.
10265
10266 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10267 it are from the Inherited or Common scripts.
10268
10269 =cut
10270
10271 */
10272
10273 bool
10274 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10275 {
10276     /* Basically, it looks at each character in the sequence to see if the
10277      * above conditions are met; if not it fails.  It uses an inversion map to
10278      * find the enum corresponding to the script of each character.  But this
10279      * is complicated by the fact that a few code points can be in any of
10280      * several scripts.  The data has been constructed so that there are
10281      * additional enum values (all negative) for these situations.  The
10282      * absolute value of those is an index into another table which contains
10283      * pointers to auxiliary tables for each such situation.  Each aux array
10284      * lists all the scripts for the given situation.  There is another,
10285      * parallel, table that gives the number of entries in each aux table.
10286      * These are all defined in charclass_invlists.h */
10287
10288     /* XXX Here are the additional things UTS 39 says could be done:
10289      *
10290      * Forbid sequences of the same nonspacing mark
10291      *
10292      * Check to see that all the characters are in the sets of exemplar
10293      * characters for at least one language in the Unicode Common Locale Data
10294      * Repository [CLDR]. */
10295
10296
10297     /* Things that match /\d/u */
10298     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10299     UV * decimals_array = invlist_array(decimals_invlist);
10300
10301     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10302      * not currently known) */
10303     UV zero_of_run = 0;
10304
10305     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10306     SCX_enum script_of_char = SCX_INVALID;
10307
10308     /* If the script remains not fully determined from iteration to iteration,
10309      * this is the current intersection of the possiblities.  */
10310     SCX_enum * intersection = NULL;
10311     PERL_UINT_FAST8_T intersection_len = 0;
10312
10313     bool retval = TRUE;
10314     SCX_enum * ret_script = NULL;
10315
10316     assert(send >= s);
10317
10318     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10319
10320     /* All code points in 0..255 are either Common or Latin, so must be a
10321      * script run.  We can return immediately unless we need to know which
10322      * script it is. */
10323     if (! utf8_target && LIKELY(send > s)) {
10324         if (ret_script == NULL) {
10325             return TRUE;
10326         }
10327
10328         /* If any character is Latin, the run is Latin */
10329         while (s < send) {
10330             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10331                 *ret_script = SCX_Latin;
10332                 return TRUE;
10333             }
10334         }
10335
10336         /* Here, all are Common */
10337         *ret_script = SCX_Common;
10338         return TRUE;
10339     }
10340
10341     /* Look at each character in the sequence */
10342     while (s < send) {
10343         /* If the current character being examined is a digit, this is the code
10344          * point of the zero for its sequence of 10 */
10345         UV zero_of_char;
10346
10347         UV cp;
10348
10349         /* The code allows all scripts to use the ASCII digits.  This is
10350          * because they are used in commerce even in scripts that have their
10351          * own set.  Hence any ASCII ones found are ok, unless and until a
10352          * digit from another set has already been encountered.  (The other
10353          * digit ranges in Common are not similarly blessed) */
10354         if (UNLIKELY(isDIGIT(*s))) {
10355             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10356                 retval = FALSE;
10357                 break;
10358             }
10359             if (zero_of_run) {
10360                 if (zero_of_run != '0') {
10361                     retval = FALSE;
10362                     break;
10363                 }
10364             }
10365             else {
10366                 zero_of_run = '0';
10367             }
10368             s++;
10369             continue;
10370         }
10371
10372         /* Here, isn't an ASCII digit.  Find the code point of the character */
10373         if (! UTF8_IS_INVARIANT(*s)) {
10374             Size_t len;
10375             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10376             s += len;
10377         }
10378         else {
10379             cp = *(s++);
10380         }
10381
10382         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10383          * digit in that script.  We can skip the rest of this code for this
10384          * character. */
10385         if (UNLIKELY(   zero_of_run
10386                      && cp >= zero_of_run
10387                      && cp - zero_of_run <= 9))
10388         {
10389             continue;
10390         }
10391
10392         /* Find the character's script.  The correct values are hard-coded here
10393          * for small-enough code points. */
10394         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10395                                unlikely to change */
10396             if (       cp > 255
10397                 || (   isALPHA_L1(cp)
10398                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10399             {
10400                 script_of_char = SCX_Latin;
10401             }
10402             else {
10403                 script_of_char = SCX_Common;
10404             }
10405         }
10406         else {
10407             script_of_char = _Perl_SCX_invmap[
10408                                        _invlist_search(PL_SCX_invlist, cp)];
10409         }
10410
10411         /* We arbitrarily accept a single unassigned character, but not in
10412          * combination with anything else, and not a run of them. */
10413         if (   UNLIKELY(script_of_run == SCX_Unknown)
10414             || UNLIKELY(   script_of_run != SCX_INVALID
10415                         && script_of_char == SCX_Unknown))
10416         {
10417             retval = FALSE;
10418             break;
10419         }
10420
10421         /* For the first character, or the run is inherited, the run's script
10422          * is set to the char's */
10423         if (   UNLIKELY(script_of_run == SCX_INVALID)
10424             || UNLIKELY(script_of_run == SCX_Inherited))
10425         {
10426             script_of_run = script_of_char;
10427         }
10428
10429         /* For the character's script to be Unknown, it must be the first
10430          * character in the sequence (for otherwise a test above would have
10431          * prevented us from reaching here), and we have set the run's script
10432          * to it.  Nothing further to be done for this character */
10433         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10434             continue;
10435         }
10436
10437         /* We accept 'inherited' script characters currently even at the
10438          * beginning.  (We know that no characters in Inherited are digits, or
10439          * we'd have to check for that) */
10440         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10441             continue;
10442         }
10443
10444         /* If the run so far is Common, and the new character isn't, change the
10445          * run's script to that of this character */
10446         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10447
10448             /* But Common contains several sets of digits.  Only the '0' set
10449              * can be part of another script. */
10450             if (zero_of_run && zero_of_run != '0') {
10451                 retval = FALSE;
10452                 break;
10453             }
10454
10455             script_of_run = script_of_char;
10456         }
10457
10458         /* Now we can see if the script of the character is the same as that of
10459          * the run */
10460         if (LIKELY(script_of_char == script_of_run)) {
10461             /* By far the most common case */
10462             goto scripts_match;
10463         }
10464
10465         /* Here, the script of the run isn't Common.  But characters in Common
10466          * match any script */
10467         if (script_of_char == SCX_Common) {
10468             goto scripts_match;
10469         }
10470
10471 #ifndef HAS_SCX_AUX_TABLES
10472
10473         /* Too early a Unicode version to have a code point belonging to more
10474          * than one script, so, if the scripts don't exactly match, fail */
10475         PERL_UNUSED_VAR(intersection_len);
10476         retval = FALSE;
10477         break;
10478
10479 #else
10480
10481         /* Here there is no exact match between the character's script and the
10482          * run's.  And we've handled the special cases of scripts Unknown,
10483          * Inherited, and Common.
10484          *
10485          * Negative script numbers signify that the value may be any of several
10486          * scripts, and we need to look at auxiliary information to make our
10487          * deterimination.  But if both are non-negative, we can fail now */
10488         if (LIKELY(script_of_char >= 0)) {
10489             const SCX_enum * search_in;
10490             PERL_UINT_FAST8_T search_in_len;
10491             PERL_UINT_FAST8_T i;
10492
10493             if (LIKELY(script_of_run >= 0)) {
10494                 retval = FALSE;
10495                 break;
10496             }
10497
10498             /* Use the previously constructed set of possible scripts, if any.
10499              * */
10500             if (intersection) {
10501                 search_in = intersection;
10502                 search_in_len = intersection_len;
10503             }
10504             else {
10505                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10506                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10507             }
10508
10509             for (i = 0; i < search_in_len; i++) {
10510                 if (search_in[i] == script_of_char) {
10511                     script_of_run = script_of_char;
10512                     goto scripts_match;
10513                 }
10514             }
10515
10516             retval = FALSE;
10517             break;
10518         }
10519         else if (LIKELY(script_of_run >= 0)) {
10520             /* script of character could be one of several, but run is a single
10521              * script */
10522             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10523             const PERL_UINT_FAST8_T search_in_len
10524                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10525             PERL_UINT_FAST8_T i;
10526
10527             for (i = 0; i < search_in_len; i++) {
10528                 if (search_in[i] == script_of_run) {
10529                     script_of_char = script_of_run;
10530                     goto scripts_match;
10531                 }
10532             }
10533
10534             retval = FALSE;
10535             break;
10536         }
10537         else {
10538             /* Both run and char could be in one of several scripts.  If the
10539              * intersection is empty, then this character isn't in this script
10540              * run.  Otherwise, we need to calculate the intersection to use
10541              * for future iterations of the loop, unless we are already at the
10542              * final character */
10543             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10544             const PERL_UINT_FAST8_T char_len
10545                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10546             const SCX_enum * search_run;
10547             PERL_UINT_FAST8_T run_len;
10548
10549             SCX_enum * new_overlap = NULL;
10550             PERL_UINT_FAST8_T i, j;
10551
10552             if (intersection) {
10553                 search_run = intersection;
10554                 run_len = intersection_len;
10555             }
10556             else {
10557                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10558                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10559             }
10560
10561             intersection_len = 0;
10562
10563             for (i = 0; i < run_len; i++) {
10564                 for (j = 0; j < char_len; j++) {
10565                     if (search_run[i] == search_char[j]) {
10566
10567                         /* Here, the script at i,j matches.  That means this
10568                          * character is in the run.  But continue on to find
10569                          * the complete intersection, for the next loop
10570                          * iteration, and for the digit check after it.
10571                          *
10572                          * On the first found common script, we malloc space
10573                          * for the intersection list for the worst case of the
10574                          * intersection, which is the minimum of the number of
10575                          * scripts remaining in each set. */
10576                         if (intersection_len == 0) {
10577                             Newx(new_overlap,
10578                                  MIN(run_len - i, char_len - j),
10579                                  SCX_enum);
10580                         }
10581                         new_overlap[intersection_len++] = search_run[i];
10582                     }
10583                 }
10584             }
10585
10586             /* Here we've looked through everything.  If they have no scripts
10587              * in common, not a run */
10588             if (intersection_len == 0) {
10589                 retval = FALSE;
10590                 break;
10591             }
10592
10593             /* If there is only a single script in common, set to that.
10594              * Otherwise, use the intersection going forward */
10595             Safefree(intersection);
10596             intersection = NULL;
10597             if (intersection_len == 1) {
10598                 script_of_run = script_of_char = new_overlap[0];
10599                 Safefree(new_overlap);
10600                 new_overlap = NULL;
10601             }
10602             else {
10603                 intersection = new_overlap;
10604             }
10605         }
10606
10607 #endif
10608
10609   scripts_match:
10610
10611         /* Here, the script of the character is compatible with that of the
10612          * run.  That means that in most cases, it continues the script run.
10613          * Either it and the run match exactly, or one or both can be in any of
10614          * several scripts, and the intersection is not empty.  However, if the
10615          * character is a decimal digit, it could still mean failure if it is
10616          * from the wrong sequence of 10.  So, we need to look at if it's a
10617          * digit.  We've already handled the 10 decimal digits, and the next
10618          * lowest one is this one: */
10619         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10620             continue;   /* Not a digit; this character is part of the run */
10621         }
10622
10623         /* If we have a definitive '0' for the script of this character, we
10624          * know that for this to be a digit, it must be in the range of +0..+9
10625          * of that zero. */
10626         if (   script_of_char >= 0
10627             && (zero_of_char = script_zeros[script_of_char]))
10628         {
10629             if (   cp < zero_of_char
10630                 || cp > zero_of_char + 9)
10631             {
10632                 continue;   /* Not a digit; this character is part of the run
10633                              */
10634             }
10635
10636         }
10637         else {  /* Need to look up if this character is a digit or not */
10638             SSize_t index_of_zero_of_char;
10639             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10640             if (     UNLIKELY(index_of_zero_of_char < 0)
10641                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10642             {
10643                 continue;   /* Not a digit; this character is part of the run.
10644                              */
10645             }
10646
10647             zero_of_char = decimals_array[index_of_zero_of_char];
10648         }
10649
10650         /* Here, the character is a decimal digit, and the zero of its sequence
10651          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
10652          * they better be the same. */
10653         if (zero_of_run) {
10654             if (zero_of_run != zero_of_char) {
10655                 retval = FALSE;
10656                 break;
10657             }
10658         }
10659         else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
10660
10661             /* Here, the script run isn't Common, but the current digit is in
10662              * Common, and isn't '0'-'9' (those were handled earlier).   Only
10663              * '0'-'9' are acceptable in non-Common scripts. */
10664             retval = FALSE;
10665             break;
10666         }
10667         else {  /* Otherwise we now have a zero for this run */
10668             zero_of_run = zero_of_char;
10669         }
10670     } /* end of looping through CLOSESR text */
10671
10672     Safefree(intersection);
10673
10674     if (ret_script != NULL) {
10675         if (retval) {
10676             *ret_script = script_of_run;
10677         }
10678         else {
10679             *ret_script = SCX_INVALID;
10680         }
10681     }
10682
10683     return retval;
10684 }
10685
10686 #endif /* ifndef PERL_IN_XSUB_RE */
10687
10688 /*
10689  * ex: set ts=8 sts=4 sw=4 et:
10690  */