regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  41       "Use of \\b{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  42
  43 /*
  44  * pregcomp and pregexec -- regsub and regerror are not used in perl
  45  *
  46  *      Copyright (c) 1986 by University of Toronto.
  47  *      Written by Henry Spencer.  Not derived from licensed software.
  48  *
  49  *      Permission is granted to anyone to use this software for any
  50  *      purpose on any computer system, and to redistribute it freely,
  51  *      subject to the following restrictions:
  52  *
  53  *      1. The author is not responsible for the consequences of use of
  54  *              this software, no matter how awful, even if they arise
  55  *              from defects in it.
  56  *
  57  *      2. The origin of this software must not be misrepresented, either
  58  *              by explicit claim or by omission.
  59  *
  60  *      3. Altered versions must be plainly marked as such, and must not
  61  *              be misrepresented as being the original software.
  62  *
  63  ****    Alterations to Henry's code are...
  64  ****
  65  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  66  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  67  ****    by Larry Wall and others
  68  ****
  69  ****    You may distribute under the terms of either the GNU General Public
  70  ****    License or the Artistic License, as specified in the README file.
  71  *
  72  * Beware that some of this code is subtly aware of the way operator
  73  * precedence is structured in regular expressions.  Serious changes in
  74  * regular-expression syntax might require a total rethink.
  75  */
  76 #include "EXTERN.h"
  77 #define PERL_IN_REGEXEC_C
  78 #include "perl.h"
  79
  80 #ifdef PERL_IN_XSUB_RE
  81 #  include "re_comp.h"
  82 #else
  83 #  include "regcomp.h"
  84 #endif
  85
  86 #include "inline_invlist.c"
  87 #include "unicode_constants.h"
  88
  89 #ifdef DEBUGGING
  90 /* At least one required character in the target string is expressible only in
  91  * UTF-8. */
  92 static const char* const non_utf8_target_but_utf8_required
  93                 = "Can't match, because target string needs to be in UTF-8\n";
  94 #endif
  95
  96 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  97     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  98     goto target; \
  99 } STMT_END
 100
 101 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 /* Valid only for non-utf8 strings: avoids the reginclass
 108  * call if there are no complications: i.e., if everything matchable is
 109  * straight forward in the bitmap */
 110 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 111                                               : ANYOF_BITMAP_TEST(p,*(c)))
 112
 113 /*
 114  * Forwards.
 115  */
 116
 117 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 118 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 119
 120 #define HOPc(pos,off) \
 121         (char *)(reginfo->is_utf8_target \
 122             ? reghop3((U8*)pos, off, \
 123                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 124             : (U8*)(pos + off))
 125
 126 #define HOPBACKc(pos, off) \
 127         (char*)(reginfo->is_utf8_target \
 128             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 129             : (pos - off >= reginfo->strbeg)    \
 130                 ? (U8*)pos - off                \
 131                 : NULL)
 132
 133 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 135
 136 /* lim must be +ve. Returns NULL on overshoot */
 137 #define HOPMAYBE3(pos,off,lim) \
 138         (reginfo->is_utf8_target                        \
 139             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 140             : ((U8*)pos + off <= lim)                   \
 141                 ? (U8*)pos + off                        \
 142                 : NULL)
 143
 144 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 145  * off must be >=0; args should be vars rather than expressions */
 146 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 147     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 148     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 149
 150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 152     : (U8*)(pos + off))
 153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 154
 155 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 156 #define NEXTCHR_IS_EOS (nextchr < 0)
 157
 158 #define SET_nextchr \
 159     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 160
 161 #define SET_locinput(p) \
 162     locinput = (p);  \
 163     SET_nextchr
 164
 165
 166 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 167         if (!swash_ptr) {                                                     \
 168             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 169             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 170                                          1, 0, invlist, &flags);              \
 171             assert(swash_ptr);                                                \
 172         }                                                                     \
 173     } STMT_END
 174
 175 /* If in debug mode, we test that a known character properly matches */
 176 #ifdef DEBUGGING
 177 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 178                                           property_name,                      \
 179                                           invlist,                            \
 180                                           utf8_char_in_property)              \
 181         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 182         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 183 #else
 184 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 185                                           property_name,                      \
 186                                           invlist,                            \
 187                                           utf8_char_in_property)              \
 188         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 189 #endif
 190
 191 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 192                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 193                                         "",                                   \
 194                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 195                                         LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 196
 197 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 198 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 199
 200 /* for use after a quantifier and before an EXACT-like node -- japhy */
 201 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 202  *
 203  * NOTE that *nothing* that affects backtracking should be in here, specifically
 204  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 205  * node that is in between two EXACT like nodes when ascertaining what the required
 206  * "follow" character is. This should probably be moved to regex compile time
 207  * although it may be done at run time beause of the REF possibility - more
 208  * investigation required. -- demerphq
 209 */
 210 #define JUMPABLE(rn) (                                                             \
 211     OP(rn) == OPEN ||                                                              \
 212     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 213     OP(rn) == EVAL ||                                                              \
 214     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 215     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 216     OP(rn) == KEEPS ||                                                             \
 217     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 218 )
 219 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 220
 221 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 222
 223 #if 0
 224 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 225    we don't need this definition.  XXX These are now out-of-sync*/
 226 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 227 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 228 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 229
 230 #else
 231 /* ... so we use this as its faster. */
 232 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 233 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 234 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 235 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 236
 237 #endif
 238
 239 /*
 240   Search for mandatory following text node; for lookahead, the text must
 241   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 242 */
 243 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 244     while (JUMPABLE(rn)) { \
 245         const OPCODE type = OP(rn); \
 246         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 247             rn = NEXTOPER(NEXTOPER(rn)); \
 248         else if (type == PLUS) \
 249             rn = NEXTOPER(rn); \
 250         else if (type == IFMATCH) \
 251             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 252         else rn += NEXT_OFF(rn); \
 253     } \
 254 } STMT_END
 255
 256 #define SLAB_FIRST(s) (&(s)->states[0])
 257 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 258
 259 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 260 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 261 static regmatch_state * S_push_slab(pTHX);
 262
 263 #define REGCP_PAREN_ELEMS 3
 264 #define REGCP_OTHER_ELEMS 3
 265 #define REGCP_FRAME_ELEMS 1
 266 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 267  * are needed for the regexp context stack bookkeeping. */
 268
 269 STATIC CHECKPOINT
 270 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 271 {
 272     const int retval = PL_savestack_ix;
 273     const int paren_elems_to_push =
 274                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 275     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 276     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 277     I32 p;
 278     GET_RE_DEBUG_FLAGS_DECL;
 279
 280     PERL_ARGS_ASSERT_REGCPPUSH;
 281
 282     if (paren_elems_to_push < 0)
 283         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 284                    (int)paren_elems_to_push, (int)maxopenparen,
 285                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 286
 287     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 288         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 289                    " out of range (%lu-%ld)",
 290                    total_elems,
 291                    (unsigned long)maxopenparen,
 292                    (long)parenfloor);
 293
 294     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 295
 296     DEBUG_BUFFERS_r(
 297         if ((int)maxopenparen > (int)parenfloor)
 298             PerlIO_printf(Perl_debug_log,
 299                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 300                 PTR2UV(rex),
 301                 PTR2UV(rex->offs)
 302             );
 303     );
 304     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 305 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 306         SSPUSHIV(rex->offs[p].end);
 307         SSPUSHIV(rex->offs[p].start);
 308         SSPUSHINT(rex->offs[p].start_tmp);
 309         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 310             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 311             (UV)p,
 312             (IV)rex->offs[p].start,
 313             (IV)rex->offs[p].start_tmp,
 314             (IV)rex->offs[p].end
 315         ));
 316     }
 317 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 318     SSPUSHINT(maxopenparen);
 319     SSPUSHINT(rex->lastparen);
 320     SSPUSHINT(rex->lastcloseparen);
 321     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 322
 323     return retval;
 324 }
 325
 326 /* These are needed since we do not localize EVAL nodes: */
 327 #define REGCP_SET(cp)                                           \
 328     DEBUG_STATE_r(                                              \
 329             PerlIO_printf(Perl_debug_log,                       \
 330                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 331                 (IV)PL_savestack_ix));                          \
 332     cp = PL_savestack_ix
 333
 334 #define REGCP_UNWIND(cp)                                        \
 335     DEBUG_STATE_r(                                              \
 336         if (cp != PL_savestack_ix)                              \
 337             PerlIO_printf(Perl_debug_log,                       \
 338                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 339                 (IV)(cp), (IV)PL_savestack_ix));                \
 340     regcpblow(cp)
 341
 342 #define UNWIND_PAREN(lp, lcp)               \
 343     for (n = rex->lastparen; n > lp; n--)   \
 344         rex->offs[n].end = -1;              \
 345     rex->lastparen = n;                     \
 346     rex->lastcloseparen = lcp;
 347
 348
 349 STATIC void
 350 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 351 {
 352     UV i;
 353     U32 paren;
 354     GET_RE_DEBUG_FLAGS_DECL;
 355
 356     PERL_ARGS_ASSERT_REGCPPOP;
 357
 358     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 359     i = SSPOPUV;
 360     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 361     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 362     rex->lastcloseparen = SSPOPINT;
 363     rex->lastparen = SSPOPINT;
 364     *maxopenparen_p = SSPOPINT;
 365
 366     i -= REGCP_OTHER_ELEMS;
 367     /* Now restore the parentheses context. */
 368     DEBUG_BUFFERS_r(
 369         if (i || rex->lastparen + 1 <= rex->nparens)
 370             PerlIO_printf(Perl_debug_log,
 371                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 372                 PTR2UV(rex),
 373                 PTR2UV(rex->offs)
 374             );
 375     );
 376     paren = *maxopenparen_p;
 377     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 378         SSize_t tmps;
 379         rex->offs[paren].start_tmp = SSPOPINT;
 380         rex->offs[paren].start = SSPOPIV;
 381         tmps = SSPOPIV;
 382         if (paren <= rex->lastparen)
 383             rex->offs[paren].end = tmps;
 384         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 385             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 386             (UV)paren,
 387             (IV)rex->offs[paren].start,
 388             (IV)rex->offs[paren].start_tmp,
 389             (IV)rex->offs[paren].end,
 390             (paren > rex->lastparen ? "(skipped)" : ""));
 391         );
 392         paren--;
 393     }
 394 #if 1
 395     /* It would seem that the similar code in regtry()
 396      * already takes care of this, and in fact it is in
 397      * a better location to since this code can #if 0-ed out
 398      * but the code in regtry() is needed or otherwise tests
 399      * requiring null fields (pat.t#187 and split.t#{13,14}
 400      * (as of patchlevel 7877)  will fail.  Then again,
 401      * this code seems to be necessary or otherwise
 402      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 403      * --jhi updated by dapm */
 404     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 405         if (i > *maxopenparen_p)
 406             rex->offs[i].start = -1;
 407         rex->offs[i].end = -1;
 408         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 409             "    \\%"UVuf": %s   ..-1 undeffing\n",
 410             (UV)i,
 411             (i > *maxopenparen_p) ? "-1" : "  "
 412         ));
 413     }
 414 #endif
 415 }
 416
 417 /* restore the parens and associated vars at savestack position ix,
 418  * but without popping the stack */
 419
 420 STATIC void
 421 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 422 {
 423     I32 tmpix = PL_savestack_ix;
 424     PL_savestack_ix = ix;
 425     regcppop(rex, maxopenparen_p);
 426     PL_savestack_ix = tmpix;
 427 }
 428
 429 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 430
 431 STATIC bool
 432 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 433 {
 434     /* Returns a boolean as to whether or not 'character' is a member of the
 435      * Posix character class given by 'classnum' that should be equivalent to a
 436      * value in the typedef '_char_class_number'.
 437      *
 438      * Ideally this could be replaced by a just an array of function pointers
 439      * to the C library functions that implement the macros this calls.
 440      * However, to compile, the precise function signatures are required, and
 441      * these may vary from platform to to platform.  To avoid having to figure
 442      * out what those all are on each platform, I (khw) am using this method,
 443      * which adds an extra layer of function call overhead (unless the C
 444      * optimizer strips it away).  But we don't particularly care about
 445      * performance with locales anyway. */
 446
 447     switch ((_char_class_number) classnum) {
 448         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 449         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 450         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 451         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 452         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 453                                         || isUPPER_LC(character);
 454         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 455         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 456         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 457         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 458         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 459         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 460         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 461         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 462         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 463         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 464         default:    /* VERTSPACE should never occur in locales */
 465             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 466     }
 467
 468     NOT_REACHED; /* NOTREACHED */
 469     return FALSE;
 470 }
 471
 472 STATIC bool
 473 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 474 {
 475     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 476      * 'character' is a member of the Posix character class given by 'classnum'
 477      * that should be equivalent to a value in the typedef
 478      * '_char_class_number'.
 479      *
 480      * This just calls isFOO_lc on the code point for the character if it is in
 481      * the range 0-255.  Outside that range, all characters use Unicode
 482      * rules, ignoring any locale.  So use the Unicode function if this class
 483      * requires a swash, and use the Unicode macro otherwise. */
 484
 485     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 486
 487     if (UTF8_IS_INVARIANT(*character)) {
 488         return isFOO_lc(classnum, *character);
 489     }
 490     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 491         return isFOO_lc(classnum,
 492                         TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 493     }
 494
 495     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 496
 497     if (classnum < _FIRST_NON_SWASH_CC) {
 498
 499         /* Initialize the swash unless done already */
 500         if (! PL_utf8_swash_ptrs[classnum]) {
 501             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 502             PL_utf8_swash_ptrs[classnum] =
 503                     _core_swash_init("utf8",
 504                                      "",
 505                                      &PL_sv_undef, 1, 0,
 506                                      PL_XPosix_ptrs[classnum], &flags);
 507         }
 508
 509         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 510                                  character,
 511                                  TRUE /* is UTF */ ));
 512     }
 513
 514     switch ((_char_class_number) classnum) {
 515         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 516         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 517         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 518         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 519         default:                 break;
 520     }
 521
 522     return FALSE; /* Things like CNTRL are always below 256 */
 523 }
 524
 525 /*
 526  * pregexec and friends
 527  */
 528
 529 #ifndef PERL_IN_XSUB_RE
 530 /*
 531  - pregexec - match a regexp against a string
 532  */
 533 I32
 534 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 535          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 536 /* stringarg: the point in the string at which to begin matching */
 537 /* strend:    pointer to null at end of string */
 538 /* strbeg:    real beginning of string */
 539 /* minend:    end of match must be >= minend bytes after stringarg. */
 540 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 541  *            itself is accessed via the pointers above */
 542 /* nosave:    For optimizations. */
 543 {
 544     PERL_ARGS_ASSERT_PREGEXEC;
 545
 546     return
 547         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 548                       nosave ? 0 : REXEC_COPY_STR);
 549 }
 550 #endif
 551
 552
 553
 554 /* re_intuit_start():
 555  *
 556  * Based on some optimiser hints, try to find the earliest position in the
 557  * string where the regex could match.
 558  *
 559  *   rx:     the regex to match against
 560  *   sv:     the SV being matched: only used for utf8 flag; the string
 561  *           itself is accessed via the pointers below. Note that on
 562  *           something like an overloaded SV, SvPOK(sv) may be false
 563  *           and the string pointers may point to something unrelated to
 564  *           the SV itself.
 565  *   strbeg: real beginning of string
 566  *   strpos: the point in the string at which to begin matching
 567  *   strend: pointer to the byte following the last char of the string
 568  *   flags   currently unused; set to 0
 569  *   data:   currently unused; set to NULL
 570  *
 571  * The basic idea of re_intuit_start() is to use some known information
 572  * about the pattern, namely:
 573  *
 574  *   a) the longest known anchored substring (i.e. one that's at a
 575  *      constant offset from the beginning of the pattern; but not
 576  *      necessarily at a fixed offset from the beginning of the
 577  *      string);
 578  *   b) the longest floating substring (i.e. one that's not at a constant
 579  *      offset from the beginning of the pattern);
 580  *   c) Whether the pattern is anchored to the string; either
 581  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 582  *      or anchored to pos(): /\G/;
 583  *   d) A start class: a real or synthetic character class which
 584  *      represents which characters are legal at the start of the pattern;
 585  *
 586  * to either quickly reject the match, or to find the earliest position
 587  * within the string at which the pattern might match, thus avoiding
 588  * running the full NFA engine at those earlier locations, only to
 589  * eventually fail and retry further along.
 590  *
 591  * Returns NULL if the pattern can't match, or returns the address within
 592  * the string which is the earliest place the match could occur.
 593  *
 594  * The longest of the anchored and floating substrings is called 'check'
 595  * and is checked first. The other is called 'other' and is checked
 596  * second. The 'other' substring may not be present.  For example,
 597  *
 598  *    /(abc|xyz)ABC\d{0,3}DEFG/
 599  *
 600  * will have
 601  *
 602  *   check substr (float)    = "DEFG", offset 6..9 chars
 603  *   other substr (anchored) = "ABC",  offset 3..3 chars
 604  *   stclass = [ax]
 605  *
 606  * Be aware that during the course of this function, sometimes 'anchored'
 607  * refers to a substring being anchored relative to the start of the
 608  * pattern, and sometimes to the pattern itself being anchored relative to
 609  * the string. For example:
 610  *
 611  *   /\dabc/:   "abc" is anchored to the pattern;
 612  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 613  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 614  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 615  *                    but the pattern is anchored to the string.
 616  */
 617
 618 char *
 619 Perl_re_intuit_start(pTHX_
 620                     REGEXP * const rx,
 621                     SV *sv,
 622                     const char * const strbeg,
 623                     char *strpos,
 624                     char *strend,
 625                     const U32 flags,
 626                     re_scream_pos_data *data)
 627 {
 628     struct regexp *const prog = ReANY(rx);
 629     SSize_t start_shift = prog->check_offset_min;
 630     /* Should be nonnegative! */
 631     SSize_t end_shift   = 0;
 632     /* current lowest pos in string where the regex can start matching */
 633     char *rx_origin = strpos;
 634     SV *check;
 635     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 636     U8   other_ix = 1 - prog->substrs->check_ix;
 637     bool ml_anch = 0;
 638     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 639     char *check_at = NULL;              /* check substr found at this pos */
 640     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 641     RXi_GET_DECL(prog,progi);
 642     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 643     regmatch_info *const reginfo = &reginfo_buf;
 644     GET_RE_DEBUG_FLAGS_DECL;
 645
 646     PERL_ARGS_ASSERT_RE_INTUIT_START;
 647     PERL_UNUSED_ARG(flags);
 648     PERL_UNUSED_ARG(data);
 649
 650     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 651                 "Intuit: trying to determine minimum start position...\n"));
 652
 653     /* for now, assume that all substr offsets are positive. If at some point
 654      * in the future someone wants to do clever things with look-behind and
 655      * -ve offsets, they'll need to fix up any code in this function
 656      * which uses these offsets. See the thread beginning
 657      * <20140113145929.GF27210@iabyn.com>
 658      */
 659     assert(prog->substrs->data[0].min_offset >= 0);
 660     assert(prog->substrs->data[0].max_offset >= 0);
 661     assert(prog->substrs->data[1].min_offset >= 0);
 662     assert(prog->substrs->data[1].max_offset >= 0);
 663     assert(prog->substrs->data[2].min_offset >= 0);
 664     assert(prog->substrs->data[2].max_offset >= 0);
 665
 666     /* for now, assume that if both present, that the floating substring
 667      * doesn't start before the anchored substring.
 668      * If you break this assumption (e.g. doing better optimisations
 669      * with lookahead/behind), then you'll need to audit the code in this
 670      * function carefully first
 671      */
 672     assert(
 673             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 674               && (prog->float_utf8    || prog->float_substr))
 675            || (prog->float_min_offset >= prog->anchored_offset));
 676
 677     /* byte rather than char calculation for efficiency. It fails
 678      * to quickly reject some cases that can't match, but will reject
 679      * them later after doing full char arithmetic */
 680     if (prog->minlen > strend - strpos) {
 681         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 682                               "  String too short...\n"));
 683         goto fail;
 684     }
 685
 686     RX_MATCH_UTF8_set(rx,utf8_target);
 687     reginfo->is_utf8_target = cBOOL(utf8_target);
 688     reginfo->info_aux = NULL;
 689     reginfo->strbeg = strbeg;
 690     reginfo->strend = strend;
 691     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 692     reginfo->intuit = 1;
 693     /* not actually used within intuit, but zero for safety anyway */
 694     reginfo->poscache_maxiter = 0;
 695
 696     if (utf8_target) {
 697         if (!prog->check_utf8 && prog->check_substr)
 698             to_utf8_substr(prog);
 699         check = prog->check_utf8;
 700     } else {
 701         if (!prog->check_substr && prog->check_utf8) {
 702             if (! to_byte_substr(prog)) {
 703                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 704             }
 705         }
 706         check = prog->check_substr;
 707     }
 708
 709     /* dump the various substring data */
 710     DEBUG_OPTIMISE_MORE_r({
 711         int i;
 712         for (i=0; i<=2; i++) {
 713             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 714                                   : prog->substrs->data[i].substr);
 715             if (!sv)
 716                 continue;
 717
 718             PerlIO_printf(Perl_debug_log,
 719                 "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 720                 " useful=%"IVdf" utf8=%d [%s]\n",
 721                 i,
 722                 (IV)prog->substrs->data[i].min_offset,
 723                 (IV)prog->substrs->data[i].max_offset,
 724                 (IV)prog->substrs->data[i].end_shift,
 725                 BmUSEFUL(sv),
 726                 utf8_target ? 1 : 0,
 727                 SvPEEK(sv));
 728         }
 729     });
 730
 731     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 732
 733         /* ml_anch: check after \n?
 734          *
 735          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 736          * with /.*.../, these flags will have been added by the
 737          * compiler:
 738          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 739          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 740          */
 741         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 742                    && !(prog->intflags & PREGf_IMPLICIT);
 743
 744         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 745             /* we are only allowed to match at BOS or \G */
 746
 747             /* trivially reject if there's a BOS anchor and we're not at BOS.
 748              *
 749              * Note that we don't try to do a similar quick reject for
 750              * \G, since generally the caller will have calculated strpos
 751              * based on pos() and gofs, so the string is already correctly
 752              * anchored by definition; and handling the exceptions would
 753              * be too fiddly (e.g. REXEC_IGNOREPOS).
 754              */
 755             if (   strpos != strbeg
 756                 && (prog->intflags & PREGf_ANCH_SBOL))
 757             {
 758                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 759                                 "  Not at start...\n"));
 760                 goto fail;
 761             }
 762
 763             /* in the presence of an anchor, the anchored (relative to the
 764              * start of the regex) substr must also be anchored relative
 765              * to strpos. So quickly reject if substr isn't found there.
 766              * This works for \G too, because the caller will already have
 767              * subtracted gofs from pos, and gofs is the offset from the
 768              * \G to the start of the regex. For example, in /.abc\Gdef/,
 769              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 770              * caller will have set strpos=pos()-4; we look for the substr
 771              * at position pos()-4+1, which lines up with the "a" */
 772
 773             if (prog->check_offset_min == prog->check_offset_max
 774                 && !(prog->intflags & PREGf_CANY_SEEN))
 775             {
 776                 /* Substring at constant offset from beg-of-str... */
 777                 SSize_t slen = SvCUR(check);
 778                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 779
 780                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 781                     "  Looking for check substr at fixed offset %"IVdf"...\n",
 782                     (IV)prog->check_offset_min));
 783
 784                 if (SvTAIL(check)) {
 785                     /* In this case, the regex is anchored at the end too.
 786                      * Unless it's a multiline match, the lengths must match
 787                      * exactly, give or take a \n.  NB: slen >= 1 since
 788                      * the last char of check is \n */
 789                     if (!multiline
 790                         && (   strend - s > slen
 791                             || strend - s < slen - 1
 792                             || (strend - s == slen && strend[-1] != '\n')))
 793                     {
 794                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 795                                             "  String too long...\n"));
 796                         goto fail_finish;
 797                     }
 798                     /* Now should match s[0..slen-2] */
 799                     slen--;
 800                 }
 801                 if (slen && (*SvPVX_const(check) != *s
 802                     || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 803                 {
 804                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 805                                     "  String not equal...\n"));
 806                     goto fail_finish;
 807                 }
 808
 809                 check_at = s;
 810                 goto success_at_start;
 811             }
 812         }
 813     }
 814
 815     end_shift = prog->check_end_shift;
 816
 817 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 818     if (end_shift < 0)
 819         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 820                    (IV)end_shift, RX_PRECOMP(prog));
 821 #endif
 822
 823   restart:
 824
 825     /* This is the (re)entry point of the main loop in this function.
 826      * The goal of this loop is to:
 827      * 1) find the "check" substring in the region rx_origin..strend
 828      *    (adjusted by start_shift / end_shift). If not found, reject
 829      *    immediately.
 830      * 2) If it exists, look for the "other" substr too if defined; for
 831      *    example, if the check substr maps to the anchored substr, then
 832      *    check the floating substr, and vice-versa. If not found, go
 833      *    back to (1) with rx_origin suitably incremented.
 834      * 3) If we find an rx_origin position that doesn't contradict
 835      *    either of the substrings, then check the possible additional
 836      *    constraints on rx_origin of /^.../m or a known start class.
 837      *    If these fail, then depending on which constraints fail, jump
 838      *    back to here, or to various other re-entry points further along
 839      *    that skip some of the first steps.
 840      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 841      *    substring. If the start position was determined to be at the
 842      *    beginning of the string  - so, not rejected, but not optimised,
 843      *    since we have to run regmatch from position 0 - decrement the
 844      *    BmUSEFUL() count. Otherwise increment it.
 845      */
 846
 847
 848     /* first, look for the 'check' substring */
 849
 850     {
 851         U8* start_point;
 852         U8* end_point;
 853
 854         DEBUG_OPTIMISE_MORE_r({
 855             PerlIO_printf(Perl_debug_log,
 856                 "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 857                 " Start shift: %"IVdf" End shift %"IVdf
 858                 " Real end Shift: %"IVdf"\n",
 859                 (IV)(rx_origin - strpos),
 860                 (IV)prog->check_offset_min,
 861                 (IV)start_shift,
 862                 (IV)end_shift,
 863                 (IV)prog->check_end_shift);
 864         });
 865
 866         if (prog->intflags & PREGf_CANY_SEEN) {
 867             start_point= (U8*)(rx_origin + start_shift);
 868             end_point= (U8*)(strend - end_shift);
 869             if (start_point > end_point)
 870                 goto fail_finish;
 871         } else {
 872             end_point = HOP3(strend, -end_shift, strbeg);
 873             start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 874             if (!start_point)
 875                 goto fail_finish;
 876         }
 877
 878
 879         /* If the regex is absolutely anchored to either the start of the
 880          * string (SBOL) or to pos() (ANCH_GPOS), then
 881          * check_offset_max represents an upper bound on the string where
 882          * the substr could start. For the ANCH_GPOS case, we assume that
 883          * the caller of intuit will have already set strpos to
 884          * pos()-gofs, so in this case strpos + offset_max will still be
 885          * an upper bound on the substr.
 886          */
 887         if (!ml_anch
 888             && prog->intflags & PREGf_ANCH
 889             && prog->check_offset_max != SSize_t_MAX)
 890         {
 891             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 892             const char * const anchor =
 893                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 894
 895             /* do a bytes rather than chars comparison. It's conservative;
 896              * so it skips doing the HOP if the result can't possibly end
 897              * up earlier than the old value of end_point.
 898              */
 899             if ((char*)end_point - anchor > prog->check_offset_max) {
 900                 end_point = HOP3lim((U8*)anchor,
 901                                 prog->check_offset_max,
 902                                 end_point -len)
 903                             + len;
 904             }
 905         }
 906
 907         DEBUG_OPTIMISE_MORE_r({
 908             PerlIO_printf(Perl_debug_log, "  fbm_instr len=%d str=<%.*s>\n",
 909                 (int)(end_point - start_point),
 910                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 911                 start_point);
 912         });
 913
 914         check_at = fbm_instr( start_point, end_point,
 915                       check, multiline ? FBMrf_MULTILINE : 0);
 916
 917         /* Update the count-of-usability, remove useless subpatterns,
 918             unshift s.  */
 919
 920         DEBUG_EXECUTE_r({
 921             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 922                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 923             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 924                               (check_at ? "Found" : "Did not find"),
 925                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 926                     ? "anchored" : "floating"),
 927                 quoted,
 928                 RE_SV_TAIL(check),
 929                 (check_at ? " at offset " : "...\n") );
 930         });
 931
 932         if (!check_at)
 933             goto fail_finish;
 934         /* Finish the diagnostic message */
 935         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
 936
 937         /* set rx_origin to the minimum position where the regex could start
 938          * matching, given the constraint of the just-matched check substring.
 939          * But don't set it lower than previously.
 940          */
 941
 942         if (check_at - rx_origin > prog->check_offset_max)
 943             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 944     }
 945
 946
 947     /* now look for the 'other' substring if defined */
 948
 949     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 950                     : prog->substrs->data[other_ix].substr)
 951     {
 952         /* Take into account the "other" substring. */
 953         char *last, *last1;
 954         char *s;
 955         SV* must;
 956         struct reg_substr_datum *other;
 957
 958       do_other_substr:
 959         other = &prog->substrs->data[other_ix];
 960
 961         /* if "other" is anchored:
 962          * we've previously found a floating substr starting at check_at.
 963          * This means that the regex origin must lie somewhere
 964          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 965          * and max:                 HOP3(check_at, -check_offset_min)
 966          * (except that min will be >= strpos)
 967          * So the fixed  substr must lie somewhere between
 968          *  HOP3(min, anchored_offset)
 969          *  HOP3(max, anchored_offset) + SvCUR(substr)
 970          */
 971
 972         /* if "other" is floating
 973          * Calculate last1, the absolute latest point where the
 974          * floating substr could start in the string, ignoring any
 975          * constraints from the earlier fixed match. It is calculated
 976          * as follows:
 977          *
 978          * strend - prog->minlen (in chars) is the absolute latest
 979          * position within the string where the origin of the regex
 980          * could appear. The latest start point for the floating
 981          * substr is float_min_offset(*) on from the start of the
 982          * regex.  last1 simply combines thee two offsets.
 983          *
 984          * (*) You might think the latest start point should be
 985          * float_max_offset from the regex origin, and technically
 986          * you'd be correct. However, consider
 987          *    /a\d{2,4}bcd\w/
 988          * Here, float min, max are 3,5 and minlen is 7.
 989          * This can match either
 990          *    /a\d\dbcd\w/
 991          *    /a\d\d\dbcd\w/
 992          *    /a\d\d\d\dbcd\w/
 993          * In the first case, the regex matches minlen chars; in the
 994          * second, minlen+1, in the third, minlen+2.
 995          * In the first case, the floating offset is 3 (which equals
 996          * float_min), in the second, 4, and in the third, 5 (which
 997          * equals float_max). In all cases, the floating string bcd
 998          * can never start more than 4 chars from the end of the
 999          * string, which equals minlen - float_min. As the substring
1000          * starts to match more than float_min from the start of the
1001          * regex, it makes the regex match more than minlen chars,
1002          * and the two cancel each other out. So we can always use
1003          * float_min - minlen, rather than float_max - minlen for the
1004          * latest position in the string.
1005          *
1006          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1007          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1008          */
1009
1010         assert(prog->minlen >= other->min_offset);
1011         last1 = HOP3c(strend,
1012                         other->min_offset - prog->minlen, strbeg);
1013
1014         if (other_ix) {/* i.e. if (other-is-float) */
1015             /* last is the latest point where the floating substr could
1016              * start, *given* any constraints from the earlier fixed
1017              * match. This constraint is that the floating string starts
1018              * <= float_max_offset chars from the regex origin (rx_origin).
1019              * If this value is less than last1, use it instead.
1020              */
1021             assert(rx_origin <= last1);
1022             last =
1023                 /* this condition handles the offset==infinity case, and
1024                  * is a short-cut otherwise. Although it's comparing a
1025                  * byte offset to a char length, it does so in a safe way,
1026                  * since 1 char always occupies 1 or more bytes,
1027                  * so if a string range is  (last1 - rx_origin) bytes,
1028                  * it will be less than or equal to  (last1 - rx_origin)
1029                  * chars; meaning it errs towards doing the accurate HOP3
1030                  * rather than just using last1 as a short-cut */
1031                 (last1 - rx_origin) < other->max_offset
1032                     ? last1
1033                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1034         }
1035         else {
1036             assert(strpos + start_shift <= check_at);
1037             last = HOP4c(check_at, other->min_offset - start_shift,
1038                         strbeg, strend);
1039         }
1040
1041         s = HOP3c(rx_origin, other->min_offset, strend);
1042         if (s < other_last)     /* These positions already checked */
1043             s = other_last;
1044
1045         must = utf8_target ? other->utf8_substr : other->substr;
1046         assert(SvPOK(must));
1047         s = fbm_instr(
1048             (unsigned char*)s,
1049             (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1050             must,
1051             multiline ? FBMrf_MULTILINE : 0
1052         );
1053         DEBUG_EXECUTE_r({
1054             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1055                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1056             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1057                 s ? "Found" : "Contradicts",
1058                 other_ix ? "floating" : "anchored",
1059                 quoted, RE_SV_TAIL(must));
1060         });
1061
1062
1063         if (!s) {
1064             /* last1 is latest possible substr location. If we didn't
1065              * find it before there, we never will */
1066             if (last >= last1) {
1067                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1068                                         ", giving up...\n"));
1069                 goto fail_finish;
1070             }
1071
1072             /* try to find the check substr again at a later
1073              * position. Maybe next time we'll find the "other" substr
1074              * in range too */
1075             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1076                 ", trying %s at offset %ld...\n",
1077                 (other_ix ? "floating" : "anchored"),
1078                 (long)(HOP3c(check_at, 1, strend) - strpos)));
1079
1080             other_last = HOP3c(last, 1, strend) /* highest failure */;
1081             rx_origin =
1082                 other_ix /* i.e. if other-is-float */
1083                     ? HOP3c(rx_origin, 1, strend)
1084                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1085             goto restart;
1086         }
1087         else {
1088             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1089                   (long)(s - strpos)));
1090
1091             if (other_ix) { /* if (other-is-float) */
1092                 /* other_last is set to s, not s+1, since its possible for
1093                  * a floating substr to fail first time, then succeed
1094                  * second time at the same floating position; e.g.:
1095                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1096                  * The first time round, anchored and float match at
1097                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1098                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1099                  */
1100                 other_last = s;
1101             }
1102             else {
1103                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1104                 other_last = HOP3c(s, 1, strend);
1105             }
1106         }
1107     }
1108     else {
1109         DEBUG_OPTIMISE_MORE_r(
1110             PerlIO_printf(Perl_debug_log,
1111                 "  Check-only match: offset min:%"IVdf" max:%"IVdf
1112                 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1113                 " strend-strpos:%"IVdf"\n",
1114                 (IV)prog->check_offset_min,
1115                 (IV)prog->check_offset_max,
1116                 (IV)(check_at-strpos),
1117                 (IV)(rx_origin-strpos),
1118                 (IV)(rx_origin-check_at),
1119                 (IV)(strend-strpos)
1120             )
1121         );
1122     }
1123
1124   postprocess_substr_matches:
1125
1126     /* handle the extra constraint of /^.../m if present */
1127
1128     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1129         char *s;
1130
1131         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1132                         "  looking for /^/m anchor"));
1133
1134         /* we have failed the constraint of a \n before rx_origin.
1135          * Find the next \n, if any, even if it's beyond the current
1136          * anchored and/or floating substrings. Whether we should be
1137          * scanning ahead for the next \n or the next substr is debatable.
1138          * On the one hand you'd expect rare substrings to appear less
1139          * often than \n's. On the other hand, searching for \n means
1140          * we're effectively flipping been check_substr and "\n" on each
1141          * iteration as the current "rarest" string candidate, which
1142          * means for example that we'll quickly reject the whole string if
1143          * hasn't got a \n, rather than trying every substr position
1144          * first
1145          */
1146
1147         s = HOP3c(strend, - prog->minlen, strpos);
1148         if (s <= rx_origin ||
1149             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1150         {
1151             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1152                             "  Did not find /%s^%s/m...\n",
1153                             PL_colors[0], PL_colors[1]));
1154             goto fail_finish;
1155         }
1156
1157         /* earliest possible origin is 1 char after the \n.
1158          * (since *rx_origin == '\n', it's safe to ++ here rather than
1159          * HOP(rx_origin, 1)) */
1160         rx_origin++;
1161
1162         if (prog->substrs->check_ix == 0  /* check is anchored */
1163             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1164         {
1165             /* Position contradicts check-string; either because
1166              * check was anchored (and thus has no wiggle room),
1167              * or check was float and rx_origin is above the float range */
1168             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1169                 "  Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1170                 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1171             goto restart;
1172         }
1173
1174         /* if we get here, the check substr must have been float,
1175          * is in range, and we may or may not have had an anchored
1176          * "other" substr which still contradicts */
1177         assert(prog->substrs->check_ix); /* check is float */
1178
1179         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1180             /* whoops, the anchored "other" substr exists, so we still
1181              * contradict. On the other hand, the float "check" substr
1182              * didn't contradict, so just retry the anchored "other"
1183              * substr */
1184             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1185                 "  Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1186                 PL_colors[0], PL_colors[1],
1187                 (long)(rx_origin - strpos),
1188                 (long)(rx_origin - strpos + prog->anchored_offset)));
1189             goto do_other_substr;
1190         }
1191
1192         /* success: we don't contradict the found floating substring
1193          * (and there's no anchored substr). */
1194         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1195             "  Found /%s^%s/m at offset %ld...\n",
1196             PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1197     }
1198     else {
1199         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1200             "  (multiline anchor test skipped)\n"));
1201     }
1202
1203   success_at_start:
1204
1205
1206     /* if we have a starting character class, then test that extra constraint.
1207      * (trie stclasses are too expensive to use here, we are better off to
1208      * leave it to regmatch itself) */
1209
1210     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1211         const U8* const str = (U8*)STRING(progi->regstclass);
1212
1213         /* XXX this value could be pre-computed */
1214         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1215                     ?  (reginfo->is_utf8_pat
1216                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1217                         : STR_LEN(progi->regstclass))
1218                     : 1);
1219         char * endpos;
1220         char *s;
1221         /* latest pos that a matching float substr constrains rx start to */
1222         char *rx_max_float = NULL;
1223
1224         /* if the current rx_origin is anchored, either by satisfying an
1225          * anchored substring constraint, or a /^.../m constraint, then we
1226          * can reject the current origin if the start class isn't found
1227          * at the current position. If we have a float-only match, then
1228          * rx_origin is constrained to a range; so look for the start class
1229          * in that range. if neither, then look for the start class in the
1230          * whole rest of the string */
1231
1232         /* XXX DAPM it's not clear what the minlen test is for, and why
1233          * it's not used in the floating case. Nothing in the test suite
1234          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1235          * Here are some old comments, which may or may not be correct:
1236          *
1237          *   minlen == 0 is possible if regstclass is \b or \B,
1238          *   and the fixed substr is ''$.
1239          *   Since minlen is already taken into account, rx_origin+1 is
1240          *   before strend; accidentally, minlen >= 1 guaranties no false
1241          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1242          *   0) below assumes that regstclass does not come from lookahead...
1243          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1244          *   This leaves EXACTF-ish only, which are dealt with in
1245          *   find_byclass().
1246          */
1247
1248         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1249             endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1250         else if (prog->float_substr || prog->float_utf8) {
1251             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1252             endpos= HOP3c(rx_max_float, cl_l, strend);
1253         }
1254         else
1255             endpos= strend;
1256
1257         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1258             "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1259             " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1260               (IV)start_shift, (IV)(check_at - strbeg),
1261               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1262
1263         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1264                             reginfo);
1265         if (!s) {
1266             if (endpos == strend) {
1267                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1268                                 "  Could not match STCLASS...\n") );
1269                 goto fail;
1270             }
1271             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1272                                "  This position contradicts STCLASS...\n") );
1273             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1274                         && !(prog->intflags & PREGf_IMPLICIT))
1275                 goto fail;
1276
1277             /* Contradict one of substrings */
1278             if (prog->anchored_substr || prog->anchored_utf8) {
1279                 if (prog->substrs->check_ix == 1) { /* check is float */
1280                     /* Have both, check_string is floating */
1281                     assert(rx_origin + start_shift <= check_at);
1282                     if (rx_origin + start_shift != check_at) {
1283                         /* not at latest position float substr could match:
1284                          * Recheck anchored substring, but not floating.
1285                          * The condition above is in bytes rather than
1286                          * chars for efficiency. It's conservative, in
1287                          * that it errs on the side of doing 'goto
1288                          * do_other_substr', where a more accurate
1289                          * char-based calculation will be done */
1290                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1291                                   "  Looking for anchored substr starting at offset %ld...\n",
1292                                   (long)(other_last - strpos)) );
1293                         goto do_other_substr;
1294                     }
1295                 }
1296             }
1297             else {
1298                 /* float-only */
1299
1300                 if (ml_anch) {
1301                     /* In the presence of ml_anch, we might be able to
1302                      * find another \n without breaking the current float
1303                      * constraint. */
1304
1305                     /* strictly speaking this should be HOP3c(..., 1, ...),
1306                      * but since we goto a block of code that's going to
1307                      * search for the next \n if any, its safe here */
1308                     rx_origin++;
1309                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1310                               "  Looking for /%s^%s/m starting at offset %ld...\n",
1311                               PL_colors[0], PL_colors[1],
1312                               (long)(rx_origin - strpos)) );
1313                     goto postprocess_substr_matches;
1314                 }
1315
1316                 /* strictly speaking this can never be true; but might
1317                  * be if we ever allow intuit without substrings */
1318                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1319                     goto fail;
1320
1321                 rx_origin = rx_max_float;
1322             }
1323
1324             /* at this point, any matching substrings have been
1325              * contradicted. Start again... */
1326
1327             rx_origin = HOP3c(rx_origin, 1, strend);
1328
1329             /* uses bytes rather than char calculations for efficiency.
1330              * It's conservative: it errs on the side of doing 'goto restart',
1331              * where there is code that does a proper char-based test */
1332             if (rx_origin + start_shift + end_shift > strend) {
1333                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1334                                        "  Could not match STCLASS...\n") );
1335                 goto fail;
1336             }
1337             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1338                 "  Looking for %s substr starting at offset %ld...\n",
1339                 (prog->substrs->check_ix ? "floating" : "anchored"),
1340                 (long)(rx_origin + start_shift - strpos)) );
1341             goto restart;
1342         }
1343
1344         /* Success !!! */
1345
1346         if (rx_origin != s) {
1347             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1348                         "  By STCLASS: moving %ld --> %ld\n",
1349                                   (long)(rx_origin - strpos), (long)(s - strpos))
1350                    );
1351         }
1352         else {
1353             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1354                                   "  Does not contradict STCLASS...\n");
1355                    );
1356         }
1357     }
1358
1359     /* Decide whether using the substrings helped */
1360
1361     if (rx_origin != strpos) {
1362         /* Fixed substring is found far enough so that the match
1363            cannot start at strpos. */
1364
1365         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1366         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1367     }
1368     else {
1369         /* The found rx_origin position does not prohibit matching at
1370          * strpos, so calling intuit didn't gain us anything. Decrement
1371          * the BmUSEFUL() count on the check substring, and if we reach
1372          * zero, free it.  */
1373         if (!(prog->intflags & PREGf_NAUGHTY)
1374             && (utf8_target ? (
1375                 prog->check_utf8                /* Could be deleted already */
1376                 && --BmUSEFUL(prog->check_utf8) < 0
1377                 && (prog->check_utf8 == prog->float_utf8)
1378             ) : (
1379                 prog->check_substr              /* Could be deleted already */
1380                 && --BmUSEFUL(prog->check_substr) < 0
1381                 && (prog->check_substr == prog->float_substr)
1382             )))
1383         {
1384             /* If flags & SOMETHING - do not do it many times on the same match */
1385             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1386             /* XXX Does the destruction order has to change with utf8_target? */
1387             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1388             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1389             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1390             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1391             check = NULL;                       /* abort */
1392             /* XXXX This is a remnant of the old implementation.  It
1393                     looks wasteful, since now INTUIT can use many
1394                     other heuristics. */
1395             prog->extflags &= ~RXf_USE_INTUIT;
1396         }
1397     }
1398
1399     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1400             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1401              PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1402
1403     return rx_origin;
1404
1405   fail_finish:                          /* Substring not found */
1406     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1407         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1408   fail:
1409     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1410                           PL_colors[4], PL_colors[5]));
1411     return NULL;
1412 }
1413
1414
1415 #define DECL_TRIE_TYPE(scan) \
1416     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1417                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1418                  trie_utf8l, trie_flu8 }                                            \
1419                     trie_type = ((scan->flags == EXACT)                             \
1420                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1421                                  : (scan->flags == EXACTL)                          \
1422                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1423                                     : (scan->flags == EXACTFA)                      \
1424                                       ? (utf8_target                                \
1425                                          ? trie_utf8_exactfa_fold                   \
1426                                          : trie_latin_utf8_exactfa_fold)            \
1427                                       : (scan->flags == EXACTFLU8                   \
1428                                          ? trie_flu8                                \
1429                                          : (utf8_target                             \
1430                                            ? trie_utf8_fold                         \
1431                                            :   trie_latin_utf8_fold)))
1432
1433 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1434 STMT_START {                                                                        \
1435     STRLEN skiplen;                                                                 \
1436     U8 flags = FOLD_FLAGS_FULL;                                                     \
1437     switch (trie_type) {                                                            \
1438     case trie_flu8:                                                                 \
1439         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1440         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1441             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1442         }                                                                           \
1443         goto do_trie_utf8_fold;                                                     \
1444     case trie_utf8_exactfa_fold:                                                    \
1445         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1446         /* FALLTHROUGH */                                                           \
1447     case trie_utf8_fold:                                                            \
1448       do_trie_utf8_fold:                                                            \
1449         if ( foldlen>0 ) {                                                          \
1450             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1451             foldlen -= len;                                                         \
1452             uscan += len;                                                           \
1453             len=0;                                                                  \
1454         } else {                                                                    \
1455             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1456             len = UTF8SKIP(uc);                                                     \
1457             skiplen = UNISKIP( uvc );                                               \
1458             foldlen -= skiplen;                                                     \
1459             uscan = foldbuf + skiplen;                                              \
1460         }                                                                           \
1461         break;                                                                      \
1462     case trie_latin_utf8_exactfa_fold:                                              \
1463         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1464         /* FALLTHROUGH */                                                           \
1465     case trie_latin_utf8_fold:                                                      \
1466         if ( foldlen>0 ) {                                                          \
1467             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1468             foldlen -= len;                                                         \
1469             uscan += len;                                                           \
1470             len=0;                                                                  \
1471         } else {                                                                    \
1472             len = 1;                                                                \
1473             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1474             skiplen = UNISKIP( uvc );                                               \
1475             foldlen -= skiplen;                                                     \
1476             uscan = foldbuf + skiplen;                                              \
1477         }                                                                           \
1478         break;                                                                      \
1479     case trie_utf8l:                                                                \
1480         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1481         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1482             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1483         }                                                                           \
1484         /* FALLTHROUGH */                                                           \
1485     case trie_utf8:                                                                 \
1486         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1487         break;                                                                      \
1488     case trie_plain:                                                                \
1489         uvc = (UV)*uc;                                                              \
1490         len = 1;                                                                    \
1491     }                                                                               \
1492     if (uvc < 256) {                                                                \
1493         charid = trie->charmap[ uvc ];                                              \
1494     }                                                                               \
1495     else {                                                                          \
1496         charid = 0;                                                                 \
1497         if (widecharmap) {                                                          \
1498             SV** const svpp = hv_fetch(widecharmap,                                 \
1499                         (char*)&uvc, sizeof(UV), 0);                                \
1500             if (svpp)                                                               \
1501                 charid = (U16)SvIV(*svpp);                                          \
1502         }                                                                           \
1503     }                                                                               \
1504 } STMT_END
1505
1506 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1507     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1508                 startpos, doutf8)
1509
1510 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1511 STMT_START {                                              \
1512     while (s <= e) {                                      \
1513         if ( (COND)                                       \
1514              && (ln == 1 || folder(s, pat_string, ln))    \
1515              && (reginfo->intuit || regtry(reginfo, &s)) )\
1516             goto got_it;                                  \
1517         s++;                                              \
1518     }                                                     \
1519 } STMT_END
1520
1521 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1522 STMT_START {                                          \
1523     while (s < strend) {                              \
1524         CODE                                          \
1525         s += UTF8SKIP(s);                             \
1526     }                                                 \
1527 } STMT_END
1528
1529 #define REXEC_FBC_SCAN(CODE)                          \
1530 STMT_START {                                          \
1531     while (s < strend) {                              \
1532         CODE                                          \
1533         s++;                                          \
1534     }                                                 \
1535 } STMT_END
1536
1537 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1538 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1539     if (COND) {                                                \
1540         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1541             goto got_it;                                       \
1542         else                                                   \
1543             tmp = doevery;                                     \
1544     }                                                          \
1545     else                                                       \
1546         tmp = 1;                                               \
1547 )
1548
1549 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1550 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1551     if (COND) {                                                \
1552         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1553             goto got_it;                                       \
1554         else                                                   \
1555             tmp = doevery;                                     \
1556     }                                                          \
1557     else                                                       \
1558         tmp = 1;                                               \
1559 )
1560
1561 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1562     if (utf8_target) {                                         \
1563         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1564     }                                                          \
1565     else {                                                     \
1566         REXEC_FBC_CLASS_SCAN(COND);                            \
1567     }
1568
1569 /* The three macros below are slightly different versions of the same logic.
1570  *
1571  * The first is for /a and /aa when the target string is UTF-8.  This can only
1572  * match ascii, but it must advance based on UTF-8.   The other two handle the
1573  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1574  * for the boundary (or non-boundary) between a word and non-word character.
1575  * The utf8 and non-utf8 cases have the same logic, but the details must be
1576  * different.  Find the "wordness" of the character just prior to this one, and
1577  * compare it with the wordness of this one.  If they differ, we have a
1578  * boundary.  At the beginning of the string, pretend that the previous
1579  * character was a new-line.
1580  *
1581  * All these macros uncleanly have side-effects with each other and outside
1582  * variables.  So far it's been too much trouble to clean-up
1583  *
1584  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1585  *               a word character or not.
1586  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1587  *               word/non-word
1588  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1589  *
1590  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1591  * are looking for a boundary or for a non-boundary.  If we are looking for a
1592  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1593  * see if this tentative match actually works, and if so, to quit the loop
1594  * here.  And vice-versa if we are looking for a non-boundary.
1595  *
1596  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1597  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1598  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1599  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1600  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1601  * complement.  But in that branch we complement tmp, meaning that at the
1602  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1603  * which means at the top of the loop in the next iteration, it is
1604  * TEST_NON_UTF8(s-1) */
1605 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1606     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1607     tmp = TEST_NON_UTF8(tmp);                                                  \
1608     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1609         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1610             tmp = !tmp;                                                        \
1611             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1612         }                                                                      \
1613         else {                                                                 \
1614             IF_FAIL;                                                           \
1615         }                                                                      \
1616     );                                                                         \
1617
1618 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1619  * TEST_UTF8 is a macro that for the same input code points returns identically
1620  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1621 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1622     if (s == reginfo->strbeg) {                                                \
1623         tmp = '\n';                                                            \
1624     }                                                                          \
1625     else { /* Back-up to the start of the previous character */                \
1626         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1627         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1628                                                        0, UTF8_ALLOW_DEFAULT); \
1629     }                                                                          \
1630     tmp = TEST_UV(tmp);                                                        \
1631     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1632     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1633         if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1634             tmp = !tmp;                                                        \
1635             IF_SUCCESS;                                                        \
1636         }                                                                      \
1637         else {                                                                 \
1638             IF_FAIL;                                                           \
1639         }                                                                      \
1640     );
1641
1642 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1643  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1644  * macros below */
1645 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1646     if (utf8_target) {                                                         \
1647         UTF8_CODE                                                              \
1648     }                                                                          \
1649     else {  /* Not utf8 */                                                     \
1650         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1651         tmp = TEST_NON_UTF8(tmp);                                              \
1652         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1653             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1654                 IF_SUCCESS;                                                    \
1655                 tmp = !tmp;                                                    \
1656             }                                                                  \
1657             else {                                                             \
1658                 IF_FAIL;                                                       \
1659             }                                                                  \
1660         );                                                                     \
1661     }                                                                          \
1662     /* Here, things have been set up by the previous code so that tmp is the   \
1663      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1664      * utf8ness of the target).  We also have to check if this matches against \
1665      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1666      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1667      * string */                                                               \
1668     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1669         IF_SUCCESS;                                                            \
1670     }                                                                          \
1671     else {                                                                     \
1672         IF_FAIL;                                                               \
1673     }
1674
1675 /* This is the macro to use when we want to see if something that looks like it
1676  * could match, actually does, and if so exits the loop */
1677 #define REXEC_FBC_TRYIT                            \
1678     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1679         goto got_it
1680
1681 /* The only difference between the BOUND and NBOUND cases is that
1682  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1683  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1684  * with the other one being empty (PLACEHOLDER is defined as empty).
1685  *
1686  * The TEST_FOO parameters are for operating on different forms of input, but
1687  * all should be ones that return identically for the same underlying code
1688  * points */
1689 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1690     FBC_BOUND_COMMON(                                                          \
1691           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1692           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1693
1694 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1695     FBC_BOUND_COMMON(                                                          \
1696             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1697             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1698
1699 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1700     FBC_BOUND_COMMON(                                                          \
1701           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1702           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1703
1704 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1705     FBC_BOUND_COMMON(                                                          \
1706             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1707             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1708
1709 /* Takes a pointer to an inversion list, a pointer to its corresponding
1710  * inversion map, and a code point, and returns the code point's value
1711  * according to the two arrays.  It assumes that all code points have a value.
1712  * This is used as the base macro for macros for particular properties */
1713 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1714                              invmap[_invlist_search(invlist, cp)]
1715
1716 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1717  * of a code point, returning the value for the first code point in the string.
1718  * And it takes the particular macro name that finds the desired value given a
1719  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1720 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1721              (__ASSERT_(pos < strend)                                          \
1722                  /* Note assumes is valid UTF-8 */                             \
1723              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1724
1725 /* Returns the GCB value for the input code point */
1726 #define getGCB_VAL_CP(cp)                                                      \
1727           _generic_GET_BREAK_VAL_CP(                                           \
1728                                     PL_GCB_invlist,                            \
1729                                     Grapheme_Cluster_Break_invmap,             \
1730                                     (cp))
1731
1732 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1733  * bounded by pos and strend */
1734 #define getGCB_VAL_UTF8(pos, strend)                                           \
1735     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1736
1737
1738 /* Returns the SB value for the input code point */
1739 #define getSB_VAL_CP(cp)                                                       \
1740           _generic_GET_BREAK_VAL_CP(                                           \
1741                                     PL_SB_invlist,                             \
1742                                     Sentence_Break_invmap,                     \
1743                                     (cp))
1744
1745 /* Returns the SB value for the first code point in the UTF-8 encoded string
1746  * bounded by pos and strend */
1747 #define getSB_VAL_UTF8(pos, strend)                                            \
1748     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1749
1750 /* Returns the WB value for the input code point */
1751 #define getWB_VAL_CP(cp)                                                       \
1752           _generic_GET_BREAK_VAL_CP(                                           \
1753                                     PL_WB_invlist,                             \
1754                                     Word_Break_invmap,                         \
1755                                     (cp))
1756
1757 /* Returns the WB value for the first code point in the UTF-8 encoded string
1758  * bounded by pos and strend */
1759 #define getWB_VAL_UTF8(pos, strend)                                            \
1760     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1761
1762 /* We know what class REx starts with.  Try to find this position... */
1763 /* if reginfo->intuit, its a dryrun */
1764 /* annoyingly all the vars in this routine have different names from their counterparts
1765    in regmatch. /grrr */
1766 STATIC char *
1767 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1768     const char *strend, regmatch_info *reginfo)
1769 {
1770     dVAR;
1771     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1772     char *pat_string;   /* The pattern's exactish string */
1773     char *pat_end;          /* ptr to end char of pat_string */
1774     re_fold_t folder;   /* Function for computing non-utf8 folds */
1775     const U8 *fold_array;   /* array for folding ords < 256 */
1776     STRLEN ln;
1777     STRLEN lnc;
1778     U8 c1;
1779     U8 c2;
1780     char *e;
1781     I32 tmp = 1;        /* Scratch variable? */
1782     const bool utf8_target = reginfo->is_utf8_target;
1783     UV utf8_fold_flags = 0;
1784     const bool is_utf8_pat = reginfo->is_utf8_pat;
1785     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1786                                    with a result inverts that result, as 0^1 =
1787                                    1 and 1^1 = 0 */
1788     _char_class_number classnum;
1789
1790     RXi_GET_DECL(prog,progi);
1791
1792     PERL_ARGS_ASSERT_FIND_BYCLASS;
1793
1794     /* We know what class it must start with. */
1795     switch (OP(c)) {
1796     case ANYOFL:
1797         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1798         /* FALLTHROUGH */
1799     case ANYOF:
1800         if (utf8_target) {
1801             REXEC_FBC_UTF8_CLASS_SCAN(
1802                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1803         }
1804         else {
1805             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1806         }
1807         break;
1808     case CANY:
1809         REXEC_FBC_SCAN(
1810             if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1811                 goto got_it;
1812             else
1813                 tmp = doevery;
1814         );
1815         break;
1816
1817     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1818         assert(! is_utf8_pat);
1819         /* FALLTHROUGH */
1820     case EXACTFA:
1821         if (is_utf8_pat || utf8_target) {
1822             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1823             goto do_exactf_utf8;
1824         }
1825         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1826         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1827         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1828
1829     case EXACTF:   /* This node only generated for non-utf8 patterns */
1830         assert(! is_utf8_pat);
1831         if (utf8_target) {
1832             utf8_fold_flags = 0;
1833             goto do_exactf_utf8;
1834         }
1835         fold_array = PL_fold;
1836         folder = foldEQ;
1837         goto do_exactf_non_utf8;
1838
1839     case EXACTFL:
1840         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1841         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1842             utf8_fold_flags = FOLDEQ_LOCALE;
1843             goto do_exactf_utf8;
1844         }
1845         fold_array = PL_fold_locale;
1846         folder = foldEQ_locale;
1847         goto do_exactf_non_utf8;
1848
1849     case EXACTFU_SS:
1850         if (is_utf8_pat) {
1851             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1852         }
1853         goto do_exactf_utf8;
1854
1855     case EXACTFLU8:
1856             if (! utf8_target) {    /* All code points in this node require
1857                                        UTF-8 to express.  */
1858                 break;
1859             }
1860             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1861                                              | FOLDEQ_S2_FOLDS_SANE;
1862             goto do_exactf_utf8;
1863
1864     case EXACTFU:
1865         if (is_utf8_pat || utf8_target) {
1866             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1867             goto do_exactf_utf8;
1868         }
1869
1870         /* Any 'ss' in the pattern should have been replaced by regcomp,
1871          * so we don't have to worry here about this single special case
1872          * in the Latin1 range */
1873         fold_array = PL_fold_latin1;
1874         folder = foldEQ_latin1;
1875
1876         /* FALLTHROUGH */
1877
1878       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1879                            are no glitches with fold-length differences
1880                            between the target string and pattern */
1881
1882         /* The idea in the non-utf8 EXACTF* cases is to first find the
1883          * first character of the EXACTF* node and then, if necessary,
1884          * case-insensitively compare the full text of the node.  c1 is the
1885          * first character.  c2 is its fold.  This logic will not work for
1886          * Unicode semantics and the german sharp ss, which hence should
1887          * not be compiled into a node that gets here. */
1888         pat_string = STRING(c);
1889         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1890
1891         /* We know that we have to match at least 'ln' bytes (which is the
1892          * same as characters, since not utf8).  If we have to match 3
1893          * characters, and there are only 2 availabe, we know without
1894          * trying that it will fail; so don't start a match past the
1895          * required minimum number from the far end */
1896         e = HOP3c(strend, -((SSize_t)ln), s);
1897
1898         if (reginfo->intuit && e < s) {
1899             e = s;                      /* Due to minlen logic of intuit() */
1900         }
1901
1902         c1 = *pat_string;
1903         c2 = fold_array[c1];
1904         if (c1 == c2) { /* If char and fold are the same */
1905             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1906         }
1907         else {
1908             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1909         }
1910         break;
1911
1912       do_exactf_utf8:
1913       {
1914         unsigned expansion;
1915
1916         /* If one of the operands is in utf8, we can't use the simpler folding
1917          * above, due to the fact that many different characters can have the
1918          * same fold, or portion of a fold, or different- length fold */
1919         pat_string = STRING(c);
1920         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1921         pat_end = pat_string + ln;
1922         lnc = is_utf8_pat       /* length to match in characters */
1923                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1924                 : ln;
1925
1926         /* We have 'lnc' characters to match in the pattern, but because of
1927          * multi-character folding, each character in the target can match
1928          * up to 3 characters (Unicode guarantees it will never exceed
1929          * this) if it is utf8-encoded; and up to 2 if not (based on the
1930          * fact that the Latin 1 folds are already determined, and the
1931          * only multi-char fold in that range is the sharp-s folding to
1932          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1933          * string character.  Adjust lnc accordingly, rounding up, so that
1934          * if we need to match at least 4+1/3 chars, that really is 5. */
1935         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1936         lnc = (lnc + expansion - 1) / expansion;
1937
1938         /* As in the non-UTF8 case, if we have to match 3 characters, and
1939          * only 2 are left, it's guaranteed to fail, so don't start a
1940          * match that would require us to go beyond the end of the string
1941          */
1942         e = HOP3c(strend, -((SSize_t)lnc), s);
1943
1944         if (reginfo->intuit && e < s) {
1945             e = s;                      /* Due to minlen logic of intuit() */
1946         }
1947
1948         /* XXX Note that we could recalculate e to stop the loop earlier,
1949          * as the worst case expansion above will rarely be met, and as we
1950          * go along we would usually find that e moves further to the left.
1951          * This would happen only after we reached the point in the loop
1952          * where if there were no expansion we should fail.  Unclear if
1953          * worth the expense */
1954
1955         while (s <= e) {
1956             char *my_strend= (char *)strend;
1957             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1958                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1959                 && (reginfo->intuit || regtry(reginfo, &s)) )
1960             {
1961                 goto got_it;
1962             }
1963             s += (utf8_target) ? UTF8SKIP(s) : 1;
1964         }
1965         break;
1966     }
1967
1968     case BOUNDL:
1969         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1970         if (FLAGS(c) != TRADITIONAL_BOUND) {
1971             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
1972                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
1973             goto do_boundu;
1974         }
1975
1976         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1977         break;
1978
1979     case NBOUNDL:
1980         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1981         if (FLAGS(c) != TRADITIONAL_BOUND) {
1982             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
1983                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
1984             goto do_nboundu;
1985         }
1986
1987         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1988         break;
1989
1990     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
1991                    meaning */
1992         assert(FLAGS(c) == TRADITIONAL_BOUND);
1993
1994         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1995         break;
1996
1997     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
1998                    meaning */
1999         assert(FLAGS(c) == TRADITIONAL_BOUND);
2000
2001         FBC_BOUND_A(isWORDCHAR_A);
2002         break;
2003
2004     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2005                    meaning */
2006         assert(FLAGS(c) == TRADITIONAL_BOUND);
2007
2008         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2009         break;
2010
2011     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2012                    meaning */
2013         assert(FLAGS(c) == TRADITIONAL_BOUND);
2014
2015         FBC_NBOUND_A(isWORDCHAR_A);
2016         break;
2017
2018     case NBOUNDU:
2019         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2020             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2021             break;
2022         }
2023
2024       do_nboundu:
2025
2026         to_complement = 1;
2027         /* FALLTHROUGH */
2028
2029     case BOUNDU:
2030       do_boundu:
2031         switch((bound_type) FLAGS(c)) {
2032             case TRADITIONAL_BOUND:
2033                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2034                 break;
2035             case GCB_BOUND:
2036                 if (s == reginfo->strbeg) { /* GCB always matches at begin and
2037                                                end */
2038                     if (to_complement ^ cBOOL(reginfo->intuit
2039                                                       || regtry(reginfo, &s)))
2040                     {
2041                         goto got_it;
2042                     }
2043                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2044                 }
2045
2046                 if (utf8_target) {
2047                     PL_GCB_enum before = getGCB_VAL_UTF8(
2048                                                reghop3((U8*)s, -1,
2049                                                        (U8*)(reginfo->strbeg)),
2050                                                (U8*) reginfo->strend);
2051                     while (s < strend) {
2052                         PL_GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2053                                                         (U8*) reginfo->strend);
2054                         if (to_complement ^ isGCB(before, after)) {
2055                             if (reginfo->intuit || regtry(reginfo, &s)) {
2056                                 goto got_it;
2057                             }
2058                             before = after;
2059                         }
2060                         s += UTF8SKIP(s);
2061                     }
2062                 }
2063                 else {  /* Not utf8.  Everything is a GCB except between CR and
2064                            LF */
2065                     while (s < strend) {
2066                         if (to_complement ^ (UCHARAT(s - 1) != '\r'
2067                                              || UCHARAT(s) != '\n'))
2068                         {
2069                             if (reginfo->intuit || regtry(reginfo, &s)) {
2070                                 goto got_it;
2071                             }
2072                             s++;
2073                         }
2074                     }
2075                 }
2076
2077                 if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
2078                     goto got_it;
2079                 }
2080                 break;
2081
2082             case SB_BOUND:
2083                 if (s == reginfo->strbeg) { /* SB always matches at beginning */
2084                     if (to_complement
2085                                 ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
2086                     {
2087                         goto got_it;
2088                     }
2089
2090                     /* Didn't match.  Go try at the next position */
2091                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2092                 }
2093
2094                 if (utf8_target) {
2095                     PL_SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2096                                                         -1,
2097                                                         (U8*)(reginfo->strbeg)),
2098                                                       (U8*) reginfo->strend);
2099                     while (s < strend) {
2100                         PL_SB_enum after = getSB_VAL_UTF8((U8*) s,
2101                                                          (U8*) reginfo->strend);
2102                         if (to_complement ^ isSB(before,
2103                                                  after,
2104                                                  (U8*) reginfo->strbeg,
2105                                                  (U8*) s,
2106                                                  (U8*) reginfo->strend,
2107                                                  utf8_target))
2108                         {
2109                             if (reginfo->intuit || regtry(reginfo, &s)) {
2110                                 goto got_it;
2111                             }
2112                             before = after;
2113                         }
2114                         s += UTF8SKIP(s);
2115                     }
2116                 }
2117                 else {  /* Not utf8. */
2118                     PL_SB_enum before = getSB_VAL_CP((U8) *(s -1));
2119                     while (s < strend) {
2120                         PL_SB_enum after = getSB_VAL_CP((U8) *s);
2121                         if (to_complement ^ isSB(before,
2122                                                  after,
2123                                                  (U8*) reginfo->strbeg,
2124                                                  (U8*) s,
2125                                                  (U8*) reginfo->strend,
2126                                                  utf8_target))
2127                         {
2128                             if (reginfo->intuit || regtry(reginfo, &s)) {
2129                                 goto got_it;
2130                             }
2131                             before = after;
2132                         }
2133                         s++;
2134                     }
2135                 }
2136
2137                 /* Here are at the final position in the target string.  The SB
2138                  * value is always true here, so matches, depending on other
2139                  * constraints */
2140                 if (to_complement ^ cBOOL(reginfo->intuit
2141                                                       || regtry(reginfo, &s)))
2142                 {
2143                     goto got_it;
2144                 }
2145
2146                 break;
2147
2148             case WB_BOUND:
2149                 if (s == reginfo->strbeg) {
2150                     if (to_complement ^ cBOOL(reginfo->intuit
2151                                               || regtry(reginfo, &s)))
2152                     {
2153                         goto got_it;
2154                     }
2155                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2156                 }
2157
2158                 if (utf8_target) {
2159                     /* We are at a boundary between char_sub_0 and char_sub_1.
2160                      * We also keep track of the value for char_sub_-1 as we
2161                      * loop through the line.   Context may be needed to make a
2162                      * determination, and if so, this can save having to
2163                      * recalculate it */
2164                     PL_WB_enum previous = PL_WB_UNKNOWN;
2165                     PL_WB_enum before = getWB_VAL_UTF8(
2166                                               reghop3((U8*)s,
2167                                                       -1,
2168                                                       (U8*)(reginfo->strbeg)),
2169                                               (U8*) reginfo->strend);
2170                     while (s < strend) {
2171                         PL_WB_enum after = getWB_VAL_UTF8((U8*) s,
2172                                                         (U8*) reginfo->strend);
2173                         if (to_complement ^ isWB(previous,
2174                                                  before,
2175                                                  after,
2176                                                  (U8*) reginfo->strbeg,
2177                                                  (U8*) s,
2178                                                  (U8*) reginfo->strend,
2179                                                  utf8_target))
2180                         {
2181                             if (reginfo->intuit || regtry(reginfo, &s)) {
2182                                 goto got_it;
2183                             }
2184                             previous = before;
2185                             before = after;
2186                         }
2187                         s += UTF8SKIP(s);
2188                     }
2189                 }
2190                 else {  /* Not utf8. */
2191                     PL_WB_enum previous = PL_WB_UNKNOWN;
2192                     PL_WB_enum before = getWB_VAL_CP((U8) *(s -1));
2193                     while (s < strend) {
2194                         PL_WB_enum after = getWB_VAL_CP((U8) *s);
2195                         if (to_complement ^ isWB(previous,
2196                                                  before,
2197                                                  after,
2198                                                  (U8*) reginfo->strbeg,
2199                                                  (U8*) s,
2200                                                  (U8*) reginfo->strend,
2201                                                  utf8_target))
2202                         {
2203                             if (reginfo->intuit || regtry(reginfo, &s)) {
2204                                 goto got_it;
2205                             }
2206                             previous = before;
2207                             before = after;
2208                         }
2209                         s++;
2210                     }
2211                 }
2212
2213                 if (to_complement ^ cBOOL(reginfo->intuit
2214                                           || regtry(reginfo, &s)))
2215                 {
2216                     goto got_it;
2217                 }
2218
2219                 break;
2220         }
2221         break;
2222
2223     case LNBREAK:
2224         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2225                         is_LNBREAK_latin1_safe(s, strend)
2226         );
2227         break;
2228
2229     /* The argument to all the POSIX node types is the class number to pass to
2230      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2231
2232     case NPOSIXL:
2233         to_complement = 1;
2234         /* FALLTHROUGH */
2235
2236     case POSIXL:
2237         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2238         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2239                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2240         break;
2241
2242     case NPOSIXD:
2243         to_complement = 1;
2244         /* FALLTHROUGH */
2245
2246     case POSIXD:
2247         if (utf8_target) {
2248             goto posix_utf8;
2249         }
2250         goto posixa;
2251
2252     case NPOSIXA:
2253         if (utf8_target) {
2254             /* The complement of something that matches only ASCII matches all
2255              * non-ASCII, plus everything in ASCII that isn't in the class. */
2256             REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2257                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2258             break;
2259         }
2260
2261         to_complement = 1;
2262         /* FALLTHROUGH */
2263
2264     case POSIXA:
2265       posixa:
2266         /* Don't need to worry about utf8, as it can match only a single
2267          * byte invariant character. */
2268         REXEC_FBC_CLASS_SCAN(
2269                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2270         break;
2271
2272     case NPOSIXU:
2273         to_complement = 1;
2274         /* FALLTHROUGH */
2275
2276     case POSIXU:
2277         if (! utf8_target) {
2278             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2279                                                                     FLAGS(c))));
2280         }
2281         else {
2282
2283           posix_utf8:
2284             classnum = (_char_class_number) FLAGS(c);
2285             if (classnum < _FIRST_NON_SWASH_CC) {
2286                 while (s < strend) {
2287
2288                     /* We avoid loading in the swash as long as possible, but
2289                      * should we have to, we jump to a separate loop.  This
2290                      * extra 'if' statement is what keeps this code from being
2291                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2292                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2293                         goto found_above_latin1;
2294                     }
2295                     if ((UTF8_IS_INVARIANT(*s)
2296                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2297                                                                 classnum)))
2298                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
2299                             && to_complement ^ cBOOL(
2300                                 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2301                                                                       *(s + 1)),
2302                                               classnum))))
2303                     {
2304                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2305                             goto got_it;
2306                         else {
2307                             tmp = doevery;
2308                         }
2309                     }
2310                     else {
2311                         tmp = 1;
2312                     }
2313                     s += UTF8SKIP(s);
2314                 }
2315             }
2316             else switch (classnum) {    /* These classes are implemented as
2317                                            macros */
2318                 case _CC_ENUM_SPACE:
2319                     REXEC_FBC_UTF8_CLASS_SCAN(
2320                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
2321                     break;
2322
2323                 case _CC_ENUM_BLANK:
2324                     REXEC_FBC_UTF8_CLASS_SCAN(
2325                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
2326                     break;
2327
2328                 case _CC_ENUM_XDIGIT:
2329                     REXEC_FBC_UTF8_CLASS_SCAN(
2330                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2331                     break;
2332
2333                 case _CC_ENUM_VERTSPACE:
2334                     REXEC_FBC_UTF8_CLASS_SCAN(
2335                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
2336                     break;
2337
2338                 case _CC_ENUM_CNTRL:
2339                     REXEC_FBC_UTF8_CLASS_SCAN(
2340                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
2341                     break;
2342
2343                 default:
2344                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2345                     NOT_REACHED; /* NOTREACHED */
2346             }
2347         }
2348         break;
2349
2350       found_above_latin1:   /* Here we have to load a swash to get the result
2351                                for the current code point */
2352         if (! PL_utf8_swash_ptrs[classnum]) {
2353             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2354             PL_utf8_swash_ptrs[classnum] =
2355                     _core_swash_init("utf8",
2356                                      "",
2357                                      &PL_sv_undef, 1, 0,
2358                                      PL_XPosix_ptrs[classnum], &flags);
2359         }
2360
2361         /* This is a copy of the loop above for swash classes, though using the
2362          * FBC macro instead of being expanded out.  Since we've loaded the
2363          * swash, we don't have to check for that each time through the loop */
2364         REXEC_FBC_UTF8_CLASS_SCAN(
2365                 to_complement ^ cBOOL(_generic_utf8(
2366                                       classnum,
2367                                       s,
2368                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2369                                                   (U8 *) s, TRUE))));
2370         break;
2371
2372     case AHOCORASICKC:
2373     case AHOCORASICK:
2374         {
2375             DECL_TRIE_TYPE(c);
2376             /* what trie are we using right now */
2377             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2378             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2379             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2380
2381             const char *last_start = strend - trie->minlen;
2382 #ifdef DEBUGGING
2383             const char *real_start = s;
2384 #endif
2385             STRLEN maxlen = trie->maxlen;
2386             SV *sv_points;
2387             U8 **points; /* map of where we were in the input string
2388                             when reading a given char. For ASCII this
2389                             is unnecessary overhead as the relationship
2390                             is always 1:1, but for Unicode, especially
2391                             case folded Unicode this is not true. */
2392             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2393             U8 *bitmap=NULL;
2394
2395
2396             GET_RE_DEBUG_FLAGS_DECL;
2397
2398             /* We can't just allocate points here. We need to wrap it in
2399              * an SV so it gets freed properly if there is a croak while
2400              * running the match */
2401             ENTER;
2402             SAVETMPS;
2403             sv_points=newSV(maxlen * sizeof(U8 *));
2404             SvCUR_set(sv_points,
2405                 maxlen * sizeof(U8 *));
2406             SvPOK_on(sv_points);
2407             sv_2mortal(sv_points);
2408             points=(U8**)SvPV_nolen(sv_points );
2409             if ( trie_type != trie_utf8_fold
2410                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2411             {
2412                 if (trie->bitmap)
2413                     bitmap=(U8*)trie->bitmap;
2414                 else
2415                     bitmap=(U8*)ANYOF_BITMAP(c);
2416             }
2417             /* this is the Aho-Corasick algorithm modified a touch
2418                to include special handling for long "unknown char" sequences.
2419                The basic idea being that we use AC as long as we are dealing
2420                with a possible matching char, when we encounter an unknown char
2421                (and we have not encountered an accepting state) we scan forward
2422                until we find a legal starting char.
2423                AC matching is basically that of trie matching, except that when
2424                we encounter a failing transition, we fall back to the current
2425                states "fail state", and try the current char again, a process
2426                we repeat until we reach the root state, state 1, or a legal
2427                transition. If we fail on the root state then we can either
2428                terminate if we have reached an accepting state previously, or
2429                restart the entire process from the beginning if we have not.
2430
2431              */
2432             while (s <= last_start) {
2433                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2434                 U8 *uc = (U8*)s;
2435                 U16 charid = 0;
2436                 U32 base = 1;
2437                 U32 state = 1;
2438                 UV uvc = 0;
2439                 STRLEN len = 0;
2440                 STRLEN foldlen = 0;
2441                 U8 *uscan = (U8*)NULL;
2442                 U8 *leftmost = NULL;
2443 #ifdef DEBUGGING
2444                 U32 accepted_word= 0;
2445 #endif
2446                 U32 pointpos = 0;
2447
2448                 while ( state && uc <= (U8*)strend ) {
2449                     int failed=0;
2450                     U32 word = aho->states[ state ].wordnum;
2451
2452                     if( state==1 ) {
2453                         if ( bitmap ) {
2454                             DEBUG_TRIE_EXECUTE_r(
2455                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2456                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2457                                         (char *)uc, utf8_target );
2458                                     PerlIO_printf( Perl_debug_log,
2459                                         " Scanning for legal start char...\n");
2460                                 }
2461                             );
2462                             if (utf8_target) {
2463                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2464                                     uc += UTF8SKIP(uc);
2465                                 }
2466                             } else {
2467                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2468                                     uc++;
2469                                 }
2470                             }
2471                             s= (char *)uc;
2472                         }
2473                         if (uc >(U8*)last_start) break;
2474                     }
2475
2476                     if ( word ) {
2477                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2478                         if (!leftmost || lpos < leftmost) {
2479                             DEBUG_r(accepted_word=word);
2480                             leftmost= lpos;
2481                         }
2482                         if (base==0) break;
2483
2484                     }
2485                     points[pointpos++ % maxlen]= uc;
2486                     if (foldlen || uc < (U8*)strend) {
2487                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2488                                          widecharmap, uc,
2489                                          uscan, len, uvc, charid, foldlen,
2490                                          foldbuf, uniflags);
2491                         DEBUG_TRIE_EXECUTE_r({
2492                             dump_exec_pos( (char *)uc, c, strend,
2493                                         real_start, s, utf8_target);
2494                             PerlIO_printf(Perl_debug_log,
2495                                 " Charid:%3u CP:%4"UVxf" ",
2496                                  charid, uvc);
2497                         });
2498                     }
2499                     else {
2500                         len = 0;
2501                         charid = 0;
2502                     }
2503
2504
2505                     do {
2506 #ifdef DEBUGGING
2507                         word = aho->states[ state ].wordnum;
2508 #endif
2509                         base = aho->states[ state ].trans.base;
2510
2511                         DEBUG_TRIE_EXECUTE_r({
2512                             if (failed)
2513                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2514                                     s,   utf8_target );
2515                             PerlIO_printf( Perl_debug_log,
2516                                 "%sState: %4"UVxf", word=%"UVxf,
2517                                 failed ? " Fail transition to " : "",
2518                                 (UV)state, (UV)word);
2519                         });
2520                         if ( base ) {
2521                             U32 tmp;
2522                             I32 offset;
2523                             if (charid &&
2524                                  ( ((offset = base + charid
2525                                     - 1 - trie->uniquecharcount)) >= 0)
2526                                  && ((U32)offset < trie->lasttrans)
2527                                  && trie->trans[offset].check == state
2528                                  && (tmp=trie->trans[offset].next))
2529                             {
2530                                 DEBUG_TRIE_EXECUTE_r(
2531                                     PerlIO_printf( Perl_debug_log," - legal\n"));
2532                                 state = tmp;
2533                                 break;
2534                             }
2535                             else {
2536                                 DEBUG_TRIE_EXECUTE_r(
2537                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2538                                 failed = 1;
2539                                 state = aho->fail[state];
2540                             }
2541                         }
2542                         else {
2543                             /* we must be accepting here */
2544                             DEBUG_TRIE_EXECUTE_r(
2545                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2546                             failed = 1;
2547                             break;
2548                         }
2549                     } while(state);
2550                     uc += len;
2551                     if (failed) {
2552                         if (leftmost)
2553                             break;
2554                         if (!state) state = 1;
2555                     }
2556                 }
2557                 if ( aho->states[ state ].wordnum ) {
2558                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2559                     if (!leftmost || lpos < leftmost) {
2560                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2561                         leftmost = lpos;
2562                     }
2563                 }
2564                 if (leftmost) {
2565                     s = (char*)leftmost;
2566                     DEBUG_TRIE_EXECUTE_r({
2567                         PerlIO_printf(
2568                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2569                             (UV)accepted_word, (IV)(s - real_start)
2570                         );
2571                     });
2572                     if (reginfo->intuit || regtry(reginfo, &s)) {
2573                         FREETMPS;
2574                         LEAVE;
2575                         goto got_it;
2576                     }
2577                     s = HOPc(s,1);
2578                     DEBUG_TRIE_EXECUTE_r({
2579                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2580                     });
2581                 } else {
2582                     DEBUG_TRIE_EXECUTE_r(
2583                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2584                     break;
2585                 }
2586             }
2587             FREETMPS;
2588             LEAVE;
2589         }
2590         break;
2591     default:
2592         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2593     }
2594     return 0;
2595   got_it:
2596     return s;
2597 }
2598
2599 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2600  * flags have same meanings as with regexec_flags() */
2601
2602 static void
2603 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2604                             char *strbeg,
2605                             char *strend,
2606                             SV *sv,
2607                             U32 flags,
2608                             bool utf8_target)
2609 {
2610     struct regexp *const prog = ReANY(rx);
2611
2612     if (flags & REXEC_COPY_STR) {
2613 #ifdef PERL_ANY_COW
2614         if (SvCANCOW(sv)) {
2615             if (DEBUG_C_TEST) {
2616                 PerlIO_printf(Perl_debug_log,
2617                               "Copy on write: regexp capture, type %d\n",
2618                               (int) SvTYPE(sv));
2619             }
2620             /* Create a new COW SV to share the match string and store
2621              * in saved_copy, unless the current COW SV in saved_copy
2622              * is valid and suitable for our purpose */
2623             if ((   prog->saved_copy
2624                  && SvIsCOW(prog->saved_copy)
2625                  && SvPOKp(prog->saved_copy)
2626                  && SvIsCOW(sv)
2627                  && SvPOKp(sv)
2628                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2629             {
2630                 /* just reuse saved_copy SV */
2631                 if (RXp_MATCH_COPIED(prog)) {
2632                     Safefree(prog->subbeg);
2633                     RXp_MATCH_COPIED_off(prog);
2634                 }
2635             }
2636             else {
2637                 /* create new COW SV to share string */
2638                 RX_MATCH_COPY_FREE(rx);
2639                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2640             }
2641             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2642             assert (SvPOKp(prog->saved_copy));
2643             prog->sublen  = strend - strbeg;
2644             prog->suboffset = 0;
2645             prog->subcoffset = 0;
2646         } else
2647 #endif
2648         {
2649             SSize_t min = 0;
2650             SSize_t max = strend - strbeg;
2651             SSize_t sublen;
2652
2653             if (    (flags & REXEC_COPY_SKIP_POST)
2654                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2655                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2656             ) { /* don't copy $' part of string */
2657                 U32 n = 0;
2658                 max = -1;
2659                 /* calculate the right-most part of the string covered
2660                  * by a capture. Due to look-ahead, this may be to
2661                  * the right of $&, so we have to scan all captures */
2662                 while (n <= prog->lastparen) {
2663                     if (prog->offs[n].end > max)
2664                         max = prog->offs[n].end;
2665                     n++;
2666                 }
2667                 if (max == -1)
2668                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2669                             ? prog->offs[0].start
2670                             : 0;
2671                 assert(max >= 0 && max <= strend - strbeg);
2672             }
2673
2674             if (    (flags & REXEC_COPY_SKIP_PRE)
2675                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2676                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2677             ) { /* don't copy $` part of string */
2678                 U32 n = 0;
2679                 min = max;
2680                 /* calculate the left-most part of the string covered
2681                  * by a capture. Due to look-behind, this may be to
2682                  * the left of $&, so we have to scan all captures */
2683                 while (min && n <= prog->lastparen) {
2684                     if (   prog->offs[n].start != -1
2685                         && prog->offs[n].start < min)
2686                     {
2687                         min = prog->offs[n].start;
2688                     }
2689                     n++;
2690                 }
2691                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2692                     && min >  prog->offs[0].end
2693                 )
2694                     min = prog->offs[0].end;
2695
2696             }
2697
2698             assert(min >= 0 && min <= max && min <= strend - strbeg);
2699             sublen = max - min;
2700
2701             if (RX_MATCH_COPIED(rx)) {
2702                 if (sublen > prog->sublen)
2703                     prog->subbeg =
2704                             (char*)saferealloc(prog->subbeg, sublen+1);
2705             }
2706             else
2707                 prog->subbeg = (char*)safemalloc(sublen+1);
2708             Copy(strbeg + min, prog->subbeg, sublen, char);
2709             prog->subbeg[sublen] = '\0';
2710             prog->suboffset = min;
2711             prog->sublen = sublen;
2712             RX_MATCH_COPIED_on(rx);
2713         }
2714         prog->subcoffset = prog->suboffset;
2715         if (prog->suboffset && utf8_target) {
2716             /* Convert byte offset to chars.
2717              * XXX ideally should only compute this if @-/@+
2718              * has been seen, a la PL_sawampersand ??? */
2719
2720             /* If there's a direct correspondence between the
2721              * string which we're matching and the original SV,
2722              * then we can use the utf8 len cache associated with
2723              * the SV. In particular, it means that under //g,
2724              * sv_pos_b2u() will use the previously cached
2725              * position to speed up working out the new length of
2726              * subcoffset, rather than counting from the start of
2727              * the string each time. This stops
2728              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2729              * from going quadratic */
2730             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2731                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2732                                                 SV_GMAGIC|SV_CONST_RETURN);
2733             else
2734                 prog->subcoffset = utf8_length((U8*)strbeg,
2735                                     (U8*)(strbeg+prog->suboffset));
2736         }
2737     }
2738     else {
2739         RX_MATCH_COPY_FREE(rx);
2740         prog->subbeg = strbeg;
2741         prog->suboffset = 0;
2742         prog->subcoffset = 0;
2743         prog->sublen = strend - strbeg;
2744     }
2745 }
2746
2747
2748
2749
2750 /*
2751  - regexec_flags - match a regexp against a string
2752  */
2753 I32
2754 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2755               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2756 /* stringarg: the point in the string at which to begin matching */
2757 /* strend:    pointer to null at end of string */
2758 /* strbeg:    real beginning of string */
2759 /* minend:    end of match must be >= minend bytes after stringarg. */
2760 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2761  *            itself is accessed via the pointers above */
2762 /* data:      May be used for some additional optimizations.
2763               Currently unused. */
2764 /* flags:     For optimizations. See REXEC_* in regexp.h */
2765
2766 {
2767     struct regexp *const prog = ReANY(rx);
2768     char *s;
2769     regnode *c;
2770     char *startpos;
2771     SSize_t minlen;             /* must match at least this many chars */
2772     SSize_t dontbother = 0;     /* how many characters not to try at end */
2773     const bool utf8_target = cBOOL(DO_UTF8(sv));
2774     I32 multiline;
2775     RXi_GET_DECL(prog,progi);
2776     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2777     regmatch_info *const reginfo = &reginfo_buf;
2778     regexp_paren_pair *swap = NULL;
2779     I32 oldsave;
2780     GET_RE_DEBUG_FLAGS_DECL;
2781
2782     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2783     PERL_UNUSED_ARG(data);
2784
2785     /* Be paranoid... */
2786     if (prog == NULL || stringarg == NULL) {
2787         Perl_croak(aTHX_ "NULL regexp parameter");
2788     }
2789
2790     DEBUG_EXECUTE_r(
2791         debug_start_match(rx, utf8_target, stringarg, strend,
2792         "Matching");
2793     );
2794
2795     startpos = stringarg;
2796
2797     if (prog->intflags & PREGf_GPOS_SEEN) {
2798         MAGIC *mg;
2799
2800         /* set reginfo->ganch, the position where \G can match */
2801
2802         reginfo->ganch =
2803             (flags & REXEC_IGNOREPOS)
2804             ? stringarg /* use start pos rather than pos() */
2805             : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2806               /* Defined pos(): */
2807             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2808             : strbeg; /* pos() not defined; use start of string */
2809
2810         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2811             "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2812
2813         /* in the presence of \G, we may need to start looking earlier in
2814          * the string than the suggested start point of stringarg:
2815          * if prog->gofs is set, then that's a known, fixed minimum
2816          * offset, such as
2817          * /..\G/:   gofs = 2
2818          * /ab|c\G/: gofs = 1
2819          * or if the minimum offset isn't known, then we have to go back
2820          * to the start of the string, e.g. /w+\G/
2821          */
2822
2823         if (prog->intflags & PREGf_ANCH_GPOS) {
2824             startpos  = reginfo->ganch - prog->gofs;
2825             if (startpos <
2826                 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2827             {
2828                 DEBUG_r(PerlIO_printf(Perl_debug_log,
2829                         "fail: ganch-gofs before earliest possible start\n"));
2830                 return 0;
2831             }
2832         }
2833         else if (prog->gofs) {
2834             if (startpos - prog->gofs < strbeg)
2835                 startpos = strbeg;
2836             else
2837                 startpos -= prog->gofs;
2838         }
2839         else if (prog->intflags & PREGf_GPOS_FLOAT)
2840             startpos = strbeg;
2841     }
2842
2843     minlen = prog->minlen;
2844     if ((startpos + minlen) > strend || startpos < strbeg) {
2845         DEBUG_r(PerlIO_printf(Perl_debug_log,
2846                     "Regex match can't succeed, so not even tried\n"));
2847         return 0;
2848     }
2849
2850     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2851      * which will call destuctors to reset PL_regmatch_state, free higher
2852      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2853      * regmatch_info_aux_eval */
2854
2855     oldsave = PL_savestack_ix;
2856
2857     s = startpos;
2858
2859     if ((prog->extflags & RXf_USE_INTUIT)
2860         && !(flags & REXEC_CHECKED))
2861     {
2862         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2863                                     flags, NULL);
2864         if (!s)
2865             return 0;
2866
2867         if (prog->extflags & RXf_CHECK_ALL) {
2868             /* we can match based purely on the result of INTUIT.
2869              * Set up captures etc just for $& and $-[0]
2870              * (an intuit-only match wont have $1,$2,..) */
2871             assert(!prog->nparens);
2872
2873             /* s/// doesn't like it if $& is earlier than where we asked it to
2874              * start searching (which can happen on something like /.\G/) */
2875             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2876                     && (s < stringarg))
2877             {
2878                 /* this should only be possible under \G */
2879                 assert(prog->intflags & PREGf_GPOS_SEEN);
2880                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2881                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2882                 goto phooey;
2883             }
2884
2885             /* match via INTUIT shouldn't have any captures.
2886              * Let @-, @+, $^N know */
2887             prog->lastparen = prog->lastcloseparen = 0;
2888             RX_MATCH_UTF8_set(rx, utf8_target);
2889             prog->offs[0].start = s - strbeg;
2890             prog->offs[0].end = utf8_target
2891                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2892                 : s - strbeg + prog->minlenret;
2893             if ( !(flags & REXEC_NOT_FIRST) )
2894                 S_reg_set_capture_string(aTHX_ rx,
2895                                         strbeg, strend,
2896                                         sv, flags, utf8_target);
2897
2898             return 1;
2899         }
2900     }
2901
2902     multiline = prog->extflags & RXf_PMf_MULTILINE;
2903
2904     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2905         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2906                               "String too short [regexec_flags]...\n"));
2907         goto phooey;
2908     }
2909
2910     /* Check validity of program. */
2911     if (UCHARAT(progi->program) != REG_MAGIC) {
2912         Perl_croak(aTHX_ "corrupted regexp program");
2913     }
2914
2915     RX_MATCH_TAINTED_off(rx);
2916     RX_MATCH_UTF8_set(rx, utf8_target);
2917
2918     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2919     reginfo->intuit = 0;
2920     reginfo->is_utf8_target = cBOOL(utf8_target);
2921     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2922     reginfo->warned = FALSE;
2923     reginfo->strbeg  = strbeg;
2924     reginfo->sv = sv;
2925     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2926     reginfo->strend = strend;
2927     /* see how far we have to get to not match where we matched before */
2928     reginfo->till = stringarg + minend;
2929
2930     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2931         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2932            S_cleanup_regmatch_info_aux has executed (registered by
2933            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2934            magic belonging to this SV.
2935            Not newSVsv, either, as it does not COW.
2936         */
2937         reginfo->sv = newSV(0);
2938         SvSetSV_nosteal(reginfo->sv, sv);
2939         SAVEFREESV(reginfo->sv);
2940     }
2941
2942     /* reserve next 2 or 3 slots in PL_regmatch_state:
2943      * slot N+0: may currently be in use: skip it
2944      * slot N+1: use for regmatch_info_aux struct
2945      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2946      * slot N+3: ready for use by regmatch()
2947      */
2948
2949     {
2950         regmatch_state *old_regmatch_state;
2951         regmatch_slab  *old_regmatch_slab;
2952         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2953
2954         /* on first ever match, allocate first slab */
2955         if (!PL_regmatch_slab) {
2956             Newx(PL_regmatch_slab, 1, regmatch_slab);
2957             PL_regmatch_slab->prev = NULL;
2958             PL_regmatch_slab->next = NULL;
2959             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2960         }
2961
2962         old_regmatch_state = PL_regmatch_state;
2963         old_regmatch_slab  = PL_regmatch_slab;
2964
2965         for (i=0; i <= max; i++) {
2966             if (i == 1)
2967                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2968             else if (i ==2)
2969                 reginfo->info_aux_eval =
2970                 reginfo->info_aux->info_aux_eval =
2971                             &(PL_regmatch_state->u.info_aux_eval);
2972
2973             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2974                 PL_regmatch_state = S_push_slab(aTHX);
2975         }
2976
2977         /* note initial PL_regmatch_state position; at end of match we'll
2978          * pop back to there and free any higher slabs */
2979
2980         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2981         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2982         reginfo->info_aux->poscache = NULL;
2983
2984         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2985
2986         if ((prog->extflags & RXf_EVAL_SEEN))
2987             S_setup_eval_state(aTHX_ reginfo);
2988         else
2989             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2990     }
2991
2992     /* If there is a "must appear" string, look for it. */
2993
2994     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2995         /* We have to be careful. If the previous successful match
2996            was from this regex we don't want a subsequent partially
2997            successful match to clobber the old results.
2998            So when we detect this possibility we add a swap buffer
2999            to the re, and switch the buffer each match. If we fail,
3000            we switch it back; otherwise we leave it swapped.
3001         */
3002         swap = prog->offs;
3003         /* do we need a save destructor here for eval dies? */
3004         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3005         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3006             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3007             PTR2UV(prog),
3008             PTR2UV(swap),
3009             PTR2UV(prog->offs)
3010         ));
3011     }
3012
3013     /* Simplest case: anchored match need be tried only once, or with
3014      * MBOL, only at the beginning of each line.
3015      *
3016      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3017      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3018      * match at the start of the string then it won't match anywhere else
3019      * either; while with /.*.../, if it doesn't match at the beginning,
3020      * the earliest it could match is at the start of the next line */
3021
3022     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3023         char *end;
3024
3025         if (regtry(reginfo, &s))
3026             goto got_it;
3027
3028         if (!(prog->intflags & PREGf_ANCH_MBOL))
3029             goto phooey;
3030
3031         /* didn't match at start, try at other newline positions */
3032
3033         if (minlen)
3034             dontbother = minlen - 1;
3035         end = HOP3c(strend, -dontbother, strbeg) - 1;
3036
3037         /* skip to next newline */
3038
3039         while (s <= end) { /* note it could be possible to match at the end of the string */
3040             /* NB: newlines are the same in unicode as they are in latin */
3041             if (*s++ != '\n')
3042                 continue;
3043             if (prog->check_substr || prog->check_utf8) {
3044             /* note that with PREGf_IMPLICIT, intuit can only fail
3045              * or return the start position, so it's of limited utility.
3046              * Nevertheless, I made the decision that the potential for
3047              * quick fail was still worth it - DAPM */
3048                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3049                 if (!s)
3050                     goto phooey;
3051             }
3052             if (regtry(reginfo, &s))
3053                 goto got_it;
3054         }
3055         goto phooey;
3056     } /* end anchored search */
3057
3058     if (prog->intflags & PREGf_ANCH_GPOS)
3059     {
3060         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3061         assert(prog->intflags & PREGf_GPOS_SEEN);
3062         /* For anchored \G, the only position it can match from is
3063          * (ganch-gofs); we already set startpos to this above; if intuit
3064          * moved us on from there, we can't possibly succeed */
3065         assert(startpos == reginfo->ganch - prog->gofs);
3066         if (s == startpos && regtry(reginfo, &s))
3067             goto got_it;
3068         goto phooey;
3069     }
3070
3071     /* Messy cases:  unanchored match. */
3072     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3073         /* we have /x+whatever/ */
3074         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3075         char ch;
3076 #ifdef DEBUGGING
3077         int did_match = 0;
3078 #endif
3079         if (utf8_target) {
3080             if (! prog->anchored_utf8) {
3081                 to_utf8_substr(prog);
3082             }
3083             ch = SvPVX_const(prog->anchored_utf8)[0];
3084             REXEC_FBC_SCAN(
3085                 if (*s == ch) {
3086                     DEBUG_EXECUTE_r( did_match = 1 );
3087                     if (regtry(reginfo, &s)) goto got_it;
3088                     s += UTF8SKIP(s);
3089                     while (s < strend && *s == ch)
3090                         s += UTF8SKIP(s);
3091                 }
3092             );
3093
3094         }
3095         else {
3096             if (! prog->anchored_substr) {
3097                 if (! to_byte_substr(prog)) {
3098                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3099                 }
3100             }
3101             ch = SvPVX_const(prog->anchored_substr)[0];
3102             REXEC_FBC_SCAN(
3103                 if (*s == ch) {
3104                     DEBUG_EXECUTE_r( did_match = 1 );
3105                     if (regtry(reginfo, &s)) goto got_it;
3106                     s++;
3107                     while (s < strend && *s == ch)
3108                         s++;
3109                 }
3110             );
3111         }
3112         DEBUG_EXECUTE_r(if (!did_match)
3113                 PerlIO_printf(Perl_debug_log,
3114                                   "Did not find anchored character...\n")
3115                );
3116     }
3117     else if (prog->anchored_substr != NULL
3118               || prog->anchored_utf8 != NULL
3119               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3120                   && prog->float_max_offset < strend - s)) {
3121         SV *must;
3122         SSize_t back_max;
3123         SSize_t back_min;
3124         char *last;
3125         char *last1;            /* Last position checked before */
3126 #ifdef DEBUGGING
3127         int did_match = 0;
3128 #endif
3129         if (prog->anchored_substr || prog->anchored_utf8) {
3130             if (utf8_target) {
3131                 if (! prog->anchored_utf8) {
3132                     to_utf8_substr(prog);
3133                 }
3134                 must = prog->anchored_utf8;
3135             }
3136             else {
3137                 if (! prog->anchored_substr) {
3138                     if (! to_byte_substr(prog)) {
3139                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3140                     }
3141                 }
3142                 must = prog->anchored_substr;
3143             }
3144             back_max = back_min = prog->anchored_offset;
3145         } else {
3146             if (utf8_target) {
3147                 if (! prog->float_utf8) {
3148                     to_utf8_substr(prog);
3149                 }
3150                 must = prog->float_utf8;
3151             }
3152             else {
3153                 if (! prog->float_substr) {
3154                     if (! to_byte_substr(prog)) {
3155                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3156                     }
3157                 }
3158                 must = prog->float_substr;
3159             }
3160             back_max = prog->float_max_offset;
3161             back_min = prog->float_min_offset;
3162         }
3163
3164         if (back_min<0) {
3165             last = strend;
3166         } else {
3167             last = HOP3c(strend,        /* Cannot start after this */
3168                   -(SSize_t)(CHR_SVLEN(must)
3169                          - (SvTAIL(must) != 0) + back_min), strbeg);
3170         }
3171         if (s > reginfo->strbeg)
3172             last1 = HOPc(s, -1);
3173         else
3174             last1 = s - 1;      /* bogus */
3175
3176         /* XXXX check_substr already used to find "s", can optimize if
3177            check_substr==must. */
3178         dontbother = 0;
3179         strend = HOPc(strend, -dontbother);
3180         while ( (s <= last) &&
3181                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3182                                   (unsigned char*)strend, must,
3183                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3184             DEBUG_EXECUTE_r( did_match = 1 );
3185             if (HOPc(s, -back_max) > last1) {
3186                 last1 = HOPc(s, -back_min);
3187                 s = HOPc(s, -back_max);
3188             }
3189             else {
3190                 char * const t = (last1 >= reginfo->strbeg)
3191                                     ? HOPc(last1, 1) : last1 + 1;
3192
3193                 last1 = HOPc(s, -back_min);
3194                 s = t;
3195             }
3196             if (utf8_target) {
3197                 while (s <= last1) {
3198                     if (regtry(reginfo, &s))
3199                         goto got_it;
3200                     if (s >= last1) {
3201                         s++; /* to break out of outer loop */
3202                         break;
3203                     }
3204                     s += UTF8SKIP(s);
3205                 }
3206             }
3207             else {
3208                 while (s <= last1) {
3209                     if (regtry(reginfo, &s))
3210                         goto got_it;
3211                     s++;
3212                 }
3213             }
3214         }
3215         DEBUG_EXECUTE_r(if (!did_match) {
3216             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3217                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3218             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
3219                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3220                                ? "anchored" : "floating"),
3221                 quoted, RE_SV_TAIL(must));
3222         });
3223         goto phooey;
3224     }
3225     else if ( (c = progi->regstclass) ) {
3226         if (minlen) {
3227             const OPCODE op = OP(progi->regstclass);
3228             /* don't bother with what can't match */
3229             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
3230                 strend = HOPc(strend, -(minlen - 1));
3231         }
3232         DEBUG_EXECUTE_r({
3233             SV * const prop = sv_newmortal();
3234             regprop(prog, prop, c, reginfo, NULL);
3235             {
3236                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3237                     s,strend-s,60);
3238                 PerlIO_printf(Perl_debug_log,
3239                     "Matching stclass %.*s against %s (%d bytes)\n",
3240                     (int)SvCUR(prop), SvPVX_const(prop),
3241                      quoted, (int)(strend - s));
3242             }
3243         });
3244         if (find_byclass(prog, c, s, strend, reginfo))
3245             goto got_it;
3246         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
3247     }
3248     else {
3249         dontbother = 0;
3250         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3251             /* Trim the end. */
3252             char *last= NULL;
3253             SV* float_real;
3254             STRLEN len;
3255             const char *little;
3256
3257             if (utf8_target) {
3258                 if (! prog->float_utf8) {
3259                     to_utf8_substr(prog);
3260                 }
3261                 float_real = prog->float_utf8;
3262             }
3263             else {
3264                 if (! prog->float_substr) {
3265                     if (! to_byte_substr(prog)) {
3266                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3267                     }
3268                 }
3269                 float_real = prog->float_substr;
3270             }
3271
3272             little = SvPV_const(float_real, len);
3273             if (SvTAIL(float_real)) {
3274                     /* This means that float_real contains an artificial \n on
3275                      * the end due to the presence of something like this:
3276                      * /foo$/ where we can match both "foo" and "foo\n" at the
3277                      * end of the string.  So we have to compare the end of the
3278                      * string first against the float_real without the \n and
3279                      * then against the full float_real with the string.  We
3280                      * have to watch out for cases where the string might be
3281                      * smaller than the float_real or the float_real without
3282                      * the \n. */
3283                     char *checkpos= strend - len;
3284                     DEBUG_OPTIMISE_r(
3285                         PerlIO_printf(Perl_debug_log,
3286                             "%sChecking for float_real.%s\n",
3287                             PL_colors[4], PL_colors[5]));
3288                     if (checkpos + 1 < strbeg) {
3289                         /* can't match, even if we remove the trailing \n
3290                          * string is too short to match */
3291                         DEBUG_EXECUTE_r(
3292                             PerlIO_printf(Perl_debug_log,
3293                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3294                                 PL_colors[4], PL_colors[5]));
3295                         goto phooey;
3296                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3297                         /* can match, the end of the string matches without the
3298                          * "\n" */
3299                         last = checkpos + 1;
3300                     } else if (checkpos < strbeg) {
3301                         /* cant match, string is too short when the "\n" is
3302                          * included */
3303                         DEBUG_EXECUTE_r(
3304                             PerlIO_printf(Perl_debug_log,
3305                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3306                                 PL_colors[4], PL_colors[5]));
3307                         goto phooey;
3308                     } else if (!multiline) {
3309                         /* non multiline match, so compare with the "\n" at the
3310                          * end of the string */
3311                         if (memEQ(checkpos, little, len)) {
3312                             last= checkpos;
3313                         } else {
3314                             DEBUG_EXECUTE_r(
3315                                 PerlIO_printf(Perl_debug_log,
3316                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3317                                     PL_colors[4], PL_colors[5]));
3318                             goto phooey;
3319                         }
3320                     } else {
3321                         /* multiline match, so we have to search for a place
3322                          * where the full string is located */
3323                         goto find_last;
3324                     }
3325             } else {
3326                   find_last:
3327                     if (len)
3328                         last = rninstr(s, strend, little, little + len);
3329                     else
3330                         last = strend;  /* matching "$" */
3331             }
3332             if (!last) {
3333                 /* at one point this block contained a comment which was
3334                  * probably incorrect, which said that this was a "should not
3335                  * happen" case.  Even if it was true when it was written I am
3336                  * pretty sure it is not anymore, so I have removed the comment
3337                  * and replaced it with this one. Yves */
3338                 DEBUG_EXECUTE_r(
3339                     PerlIO_printf(Perl_debug_log,
3340                         "%sString does not contain required substring, cannot match.%s\n",
3341                         PL_colors[4], PL_colors[5]
3342                     ));
3343                 goto phooey;
3344             }
3345             dontbother = strend - last + prog->float_min_offset;
3346         }
3347         if (minlen && (dontbother < minlen))
3348             dontbother = minlen - 1;
3349         strend -= dontbother;              /* this one's always in bytes! */
3350         /* We don't know much -- general case. */
3351         if (utf8_target) {
3352             for (;;) {
3353                 if (regtry(reginfo, &s))
3354                     goto got_it;
3355                 if (s >= strend)
3356                     break;
3357                 s += UTF8SKIP(s);
3358             };
3359         }
3360         else {
3361             do {
3362                 if (regtry(reginfo, &s))
3363                     goto got_it;
3364             } while (s++ < strend);
3365         }
3366     }
3367
3368     /* Failure. */
3369     goto phooey;
3370
3371   got_it:
3372     /* s/// doesn't like it if $& is earlier than where we asked it to
3373      * start searching (which can happen on something like /.\G/) */
3374     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3375             && (prog->offs[0].start < stringarg - strbeg))
3376     {
3377         /* this should only be possible under \G */
3378         assert(prog->intflags & PREGf_GPOS_SEEN);
3379         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3380             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3381         goto phooey;
3382     }
3383
3384     DEBUG_BUFFERS_r(
3385         if (swap)
3386             PerlIO_printf(Perl_debug_log,
3387                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3388                 PTR2UV(prog),
3389                 PTR2UV(swap)
3390             );
3391     );
3392     Safefree(swap);
3393
3394     /* clean up; this will trigger destructors that will free all slabs
3395      * above the current one, and cleanup the regmatch_info_aux
3396      * and regmatch_info_aux_eval sructs */
3397
3398     LEAVE_SCOPE(oldsave);
3399
3400     if (RXp_PAREN_NAMES(prog))
3401         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3402
3403     /* make sure $`, $&, $', and $digit will work later */
3404     if ( !(flags & REXEC_NOT_FIRST) )
3405         S_reg_set_capture_string(aTHX_ rx,
3406                                     strbeg, reginfo->strend,
3407                                     sv, flags, utf8_target);
3408
3409     return 1;
3410
3411   phooey:
3412     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3413                           PL_colors[4], PL_colors[5]));
3414
3415     /* clean up; this will trigger destructors that will free all slabs
3416      * above the current one, and cleanup the regmatch_info_aux
3417      * and regmatch_info_aux_eval sructs */
3418
3419     LEAVE_SCOPE(oldsave);
3420
3421     if (swap) {
3422         /* we failed :-( roll it back */
3423         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3424             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3425             PTR2UV(prog),
3426             PTR2UV(prog->offs),
3427             PTR2UV(swap)
3428         ));
3429         Safefree(prog->offs);
3430         prog->offs = swap;
3431     }
3432     return 0;
3433 }
3434
3435
3436 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3437  * Do inc before dec, in case old and new rex are the same */
3438 #define SET_reg_curpm(Re2)                          \
3439     if (reginfo->info_aux_eval) {                   \
3440         (void)ReREFCNT_inc(Re2);                    \
3441         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3442         PM_SETRE((PL_reg_curpm), (Re2));            \
3443     }
3444
3445
3446 /*
3447  - regtry - try match at specific point
3448  */
3449 STATIC I32                      /* 0 failure, 1 success */
3450 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3451 {
3452     CHECKPOINT lastcp;
3453     REGEXP *const rx = reginfo->prog;
3454     regexp *const prog = ReANY(rx);
3455     SSize_t result;
3456     RXi_GET_DECL(prog,progi);
3457     GET_RE_DEBUG_FLAGS_DECL;
3458
3459     PERL_ARGS_ASSERT_REGTRY;
3460
3461     reginfo->cutpoint=NULL;
3462
3463     prog->offs[0].start = *startposp - reginfo->strbeg;
3464     prog->lastparen = 0;
3465     prog->lastcloseparen = 0;
3466
3467     /* XXXX What this code is doing here?!!!  There should be no need
3468        to do this again and again, prog->lastparen should take care of
3469        this!  --ilya*/
3470
3471     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3472      * Actually, the code in regcppop() (which Ilya may be meaning by
3473      * prog->lastparen), is not needed at all by the test suite
3474      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3475      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3476      * Meanwhile, this code *is* needed for the
3477      * above-mentioned test suite tests to succeed.  The common theme
3478      * on those tests seems to be returning null fields from matches.
3479      * --jhi updated by dapm */
3480 #if 1
3481     if (prog->nparens) {
3482         regexp_paren_pair *pp = prog->offs;
3483         I32 i;
3484         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3485             ++pp;
3486             pp->start = -1;
3487             pp->end = -1;
3488         }
3489     }
3490 #endif
3491     REGCP_SET(lastcp);
3492     result = regmatch(reginfo, *startposp, progi->program + 1);
3493     if (result != -1) {
3494         prog->offs[0].end = result;
3495         return 1;
3496     }
3497     if (reginfo->cutpoint)
3498         *startposp= reginfo->cutpoint;
3499     REGCP_UNWIND(lastcp);
3500     return 0;
3501 }
3502
3503
3504 #define sayYES goto yes
3505 #define sayNO goto no
3506 #define sayNO_SILENT goto no_silent
3507
3508 /* we dont use STMT_START/END here because it leads to
3509    "unreachable code" warnings, which are bogus, but distracting. */
3510 #define CACHEsayNO \
3511     if (ST.cache_mask) \
3512        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3513     sayNO
3514
3515 /* this is used to determine how far from the left messages like
3516    'failed...' are printed. It should be set such that messages
3517    are inline with the regop output that created them.
3518 */
3519 #define REPORT_CODE_OFF 32
3520
3521
3522 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3523 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3524 #define CHRTEST_NOT_A_CP_1 -999
3525 #define CHRTEST_NOT_A_CP_2 -998
3526
3527 /* grab a new slab and return the first slot in it */
3528
3529 STATIC regmatch_state *
3530 S_push_slab(pTHX)
3531 {
3532 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3533     dMY_CXT;
3534 #endif
3535     regmatch_slab *s = PL_regmatch_slab->next;
3536     if (!s) {
3537         Newx(s, 1, regmatch_slab);
3538         s->prev = PL_regmatch_slab;
3539         s->next = NULL;
3540         PL_regmatch_slab->next = s;
3541     }
3542     PL_regmatch_slab = s;
3543     return SLAB_FIRST(s);
3544 }
3545
3546
3547 /* push a new state then goto it */
3548
3549 #define PUSH_STATE_GOTO(state, node, input) \
3550     pushinput = input; \
3551     scan = node; \
3552     st->resume_state = state; \
3553     goto push_state;
3554
3555 /* push a new state with success backtracking, then goto it */
3556
3557 #define PUSH_YES_STATE_GOTO(state, node, input) \
3558     pushinput = input; \
3559     scan = node; \
3560     st->resume_state = state; \
3561     goto push_yes_state;
3562
3563
3564
3565
3566 /*
3567
3568 regmatch() - main matching routine
3569
3570 This is basically one big switch statement in a loop. We execute an op,
3571 set 'next' to point the next op, and continue. If we come to a point which
3572 we may need to backtrack to on failure such as (A|B|C), we push a
3573 backtrack state onto the backtrack stack. On failure, we pop the top
3574 state, and re-enter the loop at the state indicated. If there are no more
3575 states to pop, we return failure.
3576
3577 Sometimes we also need to backtrack on success; for example /A+/, where
3578 after successfully matching one A, we need to go back and try to
3579 match another one; similarly for lookahead assertions: if the assertion
3580 completes successfully, we backtrack to the state just before the assertion
3581 and then carry on.  In these cases, the pushed state is marked as
3582 'backtrack on success too'. This marking is in fact done by a chain of
3583 pointers, each pointing to the previous 'yes' state. On success, we pop to
3584 the nearest yes state, discarding any intermediate failure-only states.
3585 Sometimes a yes state is pushed just to force some cleanup code to be
3586 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3587 it to free the inner regex.
3588
3589 Note that failure backtracking rewinds the cursor position, while
3590 success backtracking leaves it alone.
3591
3592 A pattern is complete when the END op is executed, while a subpattern
3593 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3594 ops trigger the "pop to last yes state if any, otherwise return true"
3595 behaviour.
3596
3597 A common convention in this function is to use A and B to refer to the two
3598 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3599 the subpattern to be matched possibly multiple times, while B is the entire
3600 rest of the pattern. Variable and state names reflect this convention.
3601
3602 The states in the main switch are the union of ops and failure/success of
3603 substates associated with with that op.  For example, IFMATCH is the op
3604 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3605 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3606 successfully matched A and IFMATCH_A_fail is a state saying that we have
3607 just failed to match A. Resume states always come in pairs. The backtrack
3608 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3609 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3610 on success or failure.
3611
3612 The struct that holds a backtracking state is actually a big union, with
3613 one variant for each major type of op. The variable st points to the
3614 top-most backtrack struct. To make the code clearer, within each
3615 block of code we #define ST to alias the relevant union.
3616
3617 Here's a concrete example of a (vastly oversimplified) IFMATCH
3618 implementation:
3619
3620     switch (state) {
3621     ....
3622
3623 #define ST st->u.ifmatch
3624
3625     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3626         ST.foo = ...; // some state we wish to save
3627         ...
3628         // push a yes backtrack state with a resume value of
3629         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3630         // first node of A:
3631         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3632         // NOTREACHED
3633
3634     case IFMATCH_A: // we have successfully executed A; now continue with B
3635         next = B;
3636         bar = ST.foo; // do something with the preserved value
3637         break;
3638
3639     case IFMATCH_A_fail: // A failed, so the assertion failed
3640         ...;   // do some housekeeping, then ...
3641         sayNO; // propagate the failure
3642
3643 #undef ST
3644
3645     ...
3646     }
3647
3648 For any old-timers reading this who are familiar with the old recursive
3649 approach, the code above is equivalent to:
3650
3651     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3652     {
3653         int foo = ...
3654         ...
3655         if (regmatch(A)) {
3656             next = B;
3657             bar = foo;
3658             break;
3659         }
3660         ...;   // do some housekeeping, then ...
3661         sayNO; // propagate the failure
3662     }
3663
3664 The topmost backtrack state, pointed to by st, is usually free. If you
3665 want to claim it, populate any ST.foo fields in it with values you wish to
3666 save, then do one of
3667
3668         PUSH_STATE_GOTO(resume_state, node, newinput);
3669         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3670
3671 which sets that backtrack state's resume value to 'resume_state', pushes a
3672 new free entry to the top of the backtrack stack, then goes to 'node'.
3673 On backtracking, the free slot is popped, and the saved state becomes the
3674 new free state. An ST.foo field in this new top state can be temporarily
3675 accessed to retrieve values, but once the main loop is re-entered, it
3676 becomes available for reuse.
3677
3678 Note that the depth of the backtrack stack constantly increases during the
3679 left-to-right execution of the pattern, rather than going up and down with
3680 the pattern nesting. For example the stack is at its maximum at Z at the
3681 end of the pattern, rather than at X in the following:
3682
3683     /(((X)+)+)+....(Y)+....Z/
3684
3685 The only exceptions to this are lookahead/behind assertions and the cut,
3686 (?>A), which pop all the backtrack states associated with A before
3687 continuing.
3688
3689 Backtrack state structs are allocated in slabs of about 4K in size.
3690 PL_regmatch_state and st always point to the currently active state,
3691 and PL_regmatch_slab points to the slab currently containing
3692 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3693 allocated, and is never freed until interpreter destruction. When the slab
3694 is full, a new one is allocated and chained to the end. At exit from
3695 regmatch(), slabs allocated since entry are freed.
3696
3697 */
3698
3699
3700 #define DEBUG_STATE_pp(pp)                                  \
3701     DEBUG_STATE_r({                                         \
3702         DUMP_EXEC_POS(locinput, scan, utf8_target);         \
3703         PerlIO_printf(Perl_debug_log,                       \
3704             "    %*s"pp" %s%s%s%s%s\n",                     \
3705             depth*2, "",                                    \
3706             PL_reg_name[st->resume_state],                  \
3707             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3708             ((st==yes_state) ? "Y" : ""),                   \
3709             ((st==mark_state) ? "M" : ""),                  \
3710             ((st==yes_state||st==mark_state) ? "]" : "")    \
3711         );                                                  \
3712     });
3713
3714
3715 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3716
3717 #ifdef DEBUGGING
3718
3719 STATIC void
3720 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3721     const char *start, const char *end, const char *blurb)
3722 {
3723     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3724
3725     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3726
3727     if (!PL_colorset)
3728             reginitcolors();
3729     {
3730         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3731             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3732
3733         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3734             start, end - start, 60);
3735
3736         PerlIO_printf(Perl_debug_log,
3737             "%s%s REx%s %s against %s\n",
3738                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3739
3740         if (utf8_target||utf8_pat)
3741             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3742                 utf8_pat ? "pattern" : "",
3743                 utf8_pat && utf8_target ? " and " : "",
3744                 utf8_target ? "string" : ""
3745             );
3746     }
3747 }
3748
3749 STATIC void
3750 S_dump_exec_pos(pTHX_ const char *locinput,
3751                       const regnode *scan,
3752                       const char *loc_regeol,
3753                       const char *loc_bostr,
3754                       const char *loc_reg_starttry,
3755                       const bool utf8_target)
3756 {
3757     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3758     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3759     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3760     /* The part of the string before starttry has one color
3761        (pref0_len chars), between starttry and current
3762        position another one (pref_len - pref0_len chars),
3763        after the current position the third one.
3764        We assume that pref0_len <= pref_len, otherwise we
3765        decrease pref0_len.  */
3766     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3767         ? (5 + taill) - l : locinput - loc_bostr;
3768     int pref0_len;
3769
3770     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3771
3772     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3773         pref_len++;
3774     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3775     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3776         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3777               ? (5 + taill) - pref_len : loc_regeol - locinput);
3778     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3779         l--;
3780     if (pref0_len < 0)
3781         pref0_len = 0;
3782     if (pref0_len > pref_len)
3783         pref0_len = pref_len;
3784     {
3785         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3786
3787         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3788             (locinput - pref_len),pref0_len, 60, 4, 5);
3789
3790         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3791                     (locinput - pref_len + pref0_len),
3792                     pref_len - pref0_len, 60, 2, 3);
3793
3794         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3795                     locinput, loc_regeol - locinput, 10, 0, 1);
3796
3797         const STRLEN tlen=len0+len1+len2;
3798         PerlIO_printf(Perl_debug_log,
3799                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3800                     (IV)(locinput - loc_bostr),
3801                     len0, s0,
3802                     len1, s1,
3803                     (docolor ? "" : "> <"),
3804                     len2, s2,
3805                     (int)(tlen > 19 ? 0 :  19 - tlen),
3806                     "");
3807     }
3808 }
3809
3810 #endif
3811
3812 /* reg_check_named_buff_matched()
3813  * Checks to see if a named buffer has matched. The data array of
3814  * buffer numbers corresponding to the buffer is expected to reside
3815  * in the regexp->data->data array in the slot stored in the ARG() of
3816  * node involved. Note that this routine doesn't actually care about the
3817  * name, that information is not preserved from compilation to execution.
3818  * Returns the index of the leftmost defined buffer with the given name
3819  * or 0 if non of the buffers matched.
3820  */
3821 STATIC I32
3822 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3823 {
3824     I32 n;
3825     RXi_GET_DECL(rex,rexi);
3826     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3827     I32 *nums=(I32*)SvPVX(sv_dat);
3828
3829     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3830
3831     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3832         if ((I32)rex->lastparen >= nums[n] &&
3833             rex->offs[nums[n]].end != -1)
3834         {
3835             return nums[n];
3836         }
3837     }
3838     return 0;
3839 }
3840
3841
3842 static bool
3843 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3844         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3845 {
3846     /* This function determines if there are one or two characters that match
3847      * the first character of the passed-in EXACTish node <text_node>, and if
3848      * so, returns them in the passed-in pointers.
3849      *
3850      * If it determines that no possible character in the target string can
3851      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3852      * the first character in <text_node> requires UTF-8 to represent, and the
3853      * target string isn't in UTF-8.)
3854      *
3855      * If there are more than two characters that could match the beginning of
3856      * <text_node>, or if more context is required to determine a match or not,
3857      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3858      *
3859      * The motiviation behind this function is to allow the caller to set up
3860      * tight loops for matching.  If <text_node> is of type EXACT, there is
3861      * only one possible character that can match its first character, and so
3862      * the situation is quite simple.  But things get much more complicated if
3863      * folding is involved.  It may be that the first character of an EXACTFish
3864      * node doesn't participate in any possible fold, e.g., punctuation, so it
3865      * can be matched only by itself.  The vast majority of characters that are
3866      * in folds match just two things, their lower and upper-case equivalents.
3867      * But not all are like that; some have multiple possible matches, or match
3868      * sequences of more than one character.  This function sorts all that out.
3869      *
3870      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3871      * loop of trying to match A*, we know we can't exit where the thing
3872      * following it isn't a B.  And something can't be a B unless it is the
3873      * beginning of B.  By putting a quick test for that beginning in a tight
3874      * loop, we can rule out things that can't possibly be B without having to
3875      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3876      * character, we can make a tight loop matching A*, using the outputs of
3877      * this function.
3878      *
3879      * If the target string to match isn't in UTF-8, and there aren't
3880      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3881      * the one or two possible octets (which are characters in this situation)
3882      * that can match.  In all cases, if there is only one character that can
3883      * match, *<c1p> and *<c2p> will be identical.
3884      *
3885      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3886      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3887      * can match the beginning of <text_node>.  They should be declared with at
3888      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3889      * undefined what these contain.)  If one or both of the buffers are
3890      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3891      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3892      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3893      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3894      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3895
3896     const bool utf8_target = reginfo->is_utf8_target;
3897
3898     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3899     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
3900     bool use_chrtest_void = FALSE;
3901     const bool is_utf8_pat = reginfo->is_utf8_pat;
3902
3903     /* Used when we have both utf8 input and utf8 output, to avoid converting
3904      * to/from code points */
3905     bool utf8_has_been_setup = FALSE;
3906
3907     dVAR;
3908
3909     U8 *pat = (U8*)STRING(text_node);
3910     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3911
3912     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
3913
3914         /* In an exact node, only one thing can be matched, that first
3915          * character.  If both the pat and the target are UTF-8, we can just
3916          * copy the input to the output, avoiding finding the code point of
3917          * that character */
3918         if (!is_utf8_pat) {
3919             c2 = c1 = *pat;
3920         }
3921         else if (utf8_target) {
3922             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3923             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3924             utf8_has_been_setup = TRUE;
3925         }
3926         else {
3927             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3928         }
3929     }
3930     else { /* an EXACTFish node */
3931         U8 *pat_end = pat + STR_LEN(text_node);
3932
3933         /* An EXACTFL node has at least some characters unfolded, because what
3934          * they match is not known until now.  So, now is the time to fold
3935          * the first few of them, as many as are needed to determine 'c1' and
3936          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3937          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3938          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3939          * need to fold as many characters as a single character can fold to,
3940          * so that later we can check if the first ones are such a multi-char
3941          * fold.  But, in such a pattern only locale-problematic characters
3942          * aren't folded, so we can skip this completely if the first character
3943          * in the node isn't one of the tricky ones */
3944         if (OP(text_node) == EXACTFL) {
3945
3946             if (! is_utf8_pat) {
3947                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3948                 {
3949                     folded[0] = folded[1] = 's';
3950                     pat = folded;
3951                     pat_end = folded + 2;
3952                 }
3953             }
3954             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3955                 U8 *s = pat;
3956                 U8 *d = folded;
3957                 int i;
3958
3959                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3960                     if (isASCII(*s)) {
3961                         *(d++) = (U8) toFOLD_LC(*s);
3962                         s++;
3963                     }
3964                     else {
3965                         STRLEN len;
3966                         _to_utf8_fold_flags(s,
3967                                             d,
3968                                             &len,
3969                                             FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3970                         d += len;
3971                         s += UTF8SKIP(s);
3972                     }
3973                 }
3974
3975                 pat = folded;
3976                 pat_end = d;
3977             }
3978         }
3979
3980         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3981              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
3982         {
3983             /* Multi-character folds require more context to sort out.  Also
3984              * PL_utf8_foldclosures used below doesn't handle them, so have to
3985              * be handled outside this routine */
3986             use_chrtest_void = TRUE;
3987         }
3988         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3989             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3990             if (c1 > 255) {
3991                 /* Load the folds hash, if not already done */
3992                 SV** listp;
3993                 if (! PL_utf8_foldclosures) {
3994                     _load_PL_utf8_foldclosures();
3995                 }
3996
3997                 /* The fold closures data structure is a hash with the keys
3998                  * being the UTF-8 of every character that is folded to, like
3999                  * 'k', and the values each an array of all code points that
4000                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4001                  * Multi-character folds are not included */
4002                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4003                                         (char *) pat,
4004                                         UTF8SKIP(pat),
4005                                         FALSE))))
4006                 {
4007                     /* Not found in the hash, therefore there are no folds
4008                     * containing it, so there is only a single character that
4009                     * could match */
4010                     c2 = c1;
4011                 }
4012                 else {  /* Does participate in folds */
4013                     AV* list = (AV*) *listp;
4014                     if (av_tindex(list) != 1) {
4015
4016                         /* If there aren't exactly two folds to this, it is
4017                          * outside the scope of this function */
4018                         use_chrtest_void = TRUE;
4019                     }
4020                     else {  /* There are two.  Get them */
4021                         SV** c_p = av_fetch(list, 0, FALSE);
4022                         if (c_p == NULL) {
4023                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4024                         }
4025                         c1 = SvUV(*c_p);
4026
4027                         c_p = av_fetch(list, 1, FALSE);
4028                         if (c_p == NULL) {
4029                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4030                         }
4031                         c2 = SvUV(*c_p);
4032
4033                         /* Folds that cross the 255/256 boundary are forbidden
4034                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4035                          * one is ASCIII.  Since the pattern character is above
4036                          * 255, and its only other match is below 256, the only
4037                          * legal match will be to itself.  We have thrown away
4038                          * the original, so have to compute which is the one
4039                          * above 255. */
4040                         if ((c1 < 256) != (c2 < 256)) {
4041                             if ((OP(text_node) == EXACTFL
4042                                  && ! IN_UTF8_CTYPE_LOCALE)
4043                                 || ((OP(text_node) == EXACTFA
4044                                     || OP(text_node) == EXACTFA_NO_TRIE)
4045                                     && (isASCII(c1) || isASCII(c2))))
4046                             {
4047                                 if (c1 < 256) {
4048                                     c1 = c2;
4049                                 }
4050                                 else {
4051                                     c2 = c1;
4052                                 }
4053                             }
4054                         }
4055                     }
4056                 }
4057             }
4058             else /* Here, c1 is <= 255 */
4059                 if (utf8_target
4060                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4061                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4062                     && ((OP(text_node) != EXACTFA
4063                         && OP(text_node) != EXACTFA_NO_TRIE)
4064                         || ! isASCII(c1)))
4065             {
4066                 /* Here, there could be something above Latin1 in the target
4067                  * which folds to this character in the pattern.  All such
4068                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4069                  * than two characters involved in their folds, so are outside
4070                  * the scope of this function */
4071                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4072                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4073                 }
4074                 else {
4075                     use_chrtest_void = TRUE;
4076                 }
4077             }
4078             else { /* Here nothing above Latin1 can fold to the pattern
4079                       character */
4080                 switch (OP(text_node)) {
4081
4082                     case EXACTFL:   /* /l rules */
4083                         c2 = PL_fold_locale[c1];
4084                         break;
4085
4086                     case EXACTF:   /* This node only generated for non-utf8
4087                                     patterns */
4088                         assert(! is_utf8_pat);
4089                         if (! utf8_target) {    /* /d rules */
4090                             c2 = PL_fold[c1];
4091                             break;
4092                         }
4093                         /* FALLTHROUGH */
4094                         /* /u rules for all these.  This happens to work for
4095                         * EXACTFA as nothing in Latin1 folds to ASCII */
4096                     case EXACTFA_NO_TRIE:   /* This node only generated for
4097                                             non-utf8 patterns */
4098                         assert(! is_utf8_pat);
4099                         /* FALLTHROUGH */
4100                     case EXACTFA:
4101                     case EXACTFU_SS:
4102                     case EXACTFU:
4103                         c2 = PL_fold_latin1[c1];
4104                         break;
4105
4106                     default:
4107                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4108                         NOT_REACHED; /* NOTREACHED */
4109                 }
4110             }
4111         }
4112     }
4113
4114     /* Here have figured things out.  Set up the returns */
4115     if (use_chrtest_void) {
4116         *c2p = *c1p = CHRTEST_VOID;
4117     }
4118     else if (utf8_target) {
4119         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4120             uvchr_to_utf8(c1_utf8, c1);
4121             uvchr_to_utf8(c2_utf8, c2);
4122         }
4123
4124         /* Invariants are stored in both the utf8 and byte outputs; Use
4125          * negative numbers otherwise for the byte ones.  Make sure that the
4126          * byte ones are the same iff the utf8 ones are the same */
4127         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4128         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4129                 ? *c2_utf8
4130                 : (c1 == c2)
4131                   ? CHRTEST_NOT_A_CP_1
4132                   : CHRTEST_NOT_A_CP_2;
4133     }
4134     else if (c1 > 255) {
4135        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4136                            can represent */
4137            return FALSE;
4138        }
4139
4140        *c1p = *c2p = c2;    /* c2 is the only representable value */
4141     }
4142     else {  /* c1 is representable; see about c2 */
4143        *c1p = c1;
4144        *c2p = (c2 < 256) ? c2 : c1;
4145     }
4146
4147     return TRUE;
4148 }
4149
4150 /* This creates a single number by combining two, with 'before' being like the
4151  * 10's digit, but this isn't necessarily base 10; it is base however many
4152  * elements of the enum there are */
4153 #define GCBcase(before, after) ((PL_GCB_ENUM_COUNT * before) + after)
4154
4155 STATIC bool
4156 S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
4157 {
4158     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4159      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4160
4161     switch (GCBcase(before, after)) {
4162
4163         /*  Break at the start and end of text.
4164             GB1.   sot ÷
4165             GB2.   ÷ eot
4166
4167             Break before and after controls except between CR and LF
4168             GB4.  ( Control | CR | LF )  ÷
4169             GB5.   ÷  ( Control | CR | LF )
4170
4171             Otherwise, break everywhere.
4172             GB10.  Any  ÷  Any */
4173         default:
4174             return TRUE;
4175
4176         /* Do not break between a CR and LF.
4177             GB3.  CR  ×  LF */
4178         case GCBcase(PL_GCB_CR, PL_GCB_LF):
4179             return FALSE;
4180
4181         /* Do not break Hangul syllable sequences.
4182             GB6.  L  ×  ( L | V | LV | LVT ) */
4183         case GCBcase(PL_GCB_L, PL_GCB_L):
4184         case GCBcase(PL_GCB_L, PL_GCB_V):
4185         case GCBcase(PL_GCB_L, PL_GCB_LV):
4186         case GCBcase(PL_GCB_L, PL_GCB_LVT):
4187             return FALSE;
4188
4189         /*  GB7.  ( LV | V )  ×  ( V | T ) */
4190         case GCBcase(PL_GCB_LV, PL_GCB_V):
4191         case GCBcase(PL_GCB_LV, PL_GCB_T):
4192         case GCBcase(PL_GCB_V, PL_GCB_V):
4193         case GCBcase(PL_GCB_V, PL_GCB_T):
4194             return FALSE;
4195
4196         /*  GB8.  ( LVT | T)  ×  T */
4197         case GCBcase(PL_GCB_LVT, PL_GCB_T):
4198         case GCBcase(PL_GCB_T, PL_GCB_T):
4199             return FALSE;
4200
4201         /* Do not break between regional indicator symbols.
4202             GB8a.  Regional_Indicator  ×  Regional_Indicator */
4203         case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Regional_Indicator):
4204             return FALSE;
4205
4206         /* Do not break before extending characters.
4207             GB9.     ×  Extend */
4208         case GCBcase(PL_GCB_Other, PL_GCB_Extend):
4209         case GCBcase(PL_GCB_Extend, PL_GCB_Extend):
4210         case GCBcase(PL_GCB_L, PL_GCB_Extend):
4211         case GCBcase(PL_GCB_LV, PL_GCB_Extend):
4212         case GCBcase(PL_GCB_LVT, PL_GCB_Extend):
4213         case GCBcase(PL_GCB_Prepend, PL_GCB_Extend):
4214         case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Extend):
4215         case GCBcase(PL_GCB_SpacingMark, PL_GCB_Extend):
4216         case GCBcase(PL_GCB_T, PL_GCB_Extend):
4217         case GCBcase(PL_GCB_V, PL_GCB_Extend):
4218             return FALSE;
4219
4220         /* Do not break before SpacingMarks, or after Prepend characters.
4221             GB9a.     ×  SpacingMark */
4222         case GCBcase(PL_GCB_Other, PL_GCB_SpacingMark):
4223         case GCBcase(PL_GCB_Extend, PL_GCB_SpacingMark):
4224         case GCBcase(PL_GCB_L, PL_GCB_SpacingMark):
4225         case GCBcase(PL_GCB_LV, PL_GCB_SpacingMark):
4226         case GCBcase(PL_GCB_LVT, PL_GCB_SpacingMark):
4227         case GCBcase(PL_GCB_Prepend, PL_GCB_SpacingMark):
4228         case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_SpacingMark):
4229         case GCBcase(PL_GCB_SpacingMark, PL_GCB_SpacingMark):
4230         case GCBcase(PL_GCB_T, PL_GCB_SpacingMark):
4231         case GCBcase(PL_GCB_V, PL_GCB_SpacingMark):
4232             return FALSE;
4233
4234         /* GB9b.  Prepend  ×   */
4235         case GCBcase(PL_GCB_Prepend, PL_GCB_Other):
4236         case GCBcase(PL_GCB_Prepend, PL_GCB_L):
4237         case GCBcase(PL_GCB_Prepend, PL_GCB_LV):
4238         case GCBcase(PL_GCB_Prepend, PL_GCB_LVT):
4239         case GCBcase(PL_GCB_Prepend, PL_GCB_Prepend):
4240         case GCBcase(PL_GCB_Prepend, PL_GCB_Regional_Indicator):
4241         case GCBcase(PL_GCB_Prepend, PL_GCB_T):
4242         case GCBcase(PL_GCB_Prepend, PL_GCB_V):
4243             return FALSE;
4244     }
4245
4246     NOT_REACHED; /* NOTREACHED */
4247 }
4248
4249 #define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
4250
4251 STATIC bool
4252 S_isSB(pTHX_ PL_SB_enum before,
4253              PL_SB_enum after,
4254              const U8 * const strbeg,
4255              const U8 * const curpos,
4256              const U8 * const strend,
4257              const bool utf8_target)
4258 {
4259     /* returns a boolean indicating if there is a Sentence Boundary Break
4260      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4261
4262     U8 * lpos = (U8 *) curpos;
4263     U8 * temp_pos;
4264     PL_SB_enum backup;
4265
4266     PERL_ARGS_ASSERT_ISSB;
4267
4268     /* Break at the start and end of text.
4269         SB1.  sot  ÷
4270         SB2.  ÷  eot */
4271     if (before == PL_SB_EDGE || after == PL_SB_EDGE) {
4272         return TRUE;
4273     }
4274
4275     /* SB 3: Do not break within CRLF. */
4276     if (before == PL_SB_CR && after == PL_SB_LF) {
4277         return FALSE;
4278     }
4279
4280     /* Break after paragraph separators.  (though why CR and LF are considered
4281      * so is beyond me (khw)
4282        SB4.  Sep | CR | LF  ÷ */
4283     if (before == PL_SB_Sep || before == PL_SB_CR || before == PL_SB_LF) {
4284         return TRUE;
4285     }
4286
4287     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4288      * (See Section 6.2, Replacing Ignore Rules.)
4289         SB5.  X (Extend | Format)*  →  X */
4290     if (after == PL_SB_Extend || after == PL_SB_Format) {
4291         return FALSE;
4292     }
4293
4294     if (before == PL_SB_Extend || before == PL_SB_Format) {
4295         before = backup_one_SB(strbeg, &lpos, utf8_target);
4296     }
4297
4298     /* Do not break after ambiguous terminators like period, if they are
4299      * immediately followed by a number or lowercase letter, if they are
4300      * between uppercase letters, if the first following letter (optionally
4301      * after certain punctuation) is lowercase, or if they are followed by
4302      * "continuation" punctuation such as comma, colon, or semicolon. For
4303      * example, a period may be an abbreviation or numeric period, and thus may
4304      * not mark the end of a sentence.
4305
4306      * SB6. ATerm  ×  Numeric */
4307     if (before == PL_SB_ATerm && after == PL_SB_Numeric) {
4308         return FALSE;
4309     }
4310
4311     /* SB7.  Upper ATerm  ×  Upper */
4312     if (before == PL_SB_ATerm && after == PL_SB_Upper) {
4313         temp_pos = lpos;
4314         if (PL_SB_Upper == backup_one_SB(strbeg, &temp_pos, utf8_target)) {
4315             return FALSE;
4316         }
4317     }
4318
4319     /* SB8a.  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4320      * SB10.  (STerm | ATerm) Close* Sp*  ×  ( Sp | Sep | CR | LF )      */
4321     backup = before;
4322     temp_pos = lpos;
4323     while (backup == PL_SB_Sp) {
4324         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4325     }
4326     while (backup == PL_SB_Close) {
4327         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4328     }
4329     if ((backup == PL_SB_STerm || backup == PL_SB_ATerm)
4330         && (   after == PL_SB_SContinue
4331             || after == PL_SB_STerm
4332             || after == PL_SB_ATerm
4333             || after == PL_SB_Sp
4334             || after == PL_SB_Sep
4335             || after == PL_SB_CR
4336             || after == PL_SB_LF))
4337     {
4338         return FALSE;
4339     }
4340
4341     /* SB8.  ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
4342      *                                              STerm | ATerm) )* Lower */
4343     if (backup == PL_SB_ATerm) {
4344         U8 * rpos = (U8 *) curpos;
4345         PL_SB_enum later = after;
4346
4347         while (    later != PL_SB_OLetter
4348                 && later != PL_SB_Upper
4349                 && later != PL_SB_Lower
4350                 && later != PL_SB_Sep
4351                 && later != PL_SB_CR
4352                 && later != PL_SB_LF
4353                 && later != PL_SB_STerm
4354                 && later != PL_SB_ATerm
4355                 && later != PL_SB_EDGE)
4356         {
4357             later = advance_one_SB(&rpos, strend, utf8_target);
4358         }
4359         if (later == PL_SB_Lower) {
4360             return FALSE;
4361         }
4362     }
4363
4364     /* Break after sentence terminators, but include closing punctuation,
4365      * trailing spaces, and a paragraph separator (if present). [See note
4366      * below.]
4367      * SB9.  ( STerm | ATerm ) Close*  ×  ( Close | Sp | Sep | CR | LF ) */
4368     backup = before;
4369     temp_pos = lpos;
4370     while (backup == PL_SB_Close) {
4371         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4372     }
4373     if ((backup == PL_SB_STerm || backup == PL_SB_ATerm)
4374         && (   after == PL_SB_Close
4375             || after == PL_SB_Sp
4376             || after == PL_SB_Sep
4377             || after == PL_SB_CR
4378             || after == PL_SB_LF))
4379     {
4380         return FALSE;
4381     }
4382
4383
4384     /* SB11.  ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )?  ÷ */
4385     temp_pos = lpos;
4386     backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4387     if (   backup == PL_SB_Sep
4388         || backup == PL_SB_CR
4389         || backup == PL_SB_LF)
4390     {
4391         lpos = temp_pos;
4392     }
4393     else {
4394         backup = before;
4395     }
4396     while (backup == PL_SB_Sp) {
4397         backup = backup_one_SB(strbeg, &lpos, utf8_target);
4398     }
4399     while (backup == PL_SB_Close) {
4400         backup = backup_one_SB(strbeg, &lpos, utf8_target);
4401     }
4402     if (backup == PL_SB_STerm || backup == PL_SB_ATerm) {
4403         return TRUE;
4404     }
4405
4406     /* Otherwise, do not break.
4407     SB12.  Any  ×  Any */
4408
4409     return FALSE;
4410 }
4411
4412 STATIC PL_SB_enum
4413 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4414 {
4415     PL_SB_enum sb;
4416
4417     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4418
4419     if (*curpos >= strend) {
4420         return PL_SB_EDGE;
4421     }
4422
4423     if (utf8_target) {
4424         do {
4425             *curpos += UTF8SKIP(*curpos);
4426             if (*curpos >= strend) {
4427                 return PL_SB_EDGE;
4428             }
4429             sb = getSB_VAL_UTF8(*curpos, strend);
4430         } while (sb == PL_SB_Extend || sb == PL_SB_Format);
4431     }
4432     else {
4433         do {
4434             (*curpos)++;
4435             if (*curpos >= strend) {
4436                 return PL_SB_EDGE;
4437             }
4438             sb = getSB_VAL_CP(**curpos);
4439         } while (sb == PL_SB_Extend || sb == PL_SB_Format);
4440     }
4441
4442     return sb;
4443 }
4444
4445 STATIC PL_SB_enum
4446 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4447 {
4448     PL_SB_enum sb;
4449
4450     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4451
4452     if (*curpos < strbeg) {
4453         return PL_SB_EDGE;
4454     }
4455
4456     if (utf8_target) {
4457         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4458         if (! prev_char_pos) {
4459             return PL_SB_EDGE;
4460         }
4461
4462         /* Back up over Extend and Format.  curpos is always just to the right
4463          * of the characater whose value we are getting */
4464         do {
4465             U8 * prev_prev_char_pos;
4466             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4467                                                                       strbeg)))
4468             {
4469                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4470                 *curpos = prev_char_pos;
4471                 prev_char_pos = prev_prev_char_pos;
4472             }
4473             else {
4474                 *curpos = (U8 *) strbeg;
4475                 return PL_SB_EDGE;
4476             }
4477         } while (sb == PL_SB_Extend || sb == PL_SB_Format);
4478     }
4479     else {
4480         do {
4481             if (*curpos - 2 < strbeg) {
4482                 *curpos = (U8 *) strbeg;
4483                 return PL_SB_EDGE;
4484             }
4485             (*curpos)--;
4486             sb = getSB_VAL_CP(*(*curpos - 1));
4487         } while (sb == PL_SB_Extend || sb == PL_SB_Format);
4488     }
4489
4490     return sb;
4491 }
4492
4493 #define WBcase(before, after) ((PL_WB_ENUM_COUNT * before) + after)
4494
4495 STATIC bool
4496 S_isWB(pTHX_ PL_WB_enum previous,
4497              PL_WB_enum before,
4498              PL_WB_enum after,
4499              const U8 * const strbeg,
4500              const U8 * const curpos,
4501              const U8 * const strend,
4502              const bool utf8_target)
4503 {
4504     /*  Return a boolean as to if the boundary between 'before' and 'after' is
4505      *  a Unicode word break, using their published algorithm.  Context may be
4506      *  needed to make this determination.  If the value for the character
4507      *  before 'before' is known, it is passed as 'previous'; otherwise that
4508      *  should be set to PL_WB_UNKNOWN.  The other input parameters give the
4509      *  boundaries and current position in the matching of the string.  That
4510      *  is, 'curpos' marks the position where the character whose wb value is
4511      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
4512
4513     U8 * before_pos = (U8 *) curpos;
4514     U8 * after_pos = (U8 *) curpos;
4515
4516     PERL_ARGS_ASSERT_ISWB;
4517
4518     /* WB1 and WB2: Break at the start and end of text. */
4519     if (before == PL_WB_EDGE || after == PL_WB_EDGE) {
4520         return TRUE;
4521     }
4522
4523     /* WB 3: Do not break within CRLF. */
4524     if (before == PL_WB_CR && after == PL_WB_LF) {
4525         return FALSE;
4526     }
4527
4528     /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
4529      * and LF) */
4530     if (   before == PL_WB_CR || before == PL_WB_LF || before == PL_WB_Newline
4531         || after ==  PL_WB_CR || after ==  PL_WB_LF || after ==  PL_WB_Newline)
4532     {
4533         return TRUE;
4534     }
4535
4536     /* Ignore Format and Extend characters, except when they appear at the
4537      * beginning of a region of text.
4538      * WB4.  X (Extend | Format)*  →  X. */
4539
4540     if (after == PL_WB_Extend || after == PL_WB_Format) {
4541         return FALSE;
4542     }
4543
4544     if (before == PL_WB_Extend || before == PL_WB_Format) {
4545         before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4546     }
4547
4548     switch (WBcase(before, after)) {
4549             /* Otherwise, break everywhere (including around ideographs).
4550                 WB14.  Any  ÷  Any */
4551             default:
4552                 return TRUE;
4553
4554             /* Do not break between most letters.
4555                 WB5.  (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
4556             case WBcase(PL_WB_ALetter, PL_WB_ALetter):
4557             case WBcase(PL_WB_ALetter, PL_WB_Hebrew_Letter):
4558             case WBcase(PL_WB_Hebrew_Letter, PL_WB_ALetter):
4559             case WBcase(PL_WB_Hebrew_Letter, PL_WB_Hebrew_Letter):
4560                 return FALSE;
4561
4562             /* Do not break letters across certain punctuation.
4563                 WB6.  (ALetter | Hebrew_Letter)
4564                         × (MidLetter | MidNumLet | Single_Quote) (ALetter
4565                                                             | Hebrew_Letter) */
4566             case WBcase(PL_WB_ALetter, PL_WB_MidLetter):
4567             case WBcase(PL_WB_ALetter, PL_WB_MidNumLet):
4568             case WBcase(PL_WB_ALetter, PL_WB_Single_Quote):
4569             case WBcase(PL_WB_Hebrew_Letter, PL_WB_MidLetter):
4570             case WBcase(PL_WB_Hebrew_Letter, PL_WB_MidNumLet):
4571             /*case WBcase(PL_WB_Hebrew_Letter, PL_WB_Single_Quote):*/
4572                 after = advance_one_WB(&after_pos, strend, utf8_target);
4573                 return after != PL_WB_ALetter && after != PL_WB_Hebrew_Letter;
4574
4575             /* WB7.  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
4576              *                    Single_Quote) ×  (ALetter | Hebrew_Letter) */
4577             case WBcase(PL_WB_MidLetter, PL_WB_ALetter):
4578             case WBcase(PL_WB_MidLetter, PL_WB_Hebrew_Letter):
4579             case WBcase(PL_WB_MidNumLet, PL_WB_ALetter):
4580             case WBcase(PL_WB_MidNumLet, PL_WB_Hebrew_Letter):
4581             case WBcase(PL_WB_Single_Quote, PL_WB_ALetter):
4582             case WBcase(PL_WB_Single_Quote, PL_WB_Hebrew_Letter):
4583                 before
4584                   = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4585                 return before != PL_WB_ALetter && before != PL_WB_Hebrew_Letter;
4586
4587             /* WB7a.  Hebrew_Letter  ×  Single_Quote */
4588             case WBcase(PL_WB_Hebrew_Letter, PL_WB_Single_Quote):
4589                 return FALSE;
4590
4591             /* WB7b.  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
4592             case WBcase(PL_WB_Hebrew_Letter, PL_WB_Double_Quote):
4593                 return advance_one_WB(&after_pos, strend, utf8_target)
4594                                                         != PL_WB_Hebrew_Letter;
4595
4596             /* WB7c.  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
4597             case WBcase(PL_WB_Double_Quote, PL_WB_Hebrew_Letter):
4598                 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4599                                                         != PL_WB_Hebrew_Letter;
4600
4601             /* Do not break within sequences of digits, or digits adjacent to
4602              * letters (“3a”, or “A3”).
4603                 WB8.  Numeric  ×  Numeric */
4604             case WBcase(PL_WB_Numeric, PL_WB_Numeric):
4605                 return FALSE;
4606
4607             /* WB9.  (ALetter | Hebrew_Letter)  ×  Numeric */
4608             case WBcase(PL_WB_ALetter, PL_WB_Numeric):
4609             case WBcase(PL_WB_Hebrew_Letter, PL_WB_Numeric):
4610                 return FALSE;
4611
4612             /* WB10.  Numeric  ×  (ALetter | Hebrew_Letter) */
4613             case WBcase(PL_WB_Numeric, PL_WB_ALetter):
4614             case WBcase(PL_WB_Numeric, PL_WB_Hebrew_Letter):
4615                 return FALSE;
4616
4617             /* Do not break within sequences, such as “3.2” or “3,456.789”.
4618                 WB11.   Numeric (MidNum | MidNumLet | Single_Quote)  ×  Numeric
4619              */
4620             case WBcase(PL_WB_MidNum, PL_WB_Numeric):
4621             case WBcase(PL_WB_MidNumLet, PL_WB_Numeric):
4622             case WBcase(PL_WB_Single_Quote, PL_WB_Numeric):
4623                 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4624                                                                != PL_WB_Numeric;
4625
4626             /*  WB12.   Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric
4627              *  */
4628             case WBcase(PL_WB_Numeric, PL_WB_MidNum):
4629             case WBcase(PL_WB_Numeric, PL_WB_MidNumLet):
4630             case WBcase(PL_WB_Numeric, PL_WB_Single_Quote):
4631                 return advance_one_WB(&after_pos, strend, utf8_target)
4632                                                                != PL_WB_Numeric;
4633
4634             /* Do not break between Katakana.
4635                WB13.  Katakana  ×  Katakana */
4636             case WBcase(PL_WB_Katakana, PL_WB_Katakana):
4637                 return FALSE;
4638
4639             /* Do not break from extenders.
4640                WB13a.  (ALetter | Hebrew_Letter | Numeric | Katakana |
4641                                             ExtendNumLet)  ×  ExtendNumLet */
4642             case WBcase(PL_WB_ALetter, PL_WB_ExtendNumLet):
4643             case WBcase(PL_WB_Hebrew_Letter, PL_WB_ExtendNumLet):
4644             case WBcase(PL_WB_Numeric, PL_WB_ExtendNumLet):
4645             case WBcase(PL_WB_Katakana, PL_WB_ExtendNumLet):
4646             case WBcase(PL_WB_ExtendNumLet, PL_WB_ExtendNumLet):
4647                 return FALSE;
4648
4649             /* WB13b.  ExtendNumLet  ×  (ALetter | Hebrew_Letter | Numeric
4650              *                                                 | Katakana) */
4651             case WBcase(PL_WB_ExtendNumLet, PL_WB_ALetter):
4652             case WBcase(PL_WB_ExtendNumLet, PL_WB_Hebrew_Letter):
4653             case WBcase(PL_WB_ExtendNumLet, PL_WB_Numeric):
4654             case WBcase(PL_WB_ExtendNumLet, PL_WB_Katakana):
4655                 return FALSE;
4656
4657             /* Do not break between regional indicator symbols.
4658                WB13c.  Regional_Indicator  ×  Regional_Indicator */
4659             case WBcase(PL_WB_Regional_Indicator, PL_WB_Regional_Indicator):
4660                 return FALSE;
4661
4662     }
4663
4664     NOT_REACHED; /* NOTREACHED */
4665 }
4666
4667 STATIC PL_WB_enum
4668 S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4669 {
4670     PL_WB_enum wb;
4671
4672     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
4673
4674     if (*curpos >= strend) {
4675         return PL_WB_EDGE;
4676     }
4677
4678     if (utf8_target) {
4679
4680         /* Advance over Extend and Format */
4681         do {
4682             *curpos += UTF8SKIP(*curpos);
4683             if (*curpos >= strend) {
4684                 return PL_WB_EDGE;
4685             }
4686             wb = getWB_VAL_UTF8(*curpos, strend);
4687         } while (wb == PL_WB_Extend || wb == PL_WB_Format);
4688     }
4689     else {
4690         do {
4691             (*curpos)++;
4692             if (*curpos >= strend) {
4693                 return PL_WB_EDGE;
4694             }
4695             wb = getWB_VAL_CP(**curpos);
4696         } while (wb == PL_WB_Extend || wb == PL_WB_Format);
4697     }
4698
4699     return wb;
4700 }
4701
4702 STATIC PL_WB_enum
4703 S_backup_one_WB(pTHX_ PL_WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4704 {
4705     PL_WB_enum wb;
4706
4707     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
4708
4709     /* If we know what the previous character's break value is, don't have
4710         * to look it up */
4711     if (*previous != PL_WB_UNKNOWN) {
4712         wb = *previous;
4713         *previous = PL_WB_UNKNOWN;
4714         /* XXX Note that doesn't change curpos, and maybe should */
4715
4716         /* But we always back up over these two types */
4717         if (wb != PL_WB_Extend && wb != PL_WB_Format) {
4718             return wb;
4719         }
4720     }
4721
4722     if (*curpos < strbeg) {
4723         return PL_WB_EDGE;
4724     }
4725
4726     if (utf8_target) {
4727         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4728         if (! prev_char_pos) {
4729             return PL_WB_EDGE;
4730         }
4731
4732         /* Back up over Extend and Format.  curpos is always just to the right
4733          * of the characater whose value we are getting */
4734         do {
4735             U8 * prev_prev_char_pos;
4736             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
4737                                                    -1,
4738                                                    strbeg)))
4739             {
4740                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4741                 *curpos = prev_char_pos;
4742                 prev_char_pos = prev_prev_char_pos;
4743             }
4744             else {
4745                 *curpos = (U8 *) strbeg;
4746                 return PL_WB_EDGE;
4747             }
4748         } while (wb == PL_WB_Extend || wb == PL_WB_Format);
4749     }
4750     else {
4751         do {
4752             if (*curpos - 2 < strbeg) {
4753                 *curpos = (U8 *) strbeg;
4754                 return PL_WB_EDGE;
4755             }
4756             (*curpos)--;
4757             wb = getWB_VAL_CP(*(*curpos - 1));
4758         } while (wb == PL_WB_Extend || wb == PL_WB_Format);
4759     }
4760
4761     return wb;
4762 }
4763
4764 /* returns -1 on failure, $+[0] on success */
4765 STATIC SSize_t
4766 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
4767 {
4768 #if PERL_VERSION < 9 && !defined(PERL_CORE)
4769     dMY_CXT;
4770 #endif
4771     dVAR;
4772     const bool utf8_target = reginfo->is_utf8_target;
4773     const U32 uniflags = UTF8_ALLOW_DEFAULT;
4774     REGEXP *rex_sv = reginfo->prog;
4775     regexp *rex = ReANY(rex_sv);
4776     RXi_GET_DECL(rex,rexi);
4777     /* the current state. This is a cached copy of PL_regmatch_state */
4778     regmatch_state *st;
4779     /* cache heavy used fields of st in registers */
4780     regnode *scan;
4781     regnode *next;
4782     U32 n = 0;  /* general value; init to avoid compiler warning */
4783     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
4784     char *locinput = startpos;
4785     char *pushinput; /* where to continue after a PUSH */
4786     I32 nextchr;   /* is always set to UCHARAT(locinput) */
4787
4788     bool result = 0;        /* return value of S_regmatch */
4789     int depth = 0;          /* depth of backtrack stack */
4790     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
4791     const U32 max_nochange_depth =
4792         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
4793         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
4794     regmatch_state *yes_state = NULL; /* state to pop to on success of
4795                                                             subpattern */
4796     /* mark_state piggy backs on the yes_state logic so that when we unwind
4797        the stack on success we can update the mark_state as we go */
4798     regmatch_state *mark_state = NULL; /* last mark state we have seen */
4799     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
4800     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
4801     U32 state_num;
4802     bool no_final = 0;      /* prevent failure from backtracking? */
4803     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
4804     char *startpoint = locinput;
4805     SV *popmark = NULL;     /* are we looking for a mark? */
4806     SV *sv_commit = NULL;   /* last mark name seen in failure */
4807     SV *sv_yes_mark = NULL; /* last mark name we have seen
4808                                during a successful match */
4809     U32 lastopen = 0;       /* last open we saw */
4810     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4811     SV* const oreplsv = GvSVn(PL_replgv);
4812     /* these three flags are set by various ops to signal information to
4813      * the very next op. They have a useful lifetime of exactly one loop
4814      * iteration, and are not preserved or restored by state pushes/pops
4815      */
4816     bool sw = 0;            /* the condition value in (?(cond)a|b) */
4817     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
4818     int logical = 0;        /* the following EVAL is:
4819                                 0: (?{...})
4820                                 1: (?(?{...})X|Y)
4821                                 2: (??{...})
4822                                or the following IFMATCH/UNLESSM is:
4823                                 false: plain (?=foo)
4824                                 true:  used as a condition: (?(?=foo))
4825                             */
4826     PAD* last_pad = NULL;
4827     dMULTICALL;
4828     I32 gimme = G_SCALAR;
4829     CV *caller_cv = NULL;       /* who called us */
4830     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
4831     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
4832     U32 maxopenparen = 0;       /* max '(' index seen so far */
4833     int to_complement;  /* Invert the result? */
4834     _char_class_number classnum;
4835     bool is_utf8_pat = reginfo->is_utf8_pat;
4836     bool match = FALSE;
4837
4838
4839 #ifdef DEBUGGING
4840     GET_RE_DEBUG_FLAGS_DECL;
4841 #endif
4842
4843     /* protect against undef(*^R) */
4844     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
4845
4846     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
4847     multicall_oldcatch = 0;
4848     multicall_cv = NULL;
4849     cx = NULL;
4850     PERL_UNUSED_VAR(multicall_cop);
4851     PERL_UNUSED_VAR(newsp);
4852
4853
4854     PERL_ARGS_ASSERT_REGMATCH;
4855
4856     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
4857             PerlIO_printf(Perl_debug_log,"regmatch start\n");
4858     }));
4859
4860     st = PL_regmatch_state;
4861
4862     /* Note that nextchr is a byte even in UTF */
4863     SET_nextchr;
4864     scan = prog;
4865     while (scan != NULL) {
4866
4867         DEBUG_EXECUTE_r( {
4868             SV * const prop = sv_newmortal();
4869             regnode *rnext=regnext(scan);
4870             DUMP_EXEC_POS( locinput, scan, utf8_target );
4871             regprop(rex, prop, scan, reginfo, NULL);
4872
4873             PerlIO_printf(Perl_debug_log,
4874                     "%3"IVdf":%*s%s(%"IVdf")\n",
4875                     (IV)(scan - rexi->program), depth*2, "",
4876                     SvPVX_const(prop),
4877                     (PL_regkind[OP(scan)] == END || !rnext) ?
4878                         0 : (IV)(rnext - rexi->program));
4879         });
4880
4881         next = scan + NEXT_OFF(scan);
4882         if (next == scan)
4883             next = NULL;
4884         state_num = OP(scan);
4885
4886       reenter_switch:
4887         to_complement = 0;
4888
4889         SET_nextchr;
4890         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4891
4892         switch (state_num) {
4893         case SBOL: /*  /^../ and /\A../  */
4894             if (locinput == reginfo->strbeg)
4895                 break;
4896             sayNO;
4897
4898         case MBOL: /*  /^../m  */
4899             if (locinput == reginfo->strbeg ||
4900                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4901             {
4902                 break;
4903             }
4904             sayNO;
4905
4906         case GPOS: /*  \G  */
4907             if (locinput == reginfo->ganch)
4908                 break;
4909             sayNO;
4910
4911         case KEEPS: /*   \K  */
4912             /* update the startpoint */
4913             st->u.keeper.val = rex->offs[0].start;
4914             rex->offs[0].start = locinput - reginfo->strbeg;
4915             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4916             /* NOTREACHED */
4917             NOT_REACHED; /* NOTREACHED */
4918
4919         case KEEPS_next_fail:
4920             /* rollback the start point change */
4921             rex->offs[0].start = st->u.keeper.val;
4922             sayNO_SILENT;
4923             /* NOTREACHED */
4924             NOT_REACHED; /* NOTREACHED */
4925
4926         case MEOL: /* /..$/m  */
4927             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4928                 sayNO;
4929             break;
4930
4931         case SEOL: /* /..$/  */
4932             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4933                 sayNO;
4934             if (reginfo->strend - locinput > 1)
4935                 sayNO;
4936             break;
4937
4938         case EOS: /*  \z  */
4939             if (!NEXTCHR_IS_EOS)
4940                 sayNO;
4941             break;
4942
4943         case SANY: /*  /./s  */
4944             if (NEXTCHR_IS_EOS)
4945                 sayNO;
4946             goto increment_locinput;
4947
4948         case CANY: /*  \C  */
4949             if (NEXTCHR_IS_EOS)
4950                 sayNO;
4951             locinput++;
4952             break;
4953
4954         case REG_ANY: /*  /./  */
4955             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4956                 sayNO;
4957             goto increment_locinput;
4958
4959
4960 #undef  ST
4961 #define ST st->u.trie
4962         case TRIEC: /* (ab|cd) with known charclass */
4963             /* In this case the charclass data is available inline so
4964                we can fail fast without a lot of extra overhead.
4965              */
4966             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4967                 DEBUG_EXECUTE_r(
4968                     PerlIO_printf(Perl_debug_log,
4969                               "%*s  %sfailed to match trie start class...%s\n",
4970                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4971                 );
4972                 sayNO_SILENT;
4973                 /* NOTREACHED */
4974                 NOT_REACHED; /* NOTREACHED */
4975             }
4976             /* FALLTHROUGH */
4977         case TRIE:  /* (ab|cd)  */
4978             /* the basic plan of execution of the trie is:
4979              * At the beginning, run though all the states, and
4980              * find the longest-matching word. Also remember the position
4981              * of the shortest matching word. For example, this pattern:
4982              *    1  2 3 4    5
4983              *    ab|a|x|abcd|abc
4984              * when matched against the string "abcde", will generate
4985              * accept states for all words except 3, with the longest
4986              * matching word being 4, and the shortest being 2 (with
4987              * the position being after char 1 of the string).
4988              *
4989              * Then for each matching word, in word order (i.e. 1,2,4,5),
4990              * we run the remainder of the pattern; on each try setting
4991              * the current position to the character following the word,
4992              * returning to try the next word on failure.
4993              *
4994              * We avoid having to build a list of words at runtime by
4995              * using a compile-time structure, wordinfo[].prev, which
4996              * gives, for each word, the previous accepting word (if any).
4997              * In the case above it would contain the mappings 1->2, 2->0,
4998              * 3->0, 4->5, 5->1.  We can use this table to generate, from
4999              * the longest word (4 above), a list of all words, by
5000              * following the list of prev pointers; this gives us the
5001              * unordered list 4,5,1,2. Then given the current word we have
5002              * just tried, we can go through the list and find the
5003              * next-biggest word to try (so if we just failed on word 2,
5004              * the next in the list is 4).
5005              *
5006              * Since at runtime we don't record the matching position in
5007              * the string for each word, we have to work that out for
5008              * each word we're about to process. The wordinfo table holds
5009              * the character length of each word; given that we recorded
5010              * at the start: the position of the shortest word and its
5011              * length in chars, we just need to move the pointer the
5012              * difference between the two char lengths. Depending on
5013              * Unicode status and folding, that's cheap or expensive.
5014              *
5015              * This algorithm is optimised for the case where are only a
5016              * small number of accept states, i.e. 0,1, or maybe 2.
5017              * With lots of accepts states, and having to try all of them,
5018              * it becomes quadratic on number of accept states to find all
5019              * the next words.
5020              */
5021
5022             {
5023                 /* what type of TRIE am I? (utf8 makes this contextual) */
5024                 DECL_TRIE_TYPE(scan);
5025
5026                 /* what trie are we using right now */
5027                 reg_trie_data * const trie
5028                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5029                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5030                 U32 state = trie->startstate;
5031
5032                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5033                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5034                     if (utf8_target
5035                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5036                         && scan->flags == EXACTL)
5037                     {
5038                         /* We only output for EXACTL, as we let the folder
5039                          * output this message for EXACTFLU8 to avoid
5040                          * duplication */
5041                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5042                                                                reginfo->strend);
5043                     }
5044                 }
5045                 if (   trie->bitmap
5046                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5047                 {
5048                     if (trie->states[ state ].wordnum) {
5049                          DEBUG_EXECUTE_r(
5050                             PerlIO_printf(Perl_debug_log,
5051                                           "%*s  %smatched empty string...%s\n",
5052                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5053                         );
5054                         if (!trie->jump)
5055                             break;
5056                     } else {
5057                         DEBUG_EXECUTE_r(
5058                             PerlIO_printf(Perl_debug_log,
5059                                           "%*s  %sfailed to match trie start class...%s\n",
5060                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5061                         );
5062                         sayNO_SILENT;
5063                    }
5064                 }
5065
5066             {
5067                 U8 *uc = ( U8* )locinput;
5068
5069                 STRLEN len = 0;
5070                 STRLEN foldlen = 0;
5071                 U8 *uscan = (U8*)NULL;
5072                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5073                 U32 charcount = 0; /* how many input chars we have matched */
5074                 U32 accepted = 0; /* have we seen any accepting states? */
5075
5076                 ST.jump = trie->jump;
5077                 ST.me = scan;
5078                 ST.firstpos = NULL;
5079                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5080                 ST.nextword = 0;
5081
5082                 /* fully traverse the TRIE; note the position of the
5083                    shortest accept state and the wordnum of the longest
5084                    accept state */
5085
5086                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5087                     U32 base = trie->states[ state ].trans.base;
5088                     UV uvc = 0;
5089                     U16 charid = 0;
5090                     U16 wordnum;
5091                     wordnum = trie->states[ state ].wordnum;
5092
5093                     if (wordnum) { /* it's an accept state */
5094                         if (!accepted) {
5095                             accepted = 1;
5096                             /* record first match position */
5097                             if (ST.longfold) {
5098                                 ST.firstpos = (U8*)locinput;
5099                                 ST.firstchars = 0;
5100                             }
5101                             else {
5102                                 ST.firstpos = uc;
5103                                 ST.firstchars = charcount;
5104                             }
5105                         }
5106                         if (!ST.nextword || wordnum < ST.nextword)
5107                             ST.nextword = wordnum;
5108                         ST.topword = wordnum;
5109                     }
5110
5111                     DEBUG_TRIE_EXECUTE_r({
5112                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
5113                                 PerlIO_printf( Perl_debug_log,
5114                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
5115                                     2+depth * 2, "", PL_colors[4],
5116                                     (UV)state, (accepted ? 'Y' : 'N'));
5117                     });
5118
5119                     /* read a char and goto next state */
5120                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5121                         I32 offset;
5122                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5123                                              uscan, len, uvc, charid, foldlen,
5124                                              foldbuf, uniflags);
5125                         charcount++;
5126                         if (foldlen>0)
5127                             ST.longfold = TRUE;
5128                         if (charid &&
5129                              ( ((offset =
5130                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5131
5132                              && ((U32)offset < trie->lasttrans)
5133                              && trie->trans[offset].check == state)
5134                         {
5135                             state = trie->trans[offset].next;
5136                         }
5137                         else {
5138                             state = 0;
5139                         }
5140                         uc += len;
5141
5142                     }
5143                     else {
5144                         state = 0;
5145                     }
5146                     DEBUG_TRIE_EXECUTE_r(
5147                         PerlIO_printf( Perl_debug_log,
5148                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5149                             charid, uvc, (UV)state, PL_colors[5] );
5150                     );
5151                 }
5152                 if (!accepted)
5153                    sayNO;
5154
5155                 /* calculate total number of accept states */
5156                 {
5157                     U16 w = ST.topword;
5158                     accepted = 0;
5159                     while (w) {
5160                         w = trie->wordinfo[w].prev;
5161                         accepted++;
5162                     }
5163                     ST.accepted = accepted;
5164                 }
5165
5166                 DEBUG_EXECUTE_r(
5167                     PerlIO_printf( Perl_debug_log,
5168                         "%*s  %sgot %"IVdf" possible matches%s\n",
5169                         REPORT_CODE_OFF + depth * 2, "",
5170                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5171                 );
5172                 goto trie_first_try; /* jump into the fail handler */
5173             }}
5174             /* NOTREACHED */
5175             NOT_REACHED; /* NOTREACHED */
5176
5177         case TRIE_next_fail: /* we failed - try next alternative */
5178         {
5179             U8 *uc;
5180             if ( ST.jump) {
5181                 REGCP_UNWIND(ST.cp);
5182                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5183             }
5184             if (!--ST.accepted) {
5185                 DEBUG_EXECUTE_r({
5186                     PerlIO_printf( Perl_debug_log,
5187                         "%*s  %sTRIE failed...%s\n",
5188                         REPORT_CODE_OFF+depth*2, "",
5189                         PL_colors[4],
5190                         PL_colors[5] );
5191                 });
5192                 sayNO_SILENT;
5193             }
5194             {
5195                 /* Find next-highest word to process.  Note that this code
5196                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5197                 U16 min = 0;
5198                 U16 word;
5199                 U16 const nextword = ST.nextword;
5200                 reg_trie_wordinfo * const wordinfo
5201                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5202                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5203                     if (word > nextword && (!min || word < min))
5204                         min = word;
5205                 }
5206                 ST.nextword = min;
5207             }
5208
5209           trie_first_try:
5210             if (do_cutgroup) {
5211                 do_cutgroup = 0;
5212                 no_final = 0;
5213             }
5214
5215             if ( ST.jump) {
5216                 ST.lastparen = rex->lastparen;
5217                 ST.lastcloseparen = rex->lastcloseparen;
5218                 REGCP_SET(ST.cp);
5219             }
5220
5221             /* find start char of end of current word */
5222             {
5223                 U32 chars; /* how many chars to skip */
5224                 reg_trie_data * const trie
5225                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5226
5227                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5228                             >=  ST.firstchars);
5229                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5230                             - ST.firstchars;
5231                 uc = ST.firstpos;
5232
5233                 if (ST.longfold) {
5234                     /* the hard option - fold each char in turn and find
5235                      * its folded length (which may be different */
5236                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5237                     STRLEN foldlen;
5238                     STRLEN len;
5239                     UV uvc;
5240                     U8 *uscan;
5241
5242                     while (chars) {
5243                         if (utf8_target) {
5244                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5245                                                     uniflags);
5246                             uc += len;
5247                         }
5248                         else {
5249                             uvc = *uc;
5250                             uc++;
5251                         }
5252                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5253                         uscan = foldbuf;
5254                         while (foldlen) {
5255                             if (!--chars)
5256                                 break;
5257                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5258                                             uniflags);
5259                             uscan += len;
5260                             foldlen -= len;
5261                         }
5262                     }
5263                 }
5264                 else {
5265                     if (utf8_target)
5266                         while (chars--)
5267                             uc += UTF8SKIP(uc);
5268                     else
5269                         uc += chars;
5270                 }
5271             }
5272
5273             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5274                             ? ST.jump[ST.nextword]
5275                             : NEXT_OFF(ST.me));
5276
5277             DEBUG_EXECUTE_r({
5278                 PerlIO_printf( Perl_debug_log,
5279                     "%*s  %sTRIE matched word #%d, continuing%s\n",
5280                     REPORT_CODE_OFF+depth*2, "",
5281                     PL_colors[4],
5282                     ST.nextword,
5283                     PL_colors[5]
5284                     );
5285             });
5286
5287             if (ST.accepted > 1 || has_cutgroup) {
5288                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5289                 /* NOTREACHED */
5290                 NOT_REACHED; /* NOTREACHED */
5291             }
5292             /* only one choice left - just continue */
5293             DEBUG_EXECUTE_r({
5294                 AV *const trie_words
5295                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5296                 SV ** const tmp = trie_words
5297                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5298                 SV *sv= tmp ? sv_newmortal() : NULL;
5299
5300                 PerlIO_printf( Perl_debug_log,
5301                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
5302                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
5303                     ST.nextword,
5304                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5305                             PL_colors[0], PL_colors[1],
5306                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5307                         )
5308                     : "not compiled under -Dr",
5309                     PL_colors[5] );
5310             });
5311
5312             locinput = (char*)uc;
5313             continue; /* execute rest of RE */
5314             /* NOTREACHED */
5315         }
5316 #undef  ST
5317
5318         case EXACTL:             /*  /abc/l       */
5319             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5320
5321             /* Complete checking would involve going through every character
5322              * matched by the string to see if any is above latin1.  But the
5323              * comparision otherwise might very well be a fast assembly
5324              * language routine, and I (khw) don't think slowing things down
5325              * just to check for this warning is worth it.  So this just checks
5326              * the first character */
5327             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5328                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5329             }
5330             /* FALLTHROUGH */
5331         case EXACT: {            /*  /abc/        */
5332             char *s = STRING(scan);
5333             ln = STR_LEN(scan);
5334             if (utf8_target != is_utf8_pat) {
5335                 /* The target and the pattern have differing utf8ness. */
5336                 char *l = locinput;
5337                 const char * const e = s + ln;
5338
5339                 if (utf8_target) {
5340                     /* The target is utf8, the pattern is not utf8.
5341                      * Above-Latin1 code points can't match the pattern;
5342                      * invariants match exactly, and the other Latin1 ones need
5343                      * to be downgraded to a single byte in order to do the
5344                      * comparison.  (If we could be confident that the target
5345                      * is not malformed, this could be refactored to have fewer
5346                      * tests by just assuming that if the first bytes match, it
5347                      * is an invariant, but there are tests in the test suite
5348                      * dealing with (??{...}) which violate this) */
5349                     while (s < e) {
5350                         if (l >= reginfo->strend
5351                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5352                         {
5353                             sayNO;
5354                         }
5355                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
5356                             if (*l != *s) {
5357                                 sayNO;
5358                             }
5359                             l++;
5360                         }
5361                         else {
5362                             if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5363                             {
5364                                 sayNO;
5365                             }
5366                             l += 2;
5367                         }
5368                         s++;
5369                     }
5370                 }
5371                 else {
5372                     /* The target is not utf8, the pattern is utf8. */
5373                     while (s < e) {
5374                         if (l >= reginfo->strend
5375                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5376                         {
5377                             sayNO;
5378                         }
5379                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
5380                             if (*s != *l) {
5381                                 sayNO;
5382                             }
5383                             s++;
5384                         }
5385                         else {
5386                             if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5387                             {
5388                                 sayNO;
5389                             }
5390                             s += 2;
5391                         }
5392                         l++;
5393                     }
5394                 }
5395                 locinput = l;
5396             }
5397             else {
5398                 /* The target and the pattern have the same utf8ness. */
5399                 /* Inline the first character, for speed. */
5400                 if (reginfo->strend - locinput < ln
5401                     || UCHARAT(s) != nextchr
5402                     || (ln > 1 && memNE(s, locinput, ln)))
5403                 {
5404                     sayNO;
5405                 }
5406                 locinput += ln;
5407             }
5408             break;
5409             }
5410
5411         case EXACTFL: {          /*  /abc/il      */
5412             re_fold_t folder;
5413             const U8 * fold_array;
5414             const char * s;
5415             U32 fold_utf8_flags;
5416
5417             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5418             folder = foldEQ_locale;
5419             fold_array = PL_fold_locale;
5420             fold_utf8_flags = FOLDEQ_LOCALE;
5421             goto do_exactf;
5422
5423         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5424                                       is effectively /u; hence to match, target
5425                                       must be UTF-8. */
5426             if (! utf8_target) {
5427                 sayNO;
5428             }
5429             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5430                                              | FOLDEQ_S1_FOLDS_SANE;
5431             folder = foldEQ_latin1;
5432             fold_array = PL_fold_latin1;
5433             goto do_exactf;
5434
5435         case EXACTFU_SS:         /*  /\x{df}/iu   */
5436         case EXACTFU:            /*  /abc/iu      */
5437             folder = foldEQ_latin1;
5438             fold_array = PL_fold_latin1;
5439             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5440             goto do_exactf;
5441
5442         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
5443                                    patterns */
5444             assert(! is_utf8_pat);
5445             /* FALLTHROUGH */
5446         case EXACTFA:            /*  /abc/iaa     */
5447             folder = foldEQ_latin1;
5448             fold_array = PL_fold_latin1;
5449             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5450             goto do_exactf;
5451
5452         case EXACTF:             /*  /abc/i    This node only generated for
5453                                                non-utf8 patterns */
5454             assert(! is_utf8_pat);
5455             folder = foldEQ;
5456             fold_array = PL_fold;
5457             fold_utf8_flags = 0;
5458
5459           do_exactf:
5460             s = STRING(scan);
5461             ln = STR_LEN(scan);
5462
5463             if (utf8_target
5464                 || is_utf8_pat
5465                 || state_num == EXACTFU_SS
5466                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
5467             {
5468               /* Either target or the pattern are utf8, or has the issue where
5469                * the fold lengths may differ. */
5470                 const char * const l = locinput;
5471                 char *e = reginfo->strend;
5472
5473                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
5474                                         l, &e, 0,  utf8_target, fold_utf8_flags))
5475                 {
5476                     sayNO;
5477                 }
5478                 locinput = e;
5479                 break;
5480             }
5481
5482             /* Neither the target nor the pattern are utf8 */
5483             if (UCHARAT(s) != nextchr
5484                 && !NEXTCHR_IS_EOS
5485                 && UCHARAT(s) != fold_array[nextchr])
5486             {
5487                 sayNO;
5488             }
5489             if (reginfo->strend - locinput < ln)
5490                 sayNO;
5491             if (ln > 1 && ! folder(s, locinput, ln))
5492                 sayNO;
5493             locinput += ln;
5494             break;
5495         }
5496
5497         case NBOUNDL: /*  /\B/l  */
5498             to_complement = 1;
5499             /* FALLTHROUGH */
5500
5501         case BOUNDL:  /*  /\b/l  */
5502             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5503
5504             if (FLAGS(scan) != TRADITIONAL_BOUND) {
5505                 if (! IN_UTF8_CTYPE_LOCALE) {
5506                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
5507                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
5508                 }
5509                 goto boundu;
5510             }
5511
5512             if (utf8_target) {
5513                 if (locinput == reginfo->strbeg)
5514                     ln = isWORDCHAR_LC('\n');
5515                 else {
5516                     ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
5517                                                         (U8*)(reginfo->strbeg)));
5518                 }
5519                 n = (NEXTCHR_IS_EOS)
5520                     ? isWORDCHAR_LC('\n')
5521                     : isWORDCHAR_LC_utf8((U8*)locinput);
5522             }
5523             else { /* Here the string isn't utf8 */
5524                 ln = (locinput == reginfo->strbeg)
5525                      ? isWORDCHAR_LC('\n')
5526                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
5527                 n = (NEXTCHR_IS_EOS)
5528                     ? isWORDCHAR_LC('\n')
5529                     : isWORDCHAR_LC(nextchr);
5530             }
5531             if (to_complement ^ (ln == n)) {
5532                 sayNO;
5533             }
5534             break;
5535
5536         case NBOUND:  /*  /\B/   */
5537             to_complement = 1;
5538             /* FALLTHROUGH */
5539
5540         case BOUND:   /*  /\b/   */
5541             if (utf8_target) {
5542                 goto bound_utf8;
5543             }
5544             goto bound_ascii_match_only;
5545
5546         case NBOUNDA: /*  /\B/a  */
5547             to_complement = 1;
5548             /* FALLTHROUGH */
5549
5550         case BOUNDA:  /*  /\b/a  */
5551
5552           bound_ascii_match_only:
5553             /* Here the string isn't utf8, or is utf8 and only ascii characters
5554              * are to match \w.  In the latter case looking at the byte just
5555              * prior to the current one may be just the final byte of a
5556              * multi-byte character.  This is ok.  There are two cases:
5557              * 1) it is a single byte character, and then the test is doing
5558              *    just what it's supposed to.
5559              * 2) it is a multi-byte character, in which case the final byte is
5560              *    never mistakable for ASCII, and so the test will say it is
5561              *    not a word character, which is the correct answer. */
5562             ln = (locinput == reginfo->strbeg)
5563                  ? isWORDCHAR_A('\n')
5564                  : isWORDCHAR_A(UCHARAT(locinput - 1));
5565             n = (NEXTCHR_IS_EOS)
5566                 ? isWORDCHAR_A('\n')
5567                 : isWORDCHAR_A(nextchr);
5568             if (to_complement ^ (ln == n)) {
5569                 sayNO;
5570             }
5571             break;
5572
5573         case NBOUNDU: /*  /\B/u  */
5574             to_complement = 1;
5575             /* FALLTHROUGH */
5576
5577         case BOUNDU:  /*  /\b/u  */
5578
5579           boundu:
5580             if (utf8_target) {
5581
5582               bound_utf8:
5583                 switch((bound_type) FLAGS(scan)) {
5584                     case TRADITIONAL_BOUND:
5585                         ln = (locinput == reginfo->strbeg)
5586                              ? isWORDCHAR_L1('\n')
5587                              : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
5588                                                                 (U8*)(reginfo->strbeg)));
5589                         n = (NEXTCHR_IS_EOS)
5590                             ? isWORDCHAR_L1('\n')
5591                             : isWORDCHAR_utf8((U8*)locinput);
5592                         match = cBOOL(ln != n);
5593                         break;
5594                     case GCB_BOUND:
5595                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5596                             match = TRUE; /* GCB always matches at begin and
5597                                              end */
5598                         }
5599                         else {
5600                             /* Find the gcb values of previous and current
5601                              * chars, then see if is a break point */
5602                             match = isGCB(getGCB_VAL_UTF8(
5603                                                 reghop3((U8*)locinput,
5604                                                         -1,
5605                                                         (U8*)(reginfo->strbeg)),
5606                                                 (U8*) reginfo->strend),
5607                                           getGCB_VAL_UTF8((U8*) locinput,
5608                                                         (U8*) reginfo->strend));
5609                         }
5610                         break;
5611
5612                     case SB_BOUND: /* Always matches at begin and end */
5613                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5614                             match = TRUE;
5615                         }
5616                         else {
5617                             match = isSB(getSB_VAL_UTF8(
5618                                                 reghop3((U8*)locinput,
5619                                                         -1,
5620                                                         (U8*)(reginfo->strbeg)),
5621                                                 (U8*) reginfo->strend),
5622                                           getSB_VAL_UTF8((U8*) locinput,
5623                                                         (U8*) reginfo->strend),
5624                                           (U8*) reginfo->strbeg,
5625                                           (U8*) locinput,
5626                                           (U8*) reginfo->strend,
5627                                           utf8_target);
5628                         }
5629                         break;
5630
5631                     case WB_BOUND:
5632                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5633                             match = TRUE;
5634                         }
5635                         else {
5636                             match = isWB(PL_WB_UNKNOWN,
5637                                          getWB_VAL_UTF8(
5638                                                 reghop3((U8*)locinput,
5639                                                         -1,
5640                                                         (U8*)(reginfo->strbeg)),
5641                                                 (U8*) reginfo->strend),
5642                                           getWB_VAL_UTF8((U8*) locinput,
5643                                                         (U8*) reginfo->strend),
5644                                           (U8*) reginfo->strbeg,
5645                                           (U8*) locinput,
5646                                           (U8*) reginfo->strend,
5647                                           utf8_target);
5648                         }
5649                         break;
5650                 }
5651             }
5652             else {  /* Not utf8 target */
5653                 switch((bound_type) FLAGS(scan)) {
5654                     case TRADITIONAL_BOUND:
5655                         ln = (locinput == reginfo->strbeg)
5656                             ? isWORDCHAR_L1('\n')
5657                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
5658                         n = (NEXTCHR_IS_EOS)
5659                             ? isWORDCHAR_L1('\n')
5660                             : isWORDCHAR_L1(nextchr);
5661                         match = cBOOL(ln != n);
5662                         break;
5663
5664                     case GCB_BOUND:
5665                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5666                             match = TRUE; /* GCB always matches at begin and
5667                                              end */
5668                         }
5669                         else {  /* Only CR-LF combo isn't a GCB in 0-255
5670                                    range */
5671                             match =    UCHARAT(locinput - 1) != '\r'
5672                                     || UCHARAT(locinput) != '\n';
5673                         }
5674                         break;
5675
5676                     case SB_BOUND: /* Always matches at begin and end */
5677                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5678                             match = TRUE;
5679                         }
5680                         else {
5681                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
5682                                          getSB_VAL_CP(UCHARAT(locinput)),
5683                                          (U8*) reginfo->strbeg,
5684                                          (U8*) locinput,
5685                                          (U8*) reginfo->strend,
5686                                          utf8_target);
5687                         }
5688                         break;
5689
5690                     case WB_BOUND:
5691                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5692                             match = TRUE;
5693                         }
5694                         else {
5695                             match = isWB(PL_WB_UNKNOWN,
5696                                          getWB_VAL_CP(UCHARAT(locinput -1)),
5697                                          getWB_VAL_CP(UCHARAT(locinput)),
5698                                          (U8*) reginfo->strbeg,
5699                                          (U8*) locinput,
5700                                          (U8*) reginfo->strend,
5701                                          utf8_target);
5702                         }
5703                         break;
5704                 }
5705             }
5706
5707             if (to_complement ^ ! match) {
5708                 sayNO;
5709             }
5710             break;
5711
5712         case ANYOFL:  /*  /[abc]/l      */
5713             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5714             /* FALLTHROUGH */
5715         case ANYOF:  /*   /[abc]/       */
5716             if (NEXTCHR_IS_EOS)
5717                 sayNO;
5718             if (utf8_target) {
5719                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
5720                                                                    utf8_target))
5721                     sayNO;
5722                 locinput += UTF8SKIP(locinput);
5723             }
5724             else {
5725                 if (!REGINCLASS(rex, scan, (U8*)locinput))
5726                     sayNO;
5727                 locinput++;
5728             }
5729             break;
5730
5731         /* The argument (FLAGS) to all the POSIX node types is the class number
5732          * */
5733
5734         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
5735             to_complement = 1;
5736             /* FALLTHROUGH */
5737
5738         case POSIXL:    /* \w or [:punct:] etc. under /l */
5739             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5740             if (NEXTCHR_IS_EOS)
5741                 sayNO;
5742
5743             /* Use isFOO_lc() for characters within Latin1.  (Note that
5744              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5745              * wouldn't be invariant) */
5746             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5747                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
5748                     sayNO;
5749                 }
5750             }
5751             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5752                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
5753                                            (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
5754                                                             *(locinput + 1))))))
5755                 {
5756                     sayNO;
5757                 }
5758             }
5759             else { /* Here, must be an above Latin-1 code point */
5760                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5761                 goto utf8_posix_above_latin1;
5762             }
5763
5764             /* Here, must be utf8 */
5765             locinput += UTF8SKIP(locinput);
5766             break;
5767
5768         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
5769             to_complement = 1;
5770             /* FALLTHROUGH */
5771
5772         case POSIXD:    /* \w or [:punct:] etc. under /d */
5773             if (utf8_target) {
5774                 goto utf8_posix;
5775             }
5776             goto posixa;
5777
5778         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
5779
5780             if (NEXTCHR_IS_EOS) {
5781                 sayNO;
5782             }
5783
5784             /* All UTF-8 variants match */
5785             if (! UTF8_IS_INVARIANT(nextchr)) {
5786                 goto increment_locinput;
5787             }
5788
5789             to_complement = 1;
5790             /* FALLTHROUGH */
5791
5792         case POSIXA:    /* \w or [:punct:] etc. under /a */
5793
5794           posixa:
5795             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
5796              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
5797              * character is a single byte */
5798
5799             if (NEXTCHR_IS_EOS
5800                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
5801                                                             FLAGS(scan)))))
5802             {
5803                 sayNO;
5804             }
5805
5806             /* Here we are either not in utf8, or we matched a utf8-invariant,
5807              * so the next char is the next byte */
5808             locinput++;
5809             break;
5810
5811         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
5812             to_complement = 1;
5813             /* FALLTHROUGH */
5814
5815         case POSIXU:    /* \w or [:punct:] etc. under /u */
5816           utf8_posix:
5817             if (NEXTCHR_IS_EOS) {
5818                 sayNO;
5819             }
5820
5821             /* Use _generic_isCC() for characters within Latin1.  (Note that
5822              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5823              * wouldn't be invariant) */
5824             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5825                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
5826                                                            FLAGS(scan)))))
5827                 {
5828                     sayNO;
5829                 }
5830                 locinput++;
5831             }
5832             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5833                 if (! (to_complement
5834                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
5835                                                                *(locinput + 1)),
5836                                              FLAGS(scan)))))
5837                 {
5838                     sayNO;
5839                 }
5840                 locinput += 2;
5841             }
5842             else {  /* Handle above Latin-1 code points */
5843               utf8_posix_above_latin1:
5844                 classnum = (_char_class_number) FLAGS(scan);
5845                 if (classnum < _FIRST_NON_SWASH_CC) {
5846
5847                     /* Here, uses a swash to find such code points.  Load if if
5848                      * not done already */
5849                     if (! PL_utf8_swash_ptrs[classnum]) {
5850                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
5851                         PL_utf8_swash_ptrs[classnum]
5852                                 = _core_swash_init("utf8",
5853                                         "",
5854                                         &PL_sv_undef, 1, 0,
5855                                         PL_XPosix_ptrs[classnum], &flags);
5856                     }
5857                     if (! (to_complement
5858                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
5859                                                (U8 *) locinput, TRUE))))
5860                     {
5861                         sayNO;
5862                     }
5863                 }
5864                 else {  /* Here, uses macros to find above Latin-1 code points */
5865                     switch (classnum) {
5866                         case _CC_ENUM_SPACE:
5867                             if (! (to_complement
5868                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
5869                             {
5870                                 sayNO;
5871                             }
5872                             break;
5873                         case _CC_ENUM_BLANK:
5874                             if (! (to_complement
5875                                             ^ cBOOL(is_HORIZWS_high(locinput))))
5876                             {
5877                                 sayNO;
5878                             }
5879                             break;
5880                         case _CC_ENUM_XDIGIT:
5881                             if (! (to_complement
5882                                             ^ cBOOL(is_XDIGIT_high(locinput))))
5883                             {
5884                                 sayNO;
5885                             }
5886                             break;
5887                         case _CC_ENUM_VERTSPACE:
5888                             if (! (to_complement
5889                                             ^ cBOOL(is_VERTWS_high(locinput))))
5890                             {
5891                                 sayNO;
5892                             }
5893                             break;
5894                         default:    /* The rest, e.g. [:cntrl:], can't match
5895                                        above Latin1 */
5896                             if (! to_complement) {
5897                                 sayNO;
5898                             }
5899                             break;
5900                     }
5901                 }
5902                 locinput += UTF8SKIP(locinput);
5903             }
5904             break;
5905
5906         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
5907                        a Unicode extended Grapheme Cluster */
5908             if (NEXTCHR_IS_EOS)
5909                 sayNO;
5910             if  (! utf8_target) {
5911
5912                 /* Match either CR LF  or '.', as all the other possibilities
5913                  * require utf8 */
5914                 locinput++;         /* Match the . or CR */
5915                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
5916                                        match the LF */
5917                     && locinput < reginfo->strend
5918                     && UCHARAT(locinput) == '\n')
5919                 {
5920                     locinput++;
5921                 }
5922             }
5923             else {
5924
5925                 /* Get the gcb type for the current character */
5926                 PL_GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
5927                                                        (U8*) reginfo->strend);
5928
5929                 /* Then scan through the input until we get to the first
5930                  * character whose type is supposed to be a gcb with the
5931                  * current character.  (There is always a break at the
5932                  * end-of-input) */
5933                 locinput += UTF8SKIP(locinput);
5934                 while (locinput < reginfo->strend) {
5935                     PL_GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
5936                                                          (U8*) reginfo->strend);
5937                     if (isGCB(prev_gcb, cur_gcb)) {
5938                         break;
5939                     }
5940
5941                     prev_gcb = cur_gcb;
5942                     locinput += UTF8SKIP(locinput);
5943                 }
5944
5945
5946             }
5947             break;
5948
5949         case NREFFL:  /*  /\g{name}/il  */
5950         {   /* The capture buffer cases.  The ones beginning with N for the
5951                named buffers just convert to the equivalent numbered and
5952                pretend they were called as the corresponding numbered buffer
5953                op.  */
5954             /* don't initialize these in the declaration, it makes C++
5955                unhappy */
5956             const char *s;
5957             char type;
5958             re_fold_t folder;
5959             const U8 *fold_array;
5960             UV utf8_fold_flags;
5961
5962             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5963             folder = foldEQ_locale;
5964             fold_array = PL_fold_locale;
5965             type = REFFL;
5966             utf8_fold_flags = FOLDEQ_LOCALE;
5967             goto do_nref;
5968
5969         case NREFFA:  /*  /\g{name}/iaa  */
5970             folder = foldEQ_latin1;
5971             fold_array = PL_fold_latin1;
5972             type = REFFA;
5973             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5974             goto do_nref;
5975
5976         case NREFFU:  /*  /\g{name}/iu  */
5977             folder = foldEQ_latin1;
5978             fold_array = PL_fold_latin1;
5979             type = REFFU;
5980             utf8_fold_flags = 0;
5981             goto do_nref;
5982
5983         case NREFF:  /*  /\g{name}/i  */
5984             folder = foldEQ;
5985             fold_array = PL_fold;
5986             type = REFF;
5987             utf8_fold_flags = 0;
5988             goto do_nref;
5989
5990         case NREF:  /*  /\g{name}/   */
5991             type = REF;
5992             folder = NULL;
5993             fold_array = NULL;
5994             utf8_fold_flags = 0;
5995           do_nref:
5996
5997             /* For the named back references, find the corresponding buffer
5998              * number */
5999             n = reg_check_named_buff_matched(rex,scan);
6000
6001             if ( ! n ) {
6002                 sayNO;
6003             }
6004             goto do_nref_ref_common;
6005
6006         case REFFL:  /*  /\1/il  */
6007             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6008             folder = foldEQ_locale;
6009             fold_array = PL_fold_locale;
6010             utf8_fold_flags = FOLDEQ_LOCALE;
6011             goto do_ref;
6012
6013         case REFFA:  /*  /\1/iaa  */
6014             folder = foldEQ_latin1;
6015             fold_array = PL_fold_latin1;
6016             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6017             goto do_ref;
6018
6019         case REFFU:  /*  /\1/iu  */
6020             folder = foldEQ_latin1;
6021             fold_array = PL_fold_latin1;
6022             utf8_fold_flags = 0;
6023             goto do_ref;
6024
6025         case REFF:  /*  /\1/i  */
6026             folder = foldEQ;
6027             fold_array = PL_fold;
6028             utf8_fold_flags = 0;
6029             goto do_ref;
6030
6031         case REF:  /*  /\1/    */
6032             folder = NULL;
6033             fold_array = NULL;
6034             utf8_fold_flags = 0;
6035
6036           do_ref:
6037             type = OP(scan);
6038             n = ARG(scan);  /* which paren pair */
6039
6040           do_nref_ref_common:
6041             ln = rex->offs[n].start;
6042             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6043             if (rex->lastparen < n || ln == -1)
6044                 sayNO;                  /* Do not match unless seen CLOSEn. */
6045             if (ln == rex->offs[n].end)
6046                 break;
6047
6048             s = reginfo->strbeg + ln;
6049             if (type != REF     /* REF can do byte comparison */
6050                 && (utf8_target || type == REFFU || type == REFFL))
6051             {
6052                 char * limit = reginfo->strend;
6053
6054                 /* This call case insensitively compares the entire buffer
6055                     * at s, with the current input starting at locinput, but
6056                     * not going off the end given by reginfo->strend, and
6057                     * returns in <limit> upon success, how much of the
6058                     * current input was matched */
6059                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6060                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6061                 {
6062                     sayNO;
6063                 }
6064                 locinput = limit;
6065                 break;
6066             }
6067
6068             /* Not utf8:  Inline the first character, for speed. */
6069             if (!NEXTCHR_IS_EOS &&
6070                 UCHARAT(s) != nextchr &&
6071                 (type == REF ||
6072                  UCHARAT(s) != fold_array[nextchr]))
6073                 sayNO;
6074             ln = rex->offs[n].end - ln;
6075             if (locinput + ln > reginfo->strend)
6076                 sayNO;
6077             if (ln > 1 && (type == REF
6078                            ? memNE(s, locinput, ln)
6079                            : ! folder(s, locinput, ln)))
6080                 sayNO;
6081             locinput += ln;
6082             break;
6083         }
6084
6085         case NOTHING: /* null op; e.g. the 'nothing' following
6086                        * the '*' in m{(a+|b)*}' */
6087             break;
6088         case TAIL: /* placeholder while compiling (A|B|C) */
6089             break;
6090
6091 #undef  ST
6092 #define ST st->u.eval
6093         {
6094             SV *ret;
6095             REGEXP *re_sv;
6096             regexp *re;
6097             regexp_internal *rei;
6098             regnode *startpoint;
6099
6100         case GOSTART: /*  (?R)  */
6101         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6102             if (cur_eval && cur_eval->locinput==locinput) {
6103                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
6104                     Perl_croak(aTHX_ "Infinite recursion in regex");
6105                 if ( ++nochange_depth > max_nochange_depth )
6106                     Perl_croak(aTHX_
6107                         "Pattern subroutine nesting without pos change"
6108                         " exceeded limit in regex");
6109             } else {
6110                 nochange_depth = 0;
6111             }
6112             re_sv = rex_sv;
6113             re = rex;
6114             rei = rexi;
6115             if (OP(scan)==GOSUB) {
6116                 startpoint = scan + ARG2L(scan);
6117                 ST.close_paren = ARG(scan);
6118             } else {
6119                 startpoint = rei->program+1;
6120                 ST.close_paren = 0;
6121             }
6122
6123             /* Save all the positions seen so far. */
6124             ST.cp = regcppush(rex, 0, maxopenparen);
6125             REGCP_SET(ST.lastcp);
6126
6127             /* and then jump to the code we share with EVAL */
6128             goto eval_recurse_doit;
6129             /* NOTREACHED */
6130
6131         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
6132             if (cur_eval && cur_eval->locinput==locinput) {
6133                 if ( ++nochange_depth > max_nochange_depth )
6134                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6135             } else {
6136                 nochange_depth = 0;
6137             }
6138             {
6139                 /* execute the code in the {...} */
6140
6141                 dSP;
6142                 IV before;
6143                 OP * const oop = PL_op;
6144                 COP * const ocurcop = PL_curcop;
6145                 OP *nop;
6146                 CV *newcv;
6147
6148                 /* save *all* paren positions */
6149                 regcppush(rex, 0, maxopenparen);
6150                 REGCP_SET(runops_cp);
6151
6152                 if (!caller_cv)
6153                     caller_cv = find_runcv(NULL);
6154
6155                 n = ARG(scan);
6156
6157                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6158                     newcv = (ReANY(
6159                                                 (REGEXP*)(rexi->data->data[n])
6160                                             ))->qr_anoncv
6161                                         ;
6162                     nop = (OP*)rexi->data->data[n+1];
6163                 }
6164                 else if (rexi->data->what[n] == 'l') { /* literal code */
6165                     newcv = caller_cv;
6166                     nop = (OP*)rexi->data->data[n];
6167                     assert(CvDEPTH(newcv));
6168                 }
6169                 else {
6170                     /* literal with own CV */
6171                     assert(rexi->data->what[n] == 'L');
6172                     newcv = rex->qr_anoncv;
6173                     nop = (OP*)rexi->data->data[n];
6174                 }
6175
6176                 /* normally if we're about to execute code from the same
6177                  * CV that we used previously, we just use the existing
6178                  * CX stack entry. However, its possible that in the
6179                  * meantime we may have backtracked, popped from the save
6180                  * stack, and undone the SAVECOMPPAD(s) associated with
6181                  * PUSH_MULTICALL; in which case PL_comppad no longer
6182                  * points to newcv's pad. */
6183                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6184                 {
6185                     U8 flags = (CXp_SUB_RE |
6186                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6187                     if (last_pushed_cv) {
6188                         CHANGE_MULTICALL_FLAGS(newcv, flags);
6189                     }
6190                     else {
6191                         PUSH_MULTICALL_FLAGS(newcv, flags);
6192                     }
6193                     last_pushed_cv = newcv;
6194                 }
6195                 else {
6196                     /* these assignments are just to silence compiler
6197                      * warnings */
6198                     multicall_cop = NULL;
6199                     newsp = NULL;
6200                 }
6201                 last_pad = PL_comppad;
6202
6203                 /* the initial nextstate you would normally execute
6204                  * at the start of an eval (which would cause error
6205                  * messages to come from the eval), may be optimised
6206                  * away from the execution path in the regex code blocks;
6207                  * so manually set PL_curcop to it initially */
6208                 {
6209                     OP *o = cUNOPx(nop)->op_first;
6210                     assert(o->op_type == OP_NULL);
6211                     if (o->op_targ == OP_SCOPE) {
6212                         o = cUNOPo->op_first;
6213                     }
6214                     else {
6215                         assert(o->op_targ == OP_LEAVE);
6216                         o = cUNOPo->op_first;
6217                         assert(o->op_type == OP_ENTER);
6218                         o = OpSIBLING(o);
6219                     }
6220
6221                     if (o->op_type != OP_STUB) {
6222                         assert(    o->op_type == OP_NEXTSTATE
6223                                 || o->op_type == OP_DBSTATE
6224                                 || (o->op_type == OP_NULL
6225                                     &&  (  o->op_targ == OP_NEXTSTATE
6226                                         || o->op_targ == OP_DBSTATE
6227                                         )
6228                                     )
6229                         );
6230                         PL_curcop = (COP*)o;
6231                     }
6232                 }
6233                 nop = nop->op_next;
6234
6235                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
6236                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6237
6238                 rex->offs[0].end = locinput - reginfo->strbeg;
6239                 if (reginfo->info_aux_eval->pos_magic)
6240                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6241                                   reginfo->sv, reginfo->strbeg,
6242                                   locinput - reginfo->strbeg);
6243
6244                 if (sv_yes_mark) {
6245                     SV *sv_mrk = get_sv("REGMARK", 1);
6246                     sv_setsv(sv_mrk, sv_yes_mark);
6247                 }
6248
6249                 /* we don't use MULTICALL here as we want to call the
6250                  * first op of the block of interest, rather than the
6251                  * first op of the sub */
6252                 before = (IV)(SP-PL_stack_base);
6253                 PL_op = nop;
6254                 CALLRUNOPS(aTHX);                       /* Scalar context. */
6255                 SPAGAIN;
6256                 if ((IV)(SP-PL_stack_base) == before)
6257                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6258                 else {
6259                     ret = POPs;
6260                     PUTBACK;
6261                 }
6262
6263                 /* before restoring everything, evaluate the returned
6264                  * value, so that 'uninit' warnings don't use the wrong
6265                  * PL_op or pad. Also need to process any magic vars
6266                  * (e.g. $1) *before* parentheses are restored */
6267
6268                 PL_op = NULL;
6269
6270                 re_sv = NULL;
6271                 if (logical == 0)        /*   (?{})/   */
6272                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6273                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6274                     sw = cBOOL(SvTRUE(ret));
6275                     logical = 0;
6276                 }
6277                 else {                   /*  /(??{})  */
6278                     /*  if its overloaded, let the regex compiler handle
6279                      *  it; otherwise extract regex, or stringify  */
6280                     if (SvGMAGICAL(ret))
6281                         ret = sv_mortalcopy(ret);
6282                     if (!SvAMAGIC(ret)) {
6283                         SV *sv = ret;
6284                         if (SvROK(sv))
6285                             sv = SvRV(sv);
6286                         if (SvTYPE(sv) == SVt_REGEXP)
6287                             re_sv = (REGEXP*) sv;
6288                         else if (SvSMAGICAL(ret)) {
6289                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6290                             if (mg)
6291                                 re_sv = (REGEXP *) mg->mg_obj;
6292                         }
6293
6294                         /* force any undef warnings here */
6295                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6296                             ret = sv_mortalcopy(ret);
6297                             (void) SvPV_force_nolen(ret);
6298                         }
6299                     }
6300
6301                 }
6302
6303                 /* *** Note that at this point we don't restore
6304                  * PL_comppad, (or pop the CxSUB) on the assumption it may
6305                  * be used again soon. This is safe as long as nothing
6306                  * in the regexp code uses the pad ! */
6307                 PL_op = oop;
6308                 PL_curcop = ocurcop;
6309                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
6310                 PL_curpm = PL_reg_curpm;
6311
6312                 if (logical != 2)
6313                     break;
6314             }
6315
6316                 /* only /(??{})/  from now on */
6317                 logical = 0;
6318                 {
6319                     /* extract RE object from returned value; compiling if
6320                      * necessary */
6321
6322                     if (re_sv) {
6323                         re_sv = reg_temp_copy(NULL, re_sv);
6324                     }
6325                     else {
6326                         U32 pm_flags = 0;
6327
6328                         if (SvUTF8(ret) && IN_BYTES) {
6329                             /* In use 'bytes': make a copy of the octet
6330                              * sequence, but without the flag on */
6331                             STRLEN len;
6332                             const char *const p = SvPV(ret, len);
6333                             ret = newSVpvn_flags(p, len, SVs_TEMP);
6334                         }
6335                         if (rex->intflags & PREGf_USE_RE_EVAL)
6336                             pm_flags |= PMf_USE_RE_EVAL;
6337
6338                         /* if we got here, it should be an engine which
6339                          * supports compiling code blocks and stuff */
6340                         assert(rex->engine && rex->engine->op_comp);
6341                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6342                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
6343                                     rex->engine, NULL, NULL,
6344                                     /* copy /msixn etc to inner pattern */
6345                                     ARG2L(scan),
6346                                     pm_flags);
6347
6348                         if (!(SvFLAGS(ret)
6349                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
6350                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
6351                             /* This isn't a first class regexp. Instead, it's
6352                                caching a regexp onto an existing, Perl visible
6353                                scalar.  */
6354                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
6355                         }
6356                     }
6357                     SAVEFREESV(re_sv);
6358                     re = ReANY(re_sv);
6359                 }
6360                 RXp_MATCH_COPIED_off(re);
6361                 re->subbeg = rex->subbeg;
6362                 re->sublen = rex->sublen;
6363                 re->suboffset = rex->suboffset;
6364                 re->subcoffset = rex->subcoffset;
6365                 re->lastparen = 0;
6366                 re->lastcloseparen = 0;
6367                 rei = RXi_GET(re);
6368                 DEBUG_EXECUTE_r(
6369                     debug_start_match(re_sv, utf8_target, locinput,
6370                                     reginfo->strend, "Matching embedded");
6371                 );
6372                 startpoint = rei->program + 1;
6373                 ST.close_paren = 0; /* only used for GOSUB */
6374                 /* Save all the seen positions so far. */
6375                 ST.cp = regcppush(rex, 0, maxopenparen);
6376                 REGCP_SET(ST.lastcp);
6377                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
6378                 maxopenparen = 0;
6379                 /* run the pattern returned from (??{...}) */
6380
6381               eval_recurse_doit: /* Share code with GOSUB below this line
6382                             * At this point we expect the stack context to be
6383                             * set up correctly */
6384
6385                 /* invalidate the S-L poscache. We're now executing a
6386                  * different set of WHILEM ops (and their associated
6387                  * indexes) against the same string, so the bits in the
6388                  * cache are meaningless. Setting maxiter to zero forces
6389                  * the cache to be invalidated and zeroed before reuse.
6390                  * XXX This is too dramatic a measure. Ideally we should
6391                  * save the old cache and restore when running the outer
6392                  * pattern again */
6393                 reginfo->poscache_maxiter = 0;
6394
6395                 /* the new regexp might have a different is_utf8_pat than we do */
6396                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
6397
6398                 ST.prev_rex = rex_sv;
6399                 ST.prev_curlyx = cur_curlyx;
6400                 rex_sv = re_sv;
6401                 SET_reg_curpm(rex_sv);
6402                 rex = re;
6403                 rexi = rei;
6404                 cur_curlyx = NULL;
6405                 ST.B = next;
6406                 ST.prev_eval = cur_eval;
6407                 cur_eval = st;
6408                 /* now continue from first node in postoned RE */
6409                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
6410                 /* NOTREACHED */
6411                 NOT_REACHED; /* NOTREACHED */
6412         }
6413
6414         case EVAL_AB: /* cleanup after a successful (??{A})B */
6415             /* note: this is called twice; first after popping B, then A */
6416             rex_sv = ST.prev_rex;
6417             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6418             SET_reg_curpm(rex_sv);
6419             rex = ReANY(rex_sv);
6420             rexi = RXi_GET(rex);
6421             {
6422                 /* preserve $^R across LEAVE's. See Bug 121070. */
6423                 SV *save_sv= GvSV(PL_replgv);
6424                 SvREFCNT_inc(save_sv);
6425                 regcpblow(ST.cp); /* LEAVE in disguise */
6426                 sv_setsv(GvSV(PL_replgv), save_sv);
6427                 SvREFCNT_dec(save_sv);
6428             }
6429             cur_eval = ST.prev_eval;
6430             cur_curlyx = ST.prev_curlyx;
6431
6432             /* Invalidate cache. See "invalidate" comment above. */
6433             reginfo->poscache_maxiter = 0;
6434             if ( nochange_depth )
6435                 nochange_depth--;
6436             sayYES;
6437
6438
6439         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
6440             /* note: this is called twice; first after popping B, then A */
6441             rex_sv = ST.prev_rex;
6442             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6443             SET_reg_curpm(rex_sv);
6444             rex = ReANY(rex_sv);
6445             rexi = RXi_GET(rex);
6446
6447             REGCP_UNWIND(ST.lastcp);
6448             regcppop(rex, &maxopenparen);
6449             cur_eval = ST.prev_eval;
6450             cur_curlyx = ST.prev_curlyx;
6451             /* Invalidate cache. See "invalidate" comment above. */
6452             reginfo->poscache_maxiter = 0;
6453             if ( nochange_depth )
6454                 nochange_depth--;
6455             sayNO_SILENT;
6456 #undef ST
6457
6458         case OPEN: /*  (  */
6459             n = ARG(scan);  /* which paren pair */
6460             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
6461             if (n > maxopenparen)
6462                 maxopenparen = n;
6463             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
6464                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
6465                 PTR2UV(rex),
6466                 PTR2UV(rex->offs),
6467                 (UV)n,
6468                 (IV)rex->offs[n].start_tmp,
6469                 (UV)maxopenparen
6470             ));
6471             lastopen = n;
6472             break;
6473
6474 /* XXX really need to log other places start/end are set too */
6475 #define CLOSE_CAPTURE \
6476     rex->offs[n].start = rex->offs[n].start_tmp; \
6477     rex->offs[n].end = locinput - reginfo->strbeg; \
6478     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
6479         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
6480         PTR2UV(rex), \
6481         PTR2UV(rex->offs), \
6482         (UV)n, \
6483         (IV)rex->offs[n].start, \
6484         (IV)rex->offs[n].end \
6485     ))
6486
6487         case CLOSE:  /*  )  */
6488             n = ARG(scan);  /* which paren pair */
6489             CLOSE_CAPTURE;
6490             if (n > rex->lastparen)
6491                 rex->lastparen = n;
6492             rex->lastcloseparen = n;
6493             if (cur_eval && cur_eval->u.eval.close_paren == n) {
6494                 goto fake_end;
6495             }
6496             break;
6497
6498         case ACCEPT:  /*  (*ACCEPT)  */
6499             if (ARG(scan)){
6500                 regnode *cursor;
6501                 for (cursor=scan;
6502                      cursor && OP(cursor)!=END;
6503                      cursor=regnext(cursor))
6504                 {
6505                     if ( OP(cursor)==CLOSE ){
6506                         n = ARG(cursor);
6507                         if ( n <= lastopen ) {
6508                             CLOSE_CAPTURE;
6509                             if (n > rex->lastparen)
6510                                 rex->lastparen = n;
6511                             rex->lastcloseparen = n;
6512                             if ( n == ARG(scan) || (cur_eval &&
6513                                 cur_eval->u.eval.close_paren == n))
6514                                 break;
6515                         }
6516                     }
6517                 }
6518             }
6519             goto fake_end;
6520             /* NOTREACHED */
6521
6522         case GROUPP:  /*  (?(1))  */
6523             n = ARG(scan);  /* which paren pair */
6524             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
6525             break;
6526
6527         case NGROUPP:  /*  (?(<name>))  */
6528             /* reg_check_named_buff_matched returns 0 for no match */
6529             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
6530             break;
6531
6532         case INSUBP:   /*  (?(R))  */
6533             n = ARG(scan);
6534             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
6535             break;
6536
6537         case DEFINEP:  /*  (?(DEFINE))  */
6538             sw = 0;
6539             break;
6540
6541         case IFTHEN:   /*  (?(cond)A|B)  */
6542             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6543             if (sw)
6544                 next = NEXTOPER(NEXTOPER(scan));
6545             else {
6546                 next = scan + ARG(scan);
6547                 if (OP(next) == IFTHEN) /* Fake one. */
6548                     next = NEXTOPER(NEXTOPER(next));
6549             }
6550             break;
6551
6552         case LOGICAL:  /* modifier for EVAL and IFMATCH */
6553             logical = scan->flags;
6554             break;
6555
6556 /*******************************************************************
6557
6558 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
6559 pattern, where A and B are subpatterns. (For simple A, CURLYM or
6560 STAR/PLUS/CURLY/CURLYN are used instead.)
6561
6562 A*B is compiled as <CURLYX><A><WHILEM><B>
6563
6564 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
6565 state, which contains the current count, initialised to -1. It also sets
6566 cur_curlyx to point to this state, with any previous value saved in the
6567 state block.
6568
6569 CURLYX then jumps straight to the WHILEM op, rather than executing A,
6570 since the pattern may possibly match zero times (i.e. it's a while {} loop
6571 rather than a do {} while loop).
6572
6573 Each entry to WHILEM represents a successful match of A. The count in the
6574 CURLYX block is incremented, another WHILEM state is pushed, and execution
6575 passes to A or B depending on greediness and the current count.
6576
6577 For example, if matching against the string a1a2a3b (where the aN are
6578 substrings that match /A/), then the match progresses as follows: (the
6579 pushed states are interspersed with the bits of strings matched so far):
6580
6581     <CURLYX cnt=-1>
6582     <CURLYX cnt=0><WHILEM>
6583     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
6584     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
6585     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
6586     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
6587
6588 (Contrast this with something like CURLYM, which maintains only a single
6589 backtrack state:
6590
6591     <CURLYM cnt=0> a1
6592     a1 <CURLYM cnt=1> a2
6593     a1 a2 <CURLYM cnt=2> a3
6594     a1 a2 a3 <CURLYM cnt=3> b
6595 )
6596
6597 Each WHILEM state block marks a point to backtrack to upon partial failure
6598 of A or B, and also contains some minor state data related to that
6599 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
6600 overall state, such as the count, and pointers to the A and B ops.
6601
6602 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
6603 must always point to the *current* CURLYX block, the rules are:
6604
6605 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
6606 and set cur_curlyx to point the new block.
6607
6608 When popping the CURLYX block after a successful or unsuccessful match,
6609 restore the previous cur_curlyx.
6610
6611 When WHILEM is about to execute B, save the current cur_curlyx, and set it
6612 to the outer one saved in the CURLYX block.
6613
6614 When popping the WHILEM block after a successful or unsuccessful B match,
6615 restore the previous cur_curlyx.
6616
6617 Here's an example for the pattern (AI* BI)*BO
6618 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
6619
6620 cur_
6621 curlyx backtrack stack
6622 ------ ---------------
6623 NULL
6624 CO     <CO prev=NULL> <WO>
6625 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6626 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6627 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
6628
6629 At this point the pattern succeeds, and we work back down the stack to
6630 clean up, restoring as we go:
6631
6632 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6633 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6634 CO     <CO prev=NULL> <WO>
6635 NULL
6636
6637 *******************************************************************/
6638
6639 #define ST st->u.curlyx
6640
6641         case CURLYX:    /* start of /A*B/  (for complex A) */
6642         {
6643             /* No need to save/restore up to this paren */
6644             I32 parenfloor = scan->flags;
6645
6646             assert(next); /* keep Coverity happy */
6647             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
6648                 next += ARG(next);
6649
6650             /* XXXX Probably it is better to teach regpush to support
6651                parenfloor > maxopenparen ... */
6652             if (parenfloor > (I32)rex->lastparen)
6653                 parenfloor = rex->lastparen; /* Pessimization... */
6654
6655             ST.prev_curlyx= cur_curlyx;
6656             cur_curlyx = st;
6657             ST.cp = PL_savestack_ix;
6658
6659             /* these fields contain the state of the current curly.
6660              * they are accessed by subsequent WHILEMs */
6661             ST.parenfloor = parenfloor;
6662             ST.me = scan;
6663             ST.B = next;
6664             ST.minmod = minmod;
6665             minmod = 0;
6666             ST.count = -1;      /* this will be updated by WHILEM */
6667             ST.lastloc = NULL;  /* this will be updated by WHILEM */
6668
6669             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
6670             /* NOTREACHED */
6671             NOT_REACHED; /* NOTREACHED */
6672         }
6673
6674         case CURLYX_end: /* just finished matching all of A*B */
6675             cur_curlyx = ST.prev_curlyx;
6676             sayYES;
6677             /* NOTREACHED */
6678             NOT_REACHED; /* NOTREACHED */
6679
6680         case CURLYX_end_fail: /* just failed to match all of A*B */
6681             regcpblow(ST.cp);
6682             cur_curlyx = ST.prev_curlyx;
6683             sayNO;
6684             /* NOTREACHED */
6685             NOT_REACHED; /* NOTREACHED */
6686
6687
6688 #undef ST
6689 #define ST st->u.whilem
6690
6691         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
6692         {
6693             /* see the discussion above about CURLYX/WHILEM */
6694             I32 n;
6695             int min, max;
6696             regnode *A;
6697
6698             assert(cur_curlyx); /* keep Coverity happy */
6699
6700             min = ARG1(cur_curlyx->u.curlyx.me);
6701             max = ARG2(cur_curlyx->u.curlyx.me);
6702             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
6703             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
6704             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
6705             ST.cache_offset = 0;
6706             ST.cache_mask = 0;
6707
6708
6709             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6710                   "%*s  whilem: matched %ld out of %d..%d\n",
6711                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
6712             );
6713
6714             /* First just match a string of min A's. */
6715
6716             if (n < min) {
6717                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6718                                     maxopenparen);
6719                 cur_curlyx->u.curlyx.lastloc = locinput;
6720                 REGCP_SET(ST.lastcp);
6721
6722                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
6723                 /* NOTREACHED */
6724                 NOT_REACHED; /* NOTREACHED */
6725             }
6726
6727             /* If degenerate A matches "", assume A done. */
6728
6729             if (locinput == cur_curlyx->u.curlyx.lastloc) {
6730                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6731                    "%*s  whilem: empty match detected, trying continuation...\n",
6732                    REPORT_CODE_OFF+depth*2, "")
6733                 );
6734                 goto do_whilem_B_max;
6735             }
6736
6737             /* super-linear cache processing.
6738              *
6739              * The idea here is that for certain types of CURLYX/WHILEM -
6740              * principally those whose upper bound is infinity (and
6741              * excluding regexes that have things like \1 and other very
6742              * non-regular expresssiony things), then if a pattern like
6743              * /....A*.../ fails and we backtrack to the WHILEM, then we
6744              * make a note that this particular WHILEM op was at string
6745              * position 47 (say) when the rest of pattern failed. Then, if
6746              * we ever find ourselves back at that WHILEM, and at string
6747              * position 47 again, we can just fail immediately rather than
6748              * running the rest of the pattern again.
6749              *
6750              * This is very handy when patterns start to go
6751              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
6752              * with a combinatorial explosion of backtracking.
6753              *
6754              * The cache is implemented as a bit array, with one bit per
6755              * string byte position per WHILEM op (up to 16) - so its
6756              * between 0.25 and 2x the string size.
6757              *
6758              * To avoid allocating a poscache buffer every time, we do an
6759              * initially countdown; only after we have  executed a WHILEM
6760              * op (string-length x #WHILEMs) times do we allocate the
6761              * cache.
6762              *
6763              * The top 4 bits of scan->flags byte say how many different
6764              * relevant CURLLYX/WHILEM op pairs there are, while the
6765              * bottom 4-bits is the identifying index number of this
6766              * WHILEM.
6767              */
6768
6769             if (scan->flags) {
6770
6771                 if (!reginfo->poscache_maxiter) {
6772                     /* start the countdown: Postpone detection until we
6773                      * know the match is not *that* much linear. */
6774                     reginfo->poscache_maxiter
6775                         =    (reginfo->strend - reginfo->strbeg + 1)
6776                            * (scan->flags>>4);
6777                     /* possible overflow for long strings and many CURLYX's */
6778                     if (reginfo->poscache_maxiter < 0)
6779                         reginfo->poscache_maxiter = I32_MAX;
6780                     reginfo->poscache_iter = reginfo->poscache_maxiter;
6781                 }
6782
6783                 if (reginfo->poscache_iter-- == 0) {
6784                     /* initialise cache */
6785                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
6786                     regmatch_info_aux *const aux = reginfo->info_aux;
6787                     if (aux->poscache) {
6788                         if ((SSize_t)reginfo->poscache_size < size) {
6789                             Renew(aux->poscache, size, char);
6790                             reginfo->poscache_size = size;
6791                         }
6792                         Zero(aux->poscache, size, char);
6793                     }
6794                     else {
6795                         reginfo->poscache_size = size;
6796                         Newxz(aux->poscache, size, char);
6797                     }
6798                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6799       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
6800                               PL_colors[4], PL_colors[5])
6801                     );
6802                 }
6803
6804                 if (reginfo->poscache_iter < 0) {
6805                     /* have we already failed at this position? */
6806                     SSize_t offset, mask;
6807
6808                     reginfo->poscache_iter = -1; /* stop eventual underflow */
6809                     offset  = (scan->flags & 0xf) - 1
6810                                 +   (locinput - reginfo->strbeg)
6811                                   * (scan->flags>>4);
6812                     mask    = 1 << (offset % 8);
6813                     offset /= 8;
6814                     if (reginfo->info_aux->poscache[offset] & mask) {
6815                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6816                             "%*s  whilem: (cache) already tried at this position...\n",
6817                             REPORT_CODE_OFF+depth*2, "")
6818                         );
6819                         sayNO; /* cache records failure */
6820                     }
6821                     ST.cache_offset = offset;
6822                     ST.cache_mask   = mask;
6823                 }
6824             }
6825
6826             /* Prefer B over A for minimal matching. */
6827
6828             if (cur_curlyx->u.curlyx.minmod) {
6829                 ST.save_curlyx = cur_curlyx;
6830                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6831                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
6832                             maxopenparen);
6833                 REGCP_SET(ST.lastcp);
6834                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
6835                                     locinput);
6836                 /* NOTREACHED */
6837                 NOT_REACHED; /* NOTREACHED */
6838             }
6839
6840             /* Prefer A over B for maximal matching. */
6841
6842             if (n < max) { /* More greed allowed? */
6843                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6844                             maxopenparen);
6845                 cur_curlyx->u.curlyx.lastloc = locinput;
6846                 REGCP_SET(ST.lastcp);
6847                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
6848                 /* NOTREACHED */
6849                 NOT_REACHED; /* NOTREACHED */
6850             }
6851             goto do_whilem_B_max;
6852         }
6853         /* NOTREACHED */
6854         NOT_REACHED; /* NOTREACHED */
6855
6856         case WHILEM_B_min: /* just matched B in a minimal match */
6857         case WHILEM_B_max: /* just matched B in a maximal match */
6858             cur_curlyx = ST.save_curlyx;
6859             sayYES;
6860             /* NOTREACHED */
6861             NOT_REACHED; /* NOTREACHED */
6862
6863         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
6864             cur_curlyx = ST.save_curlyx;
6865             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6866             cur_curlyx->u.curlyx.count--;
6867             CACHEsayNO;
6868             /* NOTREACHED */
6869             NOT_REACHED; /* NOTREACHED */
6870
6871         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
6872             /* FALLTHROUGH */
6873         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
6874             REGCP_UNWIND(ST.lastcp);
6875             regcppop(rex, &maxopenparen);
6876             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6877             cur_curlyx->u.curlyx.count--;
6878             CACHEsayNO;
6879             /* NOTREACHED */
6880             NOT_REACHED; /* NOTREACHED */
6881
6882         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
6883             REGCP_UNWIND(ST.lastcp);
6884             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
6885             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6886                 "%*s  whilem: failed, trying continuation...\n",
6887                 REPORT_CODE_OFF+depth*2, "")
6888             );
6889           do_whilem_B_max:
6890             if (cur_curlyx->u.curlyx.count >= REG_INFTY
6891                 && ckWARN(WARN_REGEXP)
6892                 && !reginfo->warned)
6893             {
6894                 reginfo->warned = TRUE;
6895                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6896                      "Complex regular subexpression recursion limit (%d) "
6897                      "exceeded",
6898                      REG_INFTY - 1);
6899             }
6900
6901             /* now try B */
6902             ST.save_curlyx = cur_curlyx;
6903             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6904             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6905                                 locinput);
6906             /* NOTREACHED */
6907             NOT_REACHED; /* NOTREACHED */
6908
6909         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6910             cur_curlyx = ST.save_curlyx;
6911             REGCP_UNWIND(ST.lastcp);
6912             regcppop(rex, &maxopenparen);
6913
6914             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6915                 /* Maximum greed exceeded */
6916                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6917                     && ckWARN(WARN_REGEXP)
6918                     && !reginfo->warned)
6919                 {
6920                     reginfo->warned     = TRUE;
6921                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6922                         "Complex regular subexpression recursion "
6923                         "limit (%d) exceeded",
6924                         REG_INFTY - 1);
6925                 }
6926                 cur_curlyx->u.curlyx.count--;
6927                 CACHEsayNO;
6928             }
6929
6930             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6931                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6932             );
6933             /* Try grabbing another A and see if it helps. */
6934             cur_curlyx->u.curlyx.lastloc = locinput;
6935             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6936                             maxopenparen);
6937             REGCP_SET(ST.lastcp);
6938             PUSH_STATE_GOTO(WHILEM_A_min,
6939                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6940                 locinput);
6941             /* NOTREACHED */
6942             NOT_REACHED; /* NOTREACHED */
6943
6944 #undef  ST
6945 #define ST st->u.branch
6946
6947         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
6948             next = scan + ARG(scan);
6949             if (next == scan)
6950                 next = NULL;
6951             scan = NEXTOPER(scan);
6952             /* FALLTHROUGH */
6953
6954         case BRANCH:        /*  /(...|A|...)/ */
6955             scan = NEXTOPER(scan); /* scan now points to inner node */
6956             ST.lastparen = rex->lastparen;
6957             ST.lastcloseparen = rex->lastcloseparen;
6958             ST.next_branch = next;
6959             REGCP_SET(ST.cp);
6960
6961             /* Now go into the branch */
6962             if (has_cutgroup) {
6963                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6964             } else {
6965                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6966             }
6967             /* NOTREACHED */
6968             NOT_REACHED; /* NOTREACHED */
6969
6970         case CUTGROUP:  /*  /(*THEN)/  */
6971             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6972                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6973             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6974             /* NOTREACHED */
6975             NOT_REACHED; /* NOTREACHED */
6976
6977         case CUTGROUP_next_fail:
6978             do_cutgroup = 1;
6979             no_final = 1;
6980             if (st->u.mark.mark_name)
6981                 sv_commit = st->u.mark.mark_name;
6982             sayNO;
6983             /* NOTREACHED */
6984             NOT_REACHED; /* NOTREACHED */
6985
6986         case BRANCH_next:
6987             sayYES;
6988             /* NOTREACHED */
6989             NOT_REACHED; /* NOTREACHED */
6990
6991         case BRANCH_next_fail: /* that branch failed; try the next, if any */
6992             if (do_cutgroup) {
6993                 do_cutgroup = 0;
6994                 no_final = 0;
6995             }
6996             REGCP_UNWIND(ST.cp);
6997             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6998             scan = ST.next_branch;
6999             /* no more branches? */
7000             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7001                 DEBUG_EXECUTE_r({
7002                     PerlIO_printf( Perl_debug_log,
7003                         "%*s  %sBRANCH failed...%s\n",
7004                         REPORT_CODE_OFF+depth*2, "",
7005                         PL_colors[4],
7006                         PL_colors[5] );
7007                 });
7008                 sayNO_SILENT;
7009             }
7010             continue; /* execute next BRANCH[J] op */
7011             /* NOTREACHED */
7012
7013         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7014             minmod = 1;
7015             break;
7016
7017 #undef  ST
7018 #define ST st->u.curlym
7019
7020         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7021
7022             /* This is an optimisation of CURLYX that enables us to push
7023              * only a single backtracking state, no matter how many matches
7024              * there are in {m,n}. It relies on the pattern being constant
7025              * length, with no parens to influence future backrefs
7026              */
7027
7028             ST.me = scan;
7029             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7030
7031             ST.lastparen      = rex->lastparen;
7032             ST.lastcloseparen = rex->lastcloseparen;
7033
7034             /* if paren positive, emulate an OPEN/CLOSE around A */
7035             if (ST.me->flags) {
7036                 U32 paren = ST.me->flags;
7037                 if (paren > maxopenparen)
7038                     maxopenparen = paren;
7039                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7040             }
7041             ST.A = scan;
7042             ST.B = next;
7043             ST.alen = 0;
7044             ST.count = 0;
7045             ST.minmod = minmod;
7046             minmod = 0;
7047             ST.c1 = CHRTEST_UNINIT;
7048             REGCP_SET(ST.cp);
7049
7050             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7051                 goto curlym_do_B;
7052
7053           curlym_do_A: /* execute the A in /A{m,n}B/  */
7054             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7055             /* NOTREACHED */
7056             NOT_REACHED; /* NOTREACHED */
7057
7058         case CURLYM_A: /* we've just matched an A */
7059             ST.count++;
7060             /* after first match, determine A's length: u.curlym.alen */
7061             if (ST.count == 1) {
7062                 if (reginfo->is_utf8_target) {
7063                     char *s = st->locinput;
7064                     while (s < locinput) {
7065                         ST.alen++;
7066                         s += UTF8SKIP(s);
7067                     }
7068                 }
7069                 else {
7070                     ST.alen = locinput - st->locinput;
7071                 }
7072                 if (ST.alen == 0)
7073                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7074             }
7075             DEBUG_EXECUTE_r(
7076                 PerlIO_printf(Perl_debug_log,
7077                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7078                           (int)(REPORT_CODE_OFF+(depth*2)), "",
7079                           (IV) ST.count, (IV)ST.alen)
7080             );
7081
7082             if (cur_eval && cur_eval->u.eval.close_paren &&
7083                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7084                 goto fake_end;
7085
7086             {
7087                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7088                 if ( max == REG_INFTY || ST.count < max )
7089                     goto curlym_do_A; /* try to match another A */
7090             }
7091             goto curlym_do_B; /* try to match B */
7092
7093         case CURLYM_A_fail: /* just failed to match an A */
7094             REGCP_UNWIND(ST.cp);
7095
7096             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7097                 || (cur_eval && cur_eval->u.eval.close_paren &&
7098                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
7099                 sayNO;
7100
7101           curlym_do_B: /* execute the B in /A{m,n}B/  */
7102             if (ST.c1 == CHRTEST_UNINIT) {
7103                 /* calculate c1 and c2 for possible match of 1st char
7104                  * following curly */
7105                 ST.c1 = ST.c2 = CHRTEST_VOID;
7106                 assert(ST.B);
7107                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7108                     regnode *text_node = ST.B;
7109                     if (! HAS_TEXT(text_node))
7110                         FIND_NEXT_IMPT(text_node);
7111                     /* this used to be
7112
7113                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7114
7115                         But the former is redundant in light of the latter.
7116
7117                         if this changes back then the macro for
7118                         IS_TEXT and friends need to change.
7119                      */
7120                     if (PL_regkind[OP(text_node)] == EXACT) {
7121                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7122                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7123                            reginfo))
7124                         {
7125                             sayNO;
7126                         }
7127                     }
7128                 }
7129             }
7130
7131             DEBUG_EXECUTE_r(
7132                 PerlIO_printf(Perl_debug_log,
7133                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
7134                     (int)(REPORT_CODE_OFF+(depth*2)),
7135                     "", (IV)ST.count)
7136                 );
7137             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7138                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7139                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7140                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7141                     {
7142                         /* simulate B failing */
7143                         DEBUG_OPTIMISE_r(
7144                             PerlIO_printf(Perl_debug_log,
7145                                 "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7146                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
7147                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7148                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7149                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7150                         );
7151                         state_num = CURLYM_B_fail;
7152                         goto reenter_switch;
7153                     }
7154                 }
7155                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7156                     /* simulate B failing */
7157                     DEBUG_OPTIMISE_r(
7158                         PerlIO_printf(Perl_debug_log,
7159                             "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7160                             (int)(REPORT_CODE_OFF+(depth*2)),"",
7161                             (int) nextchr, ST.c1, ST.c2)
7162                     );
7163                     state_num = CURLYM_B_fail;
7164                     goto reenter_switch;
7165                 }
7166             }
7167
7168             if (ST.me->flags) {
7169                 /* emulate CLOSE: mark current A as captured */
7170                 I32 paren = ST.me->flags;
7171                 if (ST.count) {
7172                     rex->offs[paren].start
7173                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7174                     rex->offs[paren].end = locinput - reginfo->strbeg;
7175                     if ((U32)paren > rex->lastparen)
7176                         rex->lastparen = paren;
7177                     rex->lastcloseparen = paren;
7178                 }
7179                 else
7180                     rex->offs[paren].end = -1;
7181                 if (cur_eval && cur_eval->u.eval.close_paren &&
7182                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7183                 {
7184                     if (ST.count)
7185                         goto fake_end;
7186                     else
7187                         sayNO;
7188                 }
7189             }
7190
7191             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7192             /* NOTREACHED */
7193             NOT_REACHED; /* NOTREACHED */
7194
7195         case CURLYM_B_fail: /* just failed to match a B */
7196             REGCP_UNWIND(ST.cp);
7197             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7198             if (ST.minmod) {
7199                 I32 max = ARG2(ST.me);
7200                 if (max != REG_INFTY && ST.count == max)
7201                     sayNO;
7202                 goto curlym_do_A; /* try to match a further A */
7203             }
7204             /* backtrack one A */
7205             if (ST.count == ARG1(ST.me) /* min */)
7206                 sayNO;
7207             ST.count--;
7208             SET_locinput(HOPc(locinput, -ST.alen));
7209             goto curlym_do_B; /* try to match B */
7210
7211 #undef ST
7212 #define ST st->u.curly
7213
7214 #define CURLY_SETPAREN(paren, success) \
7215     if (paren) { \
7216         if (success) { \
7217             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7218             rex->offs[paren].end = locinput - reginfo->strbeg; \
7219             if (paren > rex->lastparen) \
7220                 rex->lastparen = paren; \
7221             rex->lastcloseparen = paren; \
7222         } \
7223         else { \
7224             rex->offs[paren].end = -1; \
7225             rex->lastparen      = ST.lastparen; \
7226             rex->lastcloseparen = ST.lastcloseparen; \
7227         } \
7228     }
7229
7230         case STAR:              /*  /A*B/ where A is width 1 char */
7231             ST.paren = 0;
7232             ST.min = 0;
7233             ST.max = REG_INFTY;
7234             scan = NEXTOPER(scan);
7235             goto repeat;
7236
7237         case PLUS:              /*  /A+B/ where A is width 1 char */
7238             ST.paren = 0;
7239             ST.min = 1;
7240             ST.max = REG_INFTY;
7241             scan = NEXTOPER(scan);
7242             goto repeat;
7243
7244         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
7245             ST.paren = scan->flags;     /* Which paren to set */
7246             ST.lastparen      = rex->lastparen;
7247             ST.lastcloseparen = rex->lastcloseparen;
7248             if (ST.paren > maxopenparen)
7249                 maxopenparen = ST.paren;
7250             ST.min = ARG1(scan);  /* min to match */
7251             ST.max = ARG2(scan);  /* max to match */
7252             if (cur_eval && cur_eval->u.eval.close_paren &&
7253                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7254                 ST.min=1;
7255                 ST.max=1;
7256             }
7257             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7258             goto repeat;
7259
7260         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
7261             ST.paren = 0;
7262             ST.min = ARG1(scan);  /* min to match */
7263             ST.max = ARG2(scan);  /* max to match */
7264             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7265           repeat:
7266             /*
7267             * Lookahead to avoid useless match attempts
7268             * when we know what character comes next.
7269             *
7270             * Used to only do .*x and .*?x, but now it allows
7271             * for )'s, ('s and (?{ ... })'s to be in the way
7272             * of the quantifier and the EXACT-like node.  -- japhy
7273             */
7274
7275             assert(ST.min <= ST.max);
7276             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7277                 ST.c1 = ST.c2 = CHRTEST_VOID;
7278             }
7279             else {
7280                 regnode *text_node = next;
7281
7282                 if (! HAS_TEXT(text_node))
7283                     FIND_NEXT_IMPT(text_node);
7284
7285                 if (! HAS_TEXT(text_node))
7286                     ST.c1 = ST.c2 = CHRTEST_VOID;
7287                 else {
7288                     if ( PL_regkind[OP(text_node)] != EXACT ) {
7289                         ST.c1 = ST.c2 = CHRTEST_VOID;
7290                     }
7291                     else {
7292
7293                     /*  Currently we only get here when
7294
7295                         PL_rekind[OP(text_node)] == EXACT
7296
7297                         if this changes back then the macro for IS_TEXT and
7298                         friends need to change. */
7299                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7300                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7301                            reginfo))
7302                         {
7303                             sayNO;
7304                         }
7305                     }
7306                 }
7307             }
7308
7309             ST.A = scan;
7310             ST.B = next;
7311             if (minmod) {
7312                 char *li = locinput;
7313                 minmod = 0;
7314                 if (ST.min &&
7315                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
7316                             < ST.min)
7317                     sayNO;
7318                 SET_locinput(li);
7319                 ST.count = ST.min;
7320                 REGCP_SET(ST.cp);
7321                 if (ST.c1 == CHRTEST_VOID)
7322                     goto curly_try_B_min;
7323
7324                 ST.oldloc = locinput;
7325
7326                 /* set ST.maxpos to the furthest point along the
7327                  * string that could possibly match */
7328                 if  (ST.max == REG_INFTY) {
7329                     ST.maxpos = reginfo->strend - 1;
7330                     if (utf8_target)
7331                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7332                             ST.maxpos--;
7333                 }
7334                 else if (utf8_target) {
7335                     int m = ST.max - ST.min;
7336                     for (ST.maxpos = locinput;
7337                          m >0 && ST.maxpos < reginfo->strend; m--)
7338                         ST.maxpos += UTF8SKIP(ST.maxpos);
7339                 }
7340                 else {
7341                     ST.maxpos = locinput + ST.max - ST.min;
7342                     if (ST.maxpos >= reginfo->strend)
7343                         ST.maxpos = reginfo->strend - 1;
7344                 }
7345                 goto curly_try_B_min_known;
7346
7347             }
7348             else {
7349                 /* avoid taking address of locinput, so it can remain
7350                  * a register var */
7351                 char *li = locinput;
7352                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
7353                 if (ST.count < ST.min)
7354                     sayNO;
7355                 SET_locinput(li);
7356                 if ((ST.count > ST.min)
7357                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
7358                 {
7359                     /* A{m,n} must come at the end of the string, there's
7360                      * no point in backing off ... */
7361                     ST.min = ST.count;
7362                     /* ...except that $ and \Z can match before *and* after
7363                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
7364                        We may back off by one in this case. */
7365                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
7366                         ST.min--;
7367                 }
7368                 REGCP_SET(ST.cp);
7369                 goto curly_try_B_max;
7370             }
7371             /* NOTREACHED */
7372             NOT_REACHED; /* NOTREACHED */
7373
7374         case CURLY_B_min_known_fail:
7375             /* failed to find B in a non-greedy match where c1,c2 valid */
7376
7377             REGCP_UNWIND(ST.cp);
7378             if (ST.paren) {
7379                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7380             }
7381             /* Couldn't or didn't -- move forward. */
7382             ST.oldloc = locinput;
7383             if (utf8_target)
7384                 locinput += UTF8SKIP(locinput);
7385             else
7386                 locinput++;
7387             ST.count++;
7388           curly_try_B_min_known:
7389              /* find the next place where 'B' could work, then call B */
7390             {
7391                 int n;
7392                 if (utf8_target) {
7393                     n = (ST.oldloc == locinput) ? 0 : 1;
7394                     if (ST.c1 == ST.c2) {
7395                         /* set n to utf8_distance(oldloc, locinput) */
7396                         while (locinput <= ST.maxpos
7397                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
7398                         {
7399                             locinput += UTF8SKIP(locinput);
7400                             n++;
7401                         }
7402                     }
7403                     else {
7404                         /* set n to utf8_distance(oldloc, locinput) */
7405                         while (locinput <= ST.maxpos
7406                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7407                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7408                         {
7409                             locinput += UTF8SKIP(locinput);
7410                             n++;
7411                         }
7412                     }
7413                 }
7414                 else {  /* Not utf8_target */
7415                     if (ST.c1 == ST.c2) {
7416                         while (locinput <= ST.maxpos &&
7417                                UCHARAT(locinput) != ST.c1)
7418                             locinput++;
7419                     }
7420                     else {
7421                         while (locinput <= ST.maxpos
7422                                && UCHARAT(locinput) != ST.c1
7423                                && UCHARAT(locinput) != ST.c2)
7424                             locinput++;
7425                     }
7426                     n = locinput - ST.oldloc;
7427                 }
7428                 if (locinput > ST.maxpos)
7429                     sayNO;
7430                 if (n) {
7431                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
7432                      * at b; check that everything between oldloc and
7433                      * locinput matches */
7434                     char *li = ST.oldloc;
7435                     ST.count += n;
7436                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
7437                         sayNO;
7438                     assert(n == REG_INFTY || locinput == li);
7439                 }
7440                 CURLY_SETPAREN(ST.paren, ST.count);
7441                 if (cur_eval && cur_eval->u.eval.close_paren &&
7442                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
7443                     goto fake_end;
7444                 }
7445                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
7446             }
7447             /* NOTREACHED */
7448             NOT_REACHED; /* NOTREACHED */
7449
7450         case CURLY_B_min_fail:
7451             /* failed to find B in a non-greedy match where c1,c2 invalid */
7452
7453             REGCP_UNWIND(ST.cp);
7454             if (ST.paren) {
7455                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7456             }
7457             /* failed -- move forward one */
7458             {
7459                 char *li = locinput;
7460                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
7461                     sayNO;
7462                 }
7463                 locinput = li;
7464             }
7465             {
7466                 ST.count++;
7467                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
7468                         ST.count > 0)) /* count overflow ? */
7469                 {
7470                   curly_try_B_min:
7471                     CURLY_SETPAREN(ST.paren, ST.count);
7472                     if (cur_eval && cur_eval->u.eval.close_paren &&
7473                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
7474                         goto fake_end;
7475                     }
7476                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
7477                 }
7478             }
7479             sayNO;
7480             /* NOTREACHED */
7481             NOT_REACHED; /* NOTREACHED */
7482
7483           curly_try_B_max:
7484             /* a successful greedy match: now try to match B */
7485             if (cur_eval && cur_eval->u.eval.close_paren &&
7486                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7487                 goto fake_end;
7488             }
7489             {
7490                 bool could_match = locinput < reginfo->strend;
7491
7492                 /* If it could work, try it. */
7493                 if (ST.c1 != CHRTEST_VOID && could_match) {
7494                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
7495                     {
7496                         could_match = memEQ(locinput,
7497                                             ST.c1_utf8,
7498                                             UTF8SKIP(locinput))
7499                                     || memEQ(locinput,
7500                                              ST.c2_utf8,
7501                                              UTF8SKIP(locinput));
7502                     }
7503                     else {
7504                         could_match = UCHARAT(locinput) == ST.c1
7505                                       || UCHARAT(locinput) == ST.c2;
7506                     }
7507                 }
7508                 if (ST.c1 == CHRTEST_VOID || could_match) {
7509                     CURLY_SETPAREN(ST.paren, ST.count);
7510                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
7511                     /* NOTREACHED */
7512                     NOT_REACHED; /* NOTREACHED */
7513                 }
7514             }
7515             /* FALLTHROUGH */
7516
7517         case CURLY_B_max_fail:
7518             /* failed to find B in a greedy match */
7519
7520             REGCP_UNWIND(ST.cp);
7521             if (ST.paren) {
7522                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7523             }
7524             /*  back up. */
7525             if (--ST.count < ST.min)
7526                 sayNO;
7527             locinput = HOPc(locinput, -1);
7528             goto curly_try_B_max;
7529
7530 #undef ST
7531
7532         case END: /*  last op of main pattern  */
7533           fake_end:
7534             if (cur_eval) {
7535                 /* we've just finished A in /(??{A})B/; now continue with B */
7536
7537                 st->u.eval.prev_rex = rex_sv;           /* inner */
7538
7539                 /* Save *all* the positions. */
7540                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
7541                 rex_sv = cur_eval->u.eval.prev_rex;
7542                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7543                 SET_reg_curpm(rex_sv);
7544                 rex = ReANY(rex_sv);
7545                 rexi = RXi_GET(rex);
7546                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
7547
7548                 REGCP_SET(st->u.eval.lastcp);
7549
7550                 /* Restore parens of the outer rex without popping the
7551                  * savestack */
7552                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
7553                                         &maxopenparen);
7554
7555                 st->u.eval.prev_eval = cur_eval;
7556                 cur_eval = cur_eval->u.eval.prev_eval;
7557                 DEBUG_EXECUTE_r(
7558                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
7559                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
7560                 if ( nochange_depth )
7561                     nochange_depth--;
7562
7563                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
7564                                     locinput); /* match B */
7565             }
7566
7567             if (locinput < reginfo->till) {
7568                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7569                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
7570                                       PL_colors[4],
7571                                       (long)(locinput - startpos),
7572                                       (long)(reginfo->till - startpos),
7573                                       PL_colors[5]));
7574
7575                 sayNO_SILENT;           /* Cannot match: too short. */
7576             }
7577             sayYES;                     /* Success! */
7578
7579         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
7580             DEBUG_EXECUTE_r(
7581             PerlIO_printf(Perl_debug_log,
7582                 "%*s  %ssubpattern success...%s\n",
7583                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
7584             sayYES;                     /* Success! */
7585
7586 #undef  ST
7587 #define ST st->u.ifmatch
7588
7589         {
7590             char *newstart;
7591
7592         case SUSPEND:   /* (?>A) */
7593             ST.wanted = 1;
7594             newstart = locinput;
7595             goto do_ifmatch;
7596
7597         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
7598             ST.wanted = 0;
7599             goto ifmatch_trivial_fail_test;
7600
7601         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
7602             ST.wanted = 1;
7603           ifmatch_trivial_fail_test:
7604             if (scan->flags) {
7605                 char * const s = HOPBACKc(locinput, scan->flags);
7606                 if (!s) {
7607                     /* trivial fail */
7608                     if (logical) {
7609                         logical = 0;
7610                         sw = 1 - cBOOL(ST.wanted);
7611                     }
7612                     else if (ST.wanted)
7613                         sayNO;
7614                     next = scan + ARG(scan);
7615                     if (next == scan)
7616                         next = NULL;
7617                     break;
7618                 }
7619                 newstart = s;
7620             }
7621             else
7622                 newstart = locinput;
7623
7624           do_ifmatch:
7625             ST.me = scan;
7626             ST.logical = logical;
7627             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
7628
7629             /* execute body of (?...A) */
7630             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
7631             /* NOTREACHED */
7632             NOT_REACHED; /* NOTREACHED */
7633         }
7634
7635         case IFMATCH_A_fail: /* body of (?...A) failed */
7636             ST.wanted = !ST.wanted;
7637             /* FALLTHROUGH */
7638
7639         case IFMATCH_A: /* body of (?...A) succeeded */
7640             if (ST.logical) {
7641                 sw = cBOOL(ST.wanted);
7642             }
7643             else if (!ST.wanted)
7644                 sayNO;
7645
7646             if (OP(ST.me) != SUSPEND) {
7647                 /* restore old position except for (?>...) */
7648                 locinput = st->locinput;
7649             }
7650             scan = ST.me + ARG(ST.me);
7651             if (scan == ST.me)
7652                 scan = NULL;
7653             continue; /* execute B */
7654
7655 #undef ST
7656
7657         case LONGJMP: /*  alternative with many branches compiles to
7658                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
7659             next = scan + ARG(scan);
7660             if (next == scan)
7661                 next = NULL;
7662             break;
7663
7664         case COMMIT:  /*  (*COMMIT)  */
7665             reginfo->cutpoint = reginfo->strend;
7666             /* FALLTHROUGH */
7667
7668         case PRUNE:   /*  (*PRUNE)   */
7669             if (!scan->flags)
7670                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7671             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
7672             /* NOTREACHED */
7673             NOT_REACHED; /* NOTREACHED */
7674
7675         case COMMIT_next_fail:
7676             no_final = 1;
7677             /* FALLTHROUGH */
7678
7679         case OPFAIL:   /* (*FAIL)  */
7680             sayNO;
7681             /* NOTREACHED */
7682             NOT_REACHED; /* NOTREACHED */
7683
7684 #define ST st->u.mark
7685         case MARKPOINT: /*  (*MARK:foo)  */
7686             ST.prev_mark = mark_state;
7687             ST.mark_name = sv_commit = sv_yes_mark
7688                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7689             mark_state = st;
7690             ST.mark_loc = locinput;
7691             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
7692             /* NOTREACHED */
7693             NOT_REACHED; /* NOTREACHED */
7694
7695         case MARKPOINT_next:
7696             mark_state = ST.prev_mark;
7697             sayYES;
7698             /* NOTREACHED */
7699             NOT_REACHED; /* NOTREACHED */
7700
7701         case MARKPOINT_next_fail:
7702             if (popmark && sv_eq(ST.mark_name,popmark))
7703             {
7704                 if (ST.mark_loc > startpoint)
7705                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7706                 popmark = NULL; /* we found our mark */
7707                 sv_commit = ST.mark_name;
7708
7709                 DEBUG_EXECUTE_r({
7710                         PerlIO_printf(Perl_debug_log,
7711                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
7712                             REPORT_CODE_OFF+depth*2, "",
7713                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
7714                 });
7715             }
7716             mark_state = ST.prev_mark;
7717             sv_yes_mark = mark_state ?
7718                 mark_state->u.mark.mark_name : NULL;
7719             sayNO;
7720             /* NOTREACHED */
7721             NOT_REACHED; /* NOTREACHED */
7722
7723         case SKIP:  /*  (*SKIP)  */
7724             if (scan->flags) {
7725                 /* (*SKIP) : if we fail we cut here*/
7726                 ST.mark_name = NULL;
7727                 ST.mark_loc = locinput;
7728                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
7729             } else {
7730                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
7731                    otherwise do nothing.  Meaning we need to scan
7732                  */
7733                 regmatch_state *cur = mark_state;
7734                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7735
7736                 while (cur) {
7737                     if ( sv_eq( cur->u.mark.mark_name,
7738                                 find ) )
7739                     {
7740                         ST.mark_name = find;
7741                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
7742                     }
7743                     cur = cur->u.mark.prev_mark;
7744                 }
7745             }
7746             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
7747             break;
7748
7749         case SKIP_next_fail:
7750             if (ST.mark_name) {
7751                 /* (*CUT:NAME) - Set up to search for the name as we
7752                    collapse the stack*/
7753                 popmark = ST.mark_name;
7754             } else {
7755                 /* (*CUT) - No name, we cut here.*/
7756                 if (ST.mark_loc > startpoint)
7757                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7758                 /* but we set sv_commit to latest mark_name if there
7759                    is one so they can test to see how things lead to this
7760                    cut */
7761                 if (mark_state)
7762                     sv_commit=mark_state->u.mark.mark_name;
7763             }
7764             no_final = 1;
7765             sayNO;
7766             /* NOTREACHED */
7767             NOT_REACHED; /* NOTREACHED */
7768 #undef ST
7769
7770         case LNBREAK: /* \R */
7771             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
7772                 locinput += n;
7773             } else
7774                 sayNO;
7775             break;
7776
7777         default:
7778             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
7779                           PTR2UV(scan), OP(scan));
7780             Perl_croak(aTHX_ "regexp memory corruption");
7781
7782         /* this is a point to jump to in order to increment
7783          * locinput by one character */
7784           increment_locinput:
7785             assert(!NEXTCHR_IS_EOS);
7786             if (utf8_target) {
7787                 locinput += PL_utf8skip[nextchr];
7788                 /* locinput is allowed to go 1 char off the end, but not 2+ */
7789                 if (locinput > reginfo->strend)
7790                     sayNO;
7791             }
7792             else
7793                 locinput++;
7794             break;
7795
7796         } /* end switch */
7797
7798         /* switch break jumps here */
7799         scan = next; /* prepare to execute the next op and ... */
7800         continue;    /* ... jump back to the top, reusing st */
7801         /* NOTREACHED */
7802
7803       push_yes_state:
7804         /* push a state that backtracks on success */
7805         st->u.yes.prev_yes_state = yes_state;
7806         yes_state = st;
7807         /* FALLTHROUGH */
7808       push_state:
7809         /* push a new regex state, then continue at scan  */
7810         {
7811             regmatch_state *newst;
7812
7813             DEBUG_STACK_r({
7814                 regmatch_state *cur = st;
7815                 regmatch_state *curyes = yes_state;
7816                 int curd = depth;
7817                 regmatch_slab *slab = PL_regmatch_slab;
7818                 for (;curd > -1;cur--,curd--) {
7819                     if (cur < SLAB_FIRST(slab)) {
7820                         slab = slab->prev;
7821                         cur = SLAB_LAST(slab);
7822                     }
7823                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
7824                         REPORT_CODE_OFF + 2 + depth * 2,"",
7825                         curd, PL_reg_name[cur->resume_state],
7826                         (curyes == cur) ? "yes" : ""
7827                     );
7828                     if (curyes == cur)
7829                         curyes = cur->u.yes.prev_yes_state;
7830                 }
7831             } else
7832                 DEBUG_STATE_pp("push")
7833             );
7834             depth++;
7835             st->locinput = locinput;
7836             newst = st+1;
7837             if (newst >  SLAB_LAST(PL_regmatch_slab))
7838                 newst = S_push_slab(aTHX);
7839             PL_regmatch_state = newst;
7840
7841             locinput = pushinput;
7842             st = newst;
7843             continue;
7844             /* NOTREACHED */
7845         }
7846     }
7847
7848     /*
7849     * We get here only if there's trouble -- normally "case END" is
7850     * the terminating point.
7851     */
7852     Perl_croak(aTHX_ "corrupted regexp pointers");
7853     /* NOTREACHED */
7854     sayNO;
7855     NOT_REACHED; /* NOTREACHED */
7856
7857   yes:
7858     if (yes_state) {
7859         /* we have successfully completed a subexpression, but we must now
7860          * pop to the state marked by yes_state and continue from there */
7861         assert(st != yes_state);
7862 #ifdef DEBUGGING
7863         while (st != yes_state) {
7864             st--;
7865             if (st < SLAB_FIRST(PL_regmatch_slab)) {
7866                 PL_regmatch_slab = PL_regmatch_slab->prev;
7867                 st = SLAB_LAST(PL_regmatch_slab);
7868             }
7869             DEBUG_STATE_r({
7870                 if (no_final) {
7871                     DEBUG_STATE_pp("pop (no final)");
7872                 } else {
7873                     DEBUG_STATE_pp("pop (yes)");
7874                 }
7875             });
7876             depth--;
7877         }
7878 #else
7879         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
7880             || yes_state > SLAB_LAST(PL_regmatch_slab))
7881         {
7882             /* not in this slab, pop slab */
7883             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
7884             PL_regmatch_slab = PL_regmatch_slab->prev;
7885             st = SLAB_LAST(PL_regmatch_slab);
7886         }
7887         depth -= (st - yes_state);
7888 #endif
7889         st = yes_state;
7890         yes_state = st->u.yes.prev_yes_state;
7891         PL_regmatch_state = st;
7892
7893         if (no_final)
7894             locinput= st->locinput;
7895         state_num = st->resume_state + no_final;
7896         goto reenter_switch;
7897     }
7898
7899     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7900                           PL_colors[4], PL_colors[5]));
7901
7902     if (reginfo->info_aux_eval) {
7903         /* each successfully executed (?{...}) block does the equivalent of
7904          *   local $^R = do {...}
7905          * When popping the save stack, all these locals would be undone;
7906          * bypass this by setting the outermost saved $^R to the latest
7907          * value */
7908         /* I dont know if this is needed or works properly now.
7909          * see code related to PL_replgv elsewhere in this file.
7910          * Yves
7911          */
7912         if (oreplsv != GvSV(PL_replgv))
7913             sv_setsv(oreplsv, GvSV(PL_replgv));
7914     }
7915     result = 1;
7916     goto final_exit;
7917
7918   no:
7919     DEBUG_EXECUTE_r(
7920         PerlIO_printf(Perl_debug_log,
7921             "%*s  %sfailed...%s\n",
7922             REPORT_CODE_OFF+depth*2, "",
7923             PL_colors[4], PL_colors[5])
7924         );
7925
7926   no_silent:
7927     if (no_final) {
7928         if (yes_state) {
7929             goto yes;
7930         } else {
7931             goto final_exit;
7932         }
7933     }
7934     if (depth) {
7935         /* there's a previous state to backtrack to */
7936         st--;
7937         if (st < SLAB_FIRST(PL_regmatch_slab)) {
7938             PL_regmatch_slab = PL_regmatch_slab->prev;
7939             st = SLAB_LAST(PL_regmatch_slab);
7940         }
7941         PL_regmatch_state = st;
7942         locinput= st->locinput;
7943
7944         DEBUG_STATE_pp("pop");
7945         depth--;
7946         if (yes_state == st)
7947             yes_state = st->u.yes.prev_yes_state;
7948
7949         state_num = st->resume_state + 1; /* failure = success + 1 */
7950         goto reenter_switch;
7951     }
7952     result = 0;
7953
7954   final_exit:
7955     if (rex->intflags & PREGf_VERBARG_SEEN) {
7956         SV *sv_err = get_sv("REGERROR", 1);
7957         SV *sv_mrk = get_sv("REGMARK", 1);
7958         if (result) {
7959             sv_commit = &PL_sv_no;
7960             if (!sv_yes_mark)
7961                 sv_yes_mark = &PL_sv_yes;
7962         } else {
7963             if (!sv_commit)
7964                 sv_commit = &PL_sv_yes;
7965             sv_yes_mark = &PL_sv_no;
7966         }
7967         assert(sv_err);
7968         assert(sv_mrk);
7969         sv_setsv(sv_err, sv_commit);
7970         sv_setsv(sv_mrk, sv_yes_mark);
7971     }
7972
7973
7974     if (last_pushed_cv) {
7975         dSP;
7976         POP_MULTICALL;
7977         PERL_UNUSED_VAR(SP);
7978     }
7979
7980     assert(!result ||  locinput - reginfo->strbeg >= 0);
7981     return result ?  locinput - reginfo->strbeg : -1;
7982 }
7983
7984 /*
7985  - regrepeat - repeatedly match something simple, report how many
7986  *
7987  * What 'simple' means is a node which can be the operand of a quantifier like
7988  * '+', or {1,3}
7989  *
7990  * startposp - pointer a pointer to the start position.  This is updated
7991  *             to point to the byte following the highest successful
7992  *             match.
7993  * p         - the regnode to be repeatedly matched against.
7994  * reginfo   - struct holding match state, such as strend
7995  * max       - maximum number of things to match.
7996  * depth     - (for debugging) backtracking depth.
7997  */
7998 STATIC I32
7999 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8000             regmatch_info *const reginfo, I32 max, int depth)
8001 {
8002     char *scan;     /* Pointer to current position in target string */
8003     I32 c;
8004     char *loceol = reginfo->strend;   /* local version */
8005     I32 hardcount = 0;  /* How many matches so far */
8006     bool utf8_target = reginfo->is_utf8_target;
8007     unsigned int to_complement = 0;  /* Invert the result? */
8008     UV utf8_flags;
8009     _char_class_number classnum;
8010 #ifndef DEBUGGING
8011     PERL_UNUSED_ARG(depth);
8012 #endif
8013
8014     PERL_ARGS_ASSERT_REGREPEAT;
8015
8016     scan = *startposp;
8017     if (max == REG_INFTY)
8018         max = I32_MAX;
8019     else if (! utf8_target && loceol - scan > max)
8020         loceol = scan + max;
8021
8022     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8023      * to the maximum of how far we should go in it (leaving it set to the real
8024      * end, if the maximum permissible would take us beyond that).  This allows
8025      * us to make the loop exit condition that we haven't gone past <loceol> to
8026      * also mean that we haven't exceeded the max permissible count, saving a
8027      * test each time through the loop.  But it assumes that the OP matches a
8028      * single byte, which is true for most of the OPs below when applied to a
8029      * non-UTF-8 target.  Those relatively few OPs that don't have this
8030      * characteristic will have to compensate.
8031      *
8032      * There is no adjustment for UTF-8 targets, as the number of bytes per
8033      * character varies.  OPs will have to test both that the count is less
8034      * than the max permissible (using <hardcount> to keep track), and that we
8035      * are still within the bounds of the string (using <loceol>.  A few OPs
8036      * match a single byte no matter what the encoding.  They can omit the max
8037      * test if, for the UTF-8 case, they do the adjustment that was skipped
8038      * above.
8039      *
8040      * Thus, the code above sets things up for the common case; and exceptional
8041      * cases need extra work; the common case is to make sure <scan> doesn't
8042      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8043      * count doesn't exceed the maximum permissible */
8044
8045     switch (OP(p)) {
8046     case REG_ANY:
8047         if (utf8_target) {
8048             while (scan < loceol && hardcount < max && *scan != '\n') {
8049                 scan += UTF8SKIP(scan);
8050                 hardcount++;
8051             }
8052         } else {
8053             while (scan < loceol && *scan != '\n')
8054                 scan++;
8055         }
8056         break;
8057     case SANY:
8058         if (utf8_target) {
8059             while (scan < loceol && hardcount < max) {
8060                 scan += UTF8SKIP(scan);
8061                 hardcount++;
8062             }
8063         }
8064         else
8065             scan = loceol;
8066         break;
8067     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
8068         if (utf8_target && loceol - scan > max) {
8069
8070             /* <loceol> hadn't been adjusted in the UTF-8 case */
8071             scan +=  max;
8072         }
8073         else {
8074             scan = loceol;
8075         }
8076         break;
8077     case EXACTL:
8078         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8079         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8080             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8081         }
8082         /* FALLTHROUGH */
8083     case EXACT:
8084         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8085
8086         c = (U8)*STRING(p);
8087
8088         /* Can use a simple loop if the pattern char to match on is invariant
8089          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8090          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8091          * true iff it doesn't matter if the argument is in UTF-8 or not */
8092         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8093             if (utf8_target && loceol - scan > max) {
8094                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8095                  * since here, to match at all, 1 char == 1 byte */
8096                 loceol = scan + max;
8097             }
8098             while (scan < loceol && UCHARAT(scan) == c) {
8099                 scan++;
8100             }
8101         }
8102         else if (reginfo->is_utf8_pat) {
8103             if (utf8_target) {
8104                 STRLEN scan_char_len;
8105
8106                 /* When both target and pattern are UTF-8, we have to do
8107                  * string EQ */
8108                 while (hardcount < max
8109                        && scan < loceol
8110                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8111                        && memEQ(scan, STRING(p), scan_char_len))
8112                 {
8113                     scan += scan_char_len;
8114                     hardcount++;
8115                 }
8116             }
8117             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8118
8119                 /* Target isn't utf8; convert the character in the UTF-8
8120                  * pattern to non-UTF8, and do a simple loop */
8121                 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8122                 while (scan < loceol && UCHARAT(scan) == c) {
8123                     scan++;
8124                 }
8125             } /* else pattern char is above Latin1, can't possibly match the
8126                  non-UTF-8 target */
8127         }
8128         else {
8129
8130             /* Here, the string must be utf8; pattern isn't, and <c> is
8131              * different in utf8 than not, so can't compare them directly.
8132              * Outside the loop, find the two utf8 bytes that represent c, and
8133              * then look for those in sequence in the utf8 string */
8134             U8 high = UTF8_TWO_BYTE_HI(c);
8135             U8 low = UTF8_TWO_BYTE_LO(c);
8136
8137             while (hardcount < max
8138                     && scan + 1 < loceol
8139                     && UCHARAT(scan) == high
8140                     && UCHARAT(scan + 1) == low)
8141             {
8142                 scan += 2;
8143                 hardcount++;
8144             }
8145         }
8146         break;
8147
8148     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8149         assert(! reginfo->is_utf8_pat);
8150         /* FALLTHROUGH */
8151     case EXACTFA:
8152         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8153         goto do_exactf;
8154
8155     case EXACTFL:
8156         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8157         utf8_flags = FOLDEQ_LOCALE;
8158         goto do_exactf;
8159
8160     case EXACTF:   /* This node only generated for non-utf8 patterns */
8161         assert(! reginfo->is_utf8_pat);
8162         utf8_flags = 0;
8163         goto do_exactf;
8164
8165     case EXACTFLU8:
8166         if (! utf8_target) {
8167             break;
8168         }
8169         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8170                                     | FOLDEQ_S2_FOLDS_SANE;
8171         goto do_exactf;
8172
8173     case EXACTFU_SS:
8174     case EXACTFU:
8175         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8176
8177       do_exactf: {
8178         int c1, c2;
8179         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8180
8181         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8182
8183         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8184                                         reginfo))
8185         {
8186             if (c1 == CHRTEST_VOID) {
8187                 /* Use full Unicode fold matching */
8188                 char *tmpeol = reginfo->strend;
8189                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8190                 while (hardcount < max
8191                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8192                                              STRING(p), NULL, pat_len,
8193                                              reginfo->is_utf8_pat, utf8_flags))
8194                 {
8195                     scan = tmpeol;
8196                     tmpeol = reginfo->strend;
8197                     hardcount++;
8198                 }
8199             }
8200             else if (utf8_target) {
8201                 if (c1 == c2) {
8202                     while (scan < loceol
8203                            && hardcount < max
8204                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8205                     {
8206                         scan += UTF8SKIP(scan);
8207                         hardcount++;
8208                     }
8209                 }
8210                 else {
8211                     while (scan < loceol
8212                            && hardcount < max
8213                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8214                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8215                     {
8216                         scan += UTF8SKIP(scan);
8217                         hardcount++;
8218                     }
8219                 }
8220             }
8221             else if (c1 == c2) {
8222                 while (scan < loceol && UCHARAT(scan) == c1) {
8223                     scan++;
8224                 }
8225             }
8226             else {
8227                 while (scan < loceol &&
8228                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8229                 {
8230                     scan++;
8231                 }
8232             }
8233         }
8234         break;
8235     }
8236     case ANYOFL:
8237         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8238         /* FALLTHROUGH */
8239     case ANYOF:
8240         if (utf8_target) {
8241             while (hardcount < max
8242                    && scan < loceol
8243                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8244             {
8245                 scan += UTF8SKIP(scan);
8246                 hardcount++;
8247             }
8248         } else {
8249             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
8250                 scan++;
8251         }
8252         break;
8253
8254     /* The argument (FLAGS) to all the POSIX node types is the class number */
8255
8256     case NPOSIXL:
8257         to_complement = 1;
8258         /* FALLTHROUGH */
8259
8260     case POSIXL:
8261         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8262         if (! utf8_target) {
8263             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8264                                                                    *scan)))
8265             {
8266                 scan++;
8267             }
8268         } else {
8269             while (hardcount < max && scan < loceol
8270                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8271                                                                   (U8 *) scan)))
8272             {
8273                 scan += UTF8SKIP(scan);
8274                 hardcount++;
8275             }
8276         }
8277         break;
8278
8279     case POSIXD:
8280         if (utf8_target) {
8281             goto utf8_posix;
8282         }
8283         /* FALLTHROUGH */
8284
8285     case POSIXA:
8286         if (utf8_target && loceol - scan > max) {
8287
8288             /* We didn't adjust <loceol> at the beginning of this routine
8289              * because is UTF-8, but it is actually ok to do so, since here, to
8290              * match, 1 char == 1 byte. */
8291             loceol = scan + max;
8292         }
8293         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8294             scan++;
8295         }
8296         break;
8297
8298     case NPOSIXD:
8299         if (utf8_target) {
8300             to_complement = 1;
8301             goto utf8_posix;
8302         }
8303         /* FALLTHROUGH */
8304
8305     case NPOSIXA:
8306         if (! utf8_target) {
8307             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8308                 scan++;
8309             }
8310         }
8311         else {
8312
8313             /* The complement of something that matches only ASCII matches all
8314              * non-ASCII, plus everything in ASCII that isn't in the class. */
8315             while (hardcount < max && scan < loceol
8316                    && (! isASCII_utf8(scan)
8317                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8318             {
8319                 scan += UTF8SKIP(scan);
8320                 hardcount++;
8321             }
8322         }
8323         break;
8324
8325     case NPOSIXU:
8326         to_complement = 1;
8327         /* FALLTHROUGH */
8328
8329     case POSIXU:
8330         if (! utf8_target) {
8331             while (scan < loceol && to_complement
8332                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8333             {
8334                 scan++;
8335             }
8336         }
8337         else {
8338           utf8_posix:
8339             classnum = (_char_class_number) FLAGS(p);
8340             if (classnum < _FIRST_NON_SWASH_CC) {
8341
8342                 /* Here, a swash is needed for above-Latin1 code points.
8343                  * Process as many Latin1 code points using the built-in rules.
8344                  * Go to another loop to finish processing upon encountering
8345                  * the first Latin1 code point.  We could do that in this loop
8346                  * as well, but the other way saves having to test if the swash
8347                  * has been loaded every time through the loop: extra space to
8348                  * save a test. */
8349                 while (hardcount < max && scan < loceol) {
8350                     if (UTF8_IS_INVARIANT(*scan)) {
8351                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
8352                                                                    classnum))))
8353                         {
8354                             break;
8355                         }
8356                         scan++;
8357                     }
8358                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
8359                         if (! (to_complement
8360                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
8361                                                                      *(scan + 1)),
8362                                                     classnum))))
8363                         {
8364                             break;
8365                         }
8366                         scan += 2;
8367                     }
8368                     else {
8369                         goto found_above_latin1;
8370                     }
8371
8372                     hardcount++;
8373                 }
8374             }
8375             else {
8376                 /* For these character classes, the knowledge of how to handle
8377                  * every code point is compiled in to Perl via a macro.  This
8378                  * code is written for making the loops as tight as possible.
8379                  * It could be refactored to save space instead */
8380                 switch (classnum) {
8381                     case _CC_ENUM_SPACE:
8382                         while (hardcount < max
8383                                && scan < loceol
8384                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
8385                         {
8386                             scan += UTF8SKIP(scan);
8387                             hardcount++;
8388                         }
8389                         break;
8390                     case _CC_ENUM_BLANK:
8391                         while (hardcount < max
8392                                && scan < loceol
8393                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
8394                         {
8395                             scan += UTF8SKIP(scan);
8396                             hardcount++;
8397                         }
8398                         break;
8399                     case _CC_ENUM_XDIGIT:
8400                         while (hardcount < max
8401                                && scan < loceol
8402                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
8403                         {
8404                             scan += UTF8SKIP(scan);
8405                             hardcount++;
8406                         }
8407                         break;
8408                     case _CC_ENUM_VERTSPACE:
8409                         while (hardcount < max
8410                                && scan < loceol
8411                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
8412                         {
8413                             scan += UTF8SKIP(scan);
8414                             hardcount++;
8415                         }
8416                         break;
8417                     case _CC_ENUM_CNTRL:
8418                         while (hardcount < max
8419                                && scan < loceol
8420                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
8421                         {
8422                             scan += UTF8SKIP(scan);
8423                             hardcount++;
8424                         }
8425                         break;
8426                     default:
8427                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
8428                 }
8429             }
8430         }
8431         break;
8432
8433       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
8434
8435         /* Load the swash if not already present */
8436         if (! PL_utf8_swash_ptrs[classnum]) {
8437             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
8438             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
8439                                         "utf8",
8440                                         "",
8441                                         &PL_sv_undef, 1, 0,
8442                                         PL_XPosix_ptrs[classnum], &flags);
8443         }
8444
8445         while (hardcount < max && scan < loceol
8446                && to_complement ^ cBOOL(_generic_utf8(
8447                                        classnum,
8448                                        scan,
8449                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
8450                                                    (U8 *) scan,
8451                                                    TRUE))))
8452         {
8453             scan += UTF8SKIP(scan);
8454             hardcount++;
8455         }
8456         break;
8457
8458     case LNBREAK:
8459         if (utf8_target) {
8460             while (hardcount < max && scan < loceol &&
8461                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
8462                 scan += c;
8463                 hardcount++;
8464             }
8465         } else {
8466             /* LNBREAK can match one or two latin chars, which is ok, but we
8467              * have to use hardcount in this situation, and throw away the
8468              * adjustment to <loceol> done before the switch statement */
8469             loceol = reginfo->strend;
8470             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
8471                 scan+=c;
8472                 hardcount++;
8473             }
8474         }
8475         break;
8476
8477     case BOUNDL:
8478     case NBOUNDL:
8479         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8480         /* FALLTHROUGH */
8481     case BOUND:
8482     case BOUNDA:
8483     case BOUNDU:
8484     case EOS:
8485     case GPOS:
8486     case KEEPS:
8487     case NBOUND:
8488     case NBOUNDA:
8489     case NBOUNDU:
8490     case OPFAIL:
8491     case SBOL:
8492     case SEOL:
8493         /* These are all 0 width, so match right here or not at all. */
8494         break;
8495
8496     default:
8497         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
8498         /* NOTREACHED */
8499         NOT_REACHED; /* NOTREACHED */
8500
8501     }
8502
8503     if (hardcount)
8504         c = hardcount;
8505     else
8506         c = scan - *startposp;
8507     *startposp = scan;
8508
8509     DEBUG_r({
8510         GET_RE_DEBUG_FLAGS_DECL;
8511         DEBUG_EXECUTE_r({
8512             SV * const prop = sv_newmortal();
8513             regprop(prog, prop, p, reginfo, NULL);
8514             PerlIO_printf(Perl_debug_log,
8515                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
8516                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
8517         });
8518     });
8519
8520     return(c);
8521 }
8522
8523
8524 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
8525 /*
8526 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
8527 create a copy so that changes the caller makes won't change the shared one.
8528 If <altsvp> is non-null, will return NULL in it, for back-compat.
8529  */
8530 SV *
8531 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
8532 {
8533     PERL_ARGS_ASSERT_REGCLASS_SWASH;
8534
8535     if (altsvp) {
8536         *altsvp = NULL;
8537     }
8538
8539     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
8540 }
8541
8542 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
8543
8544 /*
8545  - reginclass - determine if a character falls into a character class
8546
8547   n is the ANYOF-type regnode
8548   p is the target string
8549   p_end points to one byte beyond the end of the target string
8550   utf8_target tells whether p is in UTF-8.
8551
8552   Returns true if matched; false otherwise.
8553
8554   Note that this can be a synthetic start class, a combination of various
8555   nodes, so things you think might be mutually exclusive, such as locale,
8556   aren't.  It can match both locale and non-locale
8557
8558  */
8559
8560 STATIC bool
8561 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
8562 {
8563     dVAR;
8564     const char flags = ANYOF_FLAGS(n);
8565     bool match = FALSE;
8566     UV c = *p;
8567
8568     PERL_ARGS_ASSERT_REGINCLASS;
8569
8570     /* If c is not already the code point, get it.  Note that
8571      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
8572     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
8573         STRLEN c_len = 0;
8574         c = utf8n_to_uvchr(p, p_end - p, &c_len,
8575                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
8576                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
8577                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
8578                  * UTF8_ALLOW_FFFF */
8579         if (c_len == (STRLEN)-1)
8580             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
8581         if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
8582             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
8583         }
8584     }
8585
8586     /* If this character is potentially in the bitmap, check it */
8587     if (c < NUM_ANYOF_CODE_POINTS) {
8588         if (ANYOF_BITMAP_TEST(n, c))
8589             match = TRUE;
8590         else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
8591                   && ! utf8_target
8592                   && ! isASCII(c))
8593         {
8594             match = TRUE;
8595         }
8596         else if (flags & ANYOF_LOCALE_FLAGS) {
8597             if ((flags & ANYOF_LOC_FOLD)
8598                 && c < 256
8599                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
8600             {
8601                 match = TRUE;
8602             }
8603             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
8604                      && c < 256
8605             ) {
8606
8607                 /* The data structure is arranged so bits 0, 2, 4, ... are set
8608                  * if the class includes the Posix character class given by
8609                  * bit/2; and 1, 3, 5, ... are set if the class includes the
8610                  * complemented Posix class given by int(bit/2).  So we loop
8611                  * through the bits, each time changing whether we complement
8612                  * the result or not.  Suppose for the sake of illustration
8613                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
8614                  * is set, it means there is a match for this ANYOF node if the
8615                  * character is in the class given by the expression (0 / 2 = 0
8616                  * = \w).  If it is in that class, isFOO_lc() will return 1,
8617                  * and since 'to_complement' is 0, the result will stay TRUE,
8618                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
8619                  * bit 1 is 1.  That means there is a match if the character
8620                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
8621                  * but will on bit 1.  On the second iteration 'to_complement'
8622                  * will be 1, so the exclusive or will reverse things, so we
8623                  * are testing for \W.  On the third iteration, 'to_complement'
8624                  * will be 0, and we would be testing for \s; the fourth
8625                  * iteration would test for \S, etc.
8626                  *
8627                  * Note that this code assumes that all the classes are closed
8628                  * under folding.  For example, if a character matches \w, then
8629                  * its fold does too; and vice versa.  This should be true for
8630                  * any well-behaved locale for all the currently defined Posix
8631                  * classes, except for :lower: and :upper:, which are handled
8632                  * by the pseudo-class :cased: which matches if either of the
8633                  * other two does.  To get rid of this assumption, an outer
8634                  * loop could be used below to iterate over both the source
8635                  * character, and its fold (if different) */
8636
8637                 int count = 0;
8638                 int to_complement = 0;
8639
8640                 while (count < ANYOF_MAX) {
8641                     if (ANYOF_POSIXL_TEST(n, count)
8642                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
8643                     {
8644                         match = TRUE;
8645                         break;
8646                     }
8647                     count++;
8648                     to_complement ^= 1;
8649                 }
8650             }
8651         }
8652     }
8653
8654
8655     /* If the bitmap didn't (or couldn't) match, and something outside the
8656      * bitmap could match, try that. */
8657     if (!match) {
8658         if (c >= NUM_ANYOF_CODE_POINTS
8659             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
8660         {
8661             match = TRUE;       /* Everything above the bitmap matches */
8662         }
8663         else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
8664                   || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
8665                   || ((flags & ANYOF_LOC_FOLD)
8666                        && IN_UTF8_CTYPE_LOCALE
8667                        && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
8668         {
8669             SV* only_utf8_locale = NULL;
8670             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
8671                                                        &only_utf8_locale, NULL);
8672             if (sw) {
8673                 U8 utf8_buffer[2];
8674                 U8 * utf8_p;
8675                 if (utf8_target) {
8676                     utf8_p = (U8 *) p;
8677                 } else { /* Convert to utf8 */
8678                     utf8_p = utf8_buffer;
8679                     append_utf8_from_native_byte(*p, &utf8_p);
8680                     utf8_p = utf8_buffer;
8681                 }
8682
8683                 if (swash_fetch(sw, utf8_p, TRUE)) {
8684                     match = TRUE;
8685                 }
8686             }
8687             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
8688                 match = _invlist_contains_cp(only_utf8_locale, c);
8689             }
8690         }
8691
8692         if (UNICODE_IS_SUPER(c)
8693             && (flags & ANYOF_WARN_SUPER)
8694             && ckWARN_d(WARN_NON_UNICODE))
8695         {
8696             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
8697                 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
8698         }
8699     }
8700
8701 #if ANYOF_INVERT != 1
8702     /* Depending on compiler optimization cBOOL takes time, so if don't have to
8703      * use it, don't */
8704 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
8705 #endif
8706
8707     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
8708     return (flags & ANYOF_INVERT) ^ match;
8709 }
8710
8711 STATIC U8 *
8712 S_reghop3(U8 *s, SSize_t off, const U8* lim)
8713 {
8714     /* return the position 'off' UTF-8 characters away from 's', forward if
8715      * 'off' >= 0, backwards if negative.  But don't go outside of position
8716      * 'lim', which better be < s  if off < 0 */
8717
8718     PERL_ARGS_ASSERT_REGHOP3;
8719
8720     if (off >= 0) {
8721         while (off-- && s < lim) {
8722             /* XXX could check well-formedness here */
8723             s += UTF8SKIP(s);
8724         }
8725     }
8726     else {
8727         while (off++ && s > lim) {
8728             s--;
8729             if (UTF8_IS_CONTINUED(*s)) {
8730                 while (s > lim && UTF8_IS_CONTINUATION(*s))
8731                     s--;
8732             }
8733             /* XXX could check well-formedness here */
8734         }
8735     }
8736     return s;
8737 }
8738
8739 STATIC U8 *
8740 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
8741 {
8742     PERL_ARGS_ASSERT_REGHOP4;
8743
8744     if (off >= 0) {
8745         while (off-- && s < rlim) {
8746             /* XXX could check well-formedness here */
8747             s += UTF8SKIP(s);
8748         }
8749     }
8750     else {
8751         while (off++ && s > llim) {
8752             s--;
8753             if (UTF8_IS_CONTINUED(*s)) {
8754                 while (s > llim && UTF8_IS_CONTINUATION(*s))
8755                     s--;
8756             }
8757             /* XXX could check well-formedness here */
8758         }
8759     }
8760     return s;
8761 }
8762
8763 /* like reghop3, but returns NULL on overrun, rather than returning last
8764  * char pos */
8765
8766 STATIC U8 *
8767 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
8768 {
8769     PERL_ARGS_ASSERT_REGHOPMAYBE3;
8770
8771     if (off >= 0) {
8772         while (off-- && s < lim) {
8773             /* XXX could check well-formedness here */
8774             s += UTF8SKIP(s);
8775         }
8776         if (off >= 0)
8777             return NULL;
8778     }
8779     else {
8780         while (off++ && s > lim) {
8781             s--;
8782             if (UTF8_IS_CONTINUED(*s)) {
8783                 while (s > lim && UTF8_IS_CONTINUATION(*s))
8784                     s--;
8785             }
8786             /* XXX could check well-formedness here */
8787         }
8788         if (off <= 0)
8789             return NULL;
8790     }
8791     return s;
8792 }
8793
8794
8795 /* when executing a regex that may have (?{}), extra stuff needs setting
8796    up that will be visible to the called code, even before the current
8797    match has finished. In particular:
8798
8799    * $_ is localised to the SV currently being matched;
8800    * pos($_) is created if necessary, ready to be updated on each call-out
8801      to code;
8802    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
8803      isn't set until the current pattern is successfully finished), so that
8804      $1 etc of the match-so-far can be seen;
8805    * save the old values of subbeg etc of the current regex, and  set then
8806      to the current string (again, this is normally only done at the end
8807      of execution)
8808 */
8809
8810 static void
8811 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
8812 {
8813     MAGIC *mg;
8814     regexp *const rex = ReANY(reginfo->prog);
8815     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8816
8817     eval_state->rex = rex;
8818
8819     if (reginfo->sv) {
8820         /* Make $_ available to executed code. */
8821         if (reginfo->sv != DEFSV) {
8822             SAVE_DEFSV;
8823             DEFSV_set(reginfo->sv);
8824         }
8825
8826         if (!(mg = mg_find_mglob(reginfo->sv))) {
8827             /* prepare for quick setting of pos */
8828             mg = sv_magicext_mglob(reginfo->sv);
8829             mg->mg_len = -1;
8830         }
8831         eval_state->pos_magic = mg;
8832         eval_state->pos       = mg->mg_len;
8833         eval_state->pos_flags = mg->mg_flags;
8834     }
8835     else
8836         eval_state->pos_magic = NULL;
8837
8838     if (!PL_reg_curpm) {
8839         /* PL_reg_curpm is a fake PMOP that we can attach the current
8840          * regex to and point PL_curpm at, so that $1 et al are visible
8841          * within a /(?{})/. It's just allocated once per interpreter the
8842          * first time its needed */
8843         Newxz(PL_reg_curpm, 1, PMOP);
8844 #ifdef USE_ITHREADS
8845         {
8846             SV* const repointer = &PL_sv_undef;
8847             /* this regexp is also owned by the new PL_reg_curpm, which
8848                will try to free it.  */
8849             av_push(PL_regex_padav, repointer);
8850             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
8851             PL_regex_pad = AvARRAY(PL_regex_padav);
8852         }
8853 #endif
8854     }
8855     SET_reg_curpm(reginfo->prog);
8856     eval_state->curpm = PL_curpm;
8857     PL_curpm = PL_reg_curpm;
8858     if (RXp_MATCH_COPIED(rex)) {
8859         /*  Here is a serious problem: we cannot rewrite subbeg,
8860             since it may be needed if this match fails.  Thus
8861             $` inside (?{}) could fail... */
8862         eval_state->subbeg     = rex->subbeg;
8863         eval_state->sublen     = rex->sublen;
8864         eval_state->suboffset  = rex->suboffset;
8865         eval_state->subcoffset = rex->subcoffset;
8866 #ifdef PERL_ANY_COW
8867         eval_state->saved_copy = rex->saved_copy;
8868 #endif
8869         RXp_MATCH_COPIED_off(rex);
8870     }
8871     else
8872         eval_state->subbeg = NULL;
8873     rex->subbeg = (char *)reginfo->strbeg;
8874     rex->suboffset = 0;
8875     rex->subcoffset = 0;
8876     rex->sublen = reginfo->strend - reginfo->strbeg;
8877 }
8878
8879
8880 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8881
8882 static void
8883 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8884 {
8885     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8886     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
8887     regmatch_slab *s;
8888
8889     Safefree(aux->poscache);
8890
8891     if (eval_state) {
8892
8893         /* undo the effects of S_setup_eval_state() */
8894
8895         if (eval_state->subbeg) {
8896             regexp * const rex = eval_state->rex;
8897             rex->subbeg     = eval_state->subbeg;
8898             rex->sublen     = eval_state->sublen;
8899             rex->suboffset  = eval_state->suboffset;
8900             rex->subcoffset = eval_state->subcoffset;
8901 #ifdef PERL_ANY_COW
8902             rex->saved_copy = eval_state->saved_copy;
8903 #endif
8904             RXp_MATCH_COPIED_on(rex);
8905         }
8906         if (eval_state->pos_magic)
8907         {
8908             eval_state->pos_magic->mg_len = eval_state->pos;
8909             eval_state->pos_magic->mg_flags =
8910                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8911                | (eval_state->pos_flags & MGf_BYTES);
8912         }
8913
8914         PL_curpm = eval_state->curpm;
8915     }
8916
8917     PL_regmatch_state = aux->old_regmatch_state;
8918     PL_regmatch_slab  = aux->old_regmatch_slab;
8919
8920     /* free all slabs above current one - this must be the last action
8921      * of this function, as aux and eval_state are allocated within
8922      * slabs and may be freed here */
8923
8924     s = PL_regmatch_slab->next;
8925     if (s) {
8926         PL_regmatch_slab->next = NULL;
8927         while (s) {
8928             regmatch_slab * const osl = s;
8929             s = s->next;
8930             Safefree(osl);
8931         }
8932     }
8933 }
8934
8935
8936 STATIC void
8937 S_to_utf8_substr(pTHX_ regexp *prog)
8938 {
8939     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8940      * on the converted value */
8941
8942     int i = 1;
8943
8944     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8945
8946     do {
8947         if (prog->substrs->data[i].substr
8948             && !prog->substrs->data[i].utf8_substr) {
8949             SV* const sv = newSVsv(prog->substrs->data[i].substr);
8950             prog->substrs->data[i].utf8_substr = sv;
8951             sv_utf8_upgrade(sv);
8952             if (SvVALID(prog->substrs->data[i].substr)) {
8953                 if (SvTAIL(prog->substrs->data[i].substr)) {
8954                     /* Trim the trailing \n that fbm_compile added last
8955                        time.  */
8956                     SvCUR_set(sv, SvCUR(sv) - 1);
8957                     /* Whilst this makes the SV technically "invalid" (as its
8958                        buffer is no longer followed by "\0") when fbm_compile()
8959                        adds the "\n" back, a "\0" is restored.  */
8960                     fbm_compile(sv, FBMcf_TAIL);
8961                 } else
8962                     fbm_compile(sv, 0);
8963             }
8964             if (prog->substrs->data[i].substr == prog->check_substr)
8965                 prog->check_utf8 = sv;
8966         }
8967     } while (i--);
8968 }
8969
8970 STATIC bool
8971 S_to_byte_substr(pTHX_ regexp *prog)
8972 {
8973     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8974      * on the converted value; returns FALSE if can't be converted. */
8975
8976     int i = 1;
8977
8978     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8979
8980     do {
8981         if (prog->substrs->data[i].utf8_substr
8982             && !prog->substrs->data[i].substr) {
8983             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8984             if (! sv_utf8_downgrade(sv, TRUE)) {
8985                 return FALSE;
8986             }
8987             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8988                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8989                     /* Trim the trailing \n that fbm_compile added last
8990                         time.  */
8991                     SvCUR_set(sv, SvCUR(sv) - 1);
8992                     fbm_compile(sv, FBMcf_TAIL);
8993                 } else
8994                     fbm_compile(sv, 0);
8995             }
8996             prog->substrs->data[i].substr = sv;
8997             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8998                 prog->check_substr = sv;
8999         }
9000     } while (i--);
9001
9002     return TRUE;
9003 }
9004
9005 /*
9006  * Local variables:
9007  * c-indentation-style: bsd
9008  * c-basic-offset: 4
9009  * indent-tabs-mode: nil
9010  * End:
9011  *
9012  * ex: set ts=8 sts=4 sw=4 et:
9013  */