regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid only if 'c', the character being looke-up, is an invariant under
 111  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 112  * everything matchable is straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 114                                 ? reginclass(prog,p,c,c+1,u)                \
 115                                 : ANYOF_BITMAP_TEST(p,*(c)))
 116
 117 /*
 118  * Forwards.
 119  */
 120
 121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 122 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 123
 124 #define HOPc(pos,off) \
 125         (char *)(reginfo->is_utf8_target \
 126             ? reghop3((U8*)pos, off, \
 127                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 128             : (U8*)(pos + off))
 129
 130 #define HOPBACKc(pos, off) \
 131         (char*)(reginfo->is_utf8_target \
 132             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \
 133             : (pos - off >= reginfo->strbeg)    \
 134                 ? (U8*)pos - off                \
 135                 : NULL)
 136
 137 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 138 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 139
 140 /* lim must be +ve. Returns NULL on overshoot */
 141 #define HOPMAYBE3(pos,off,lim) \
 142         (reginfo->is_utf8_target                        \
 143             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 144             : ((U8*)pos + off <= lim)                   \
 145                 ? (U8*)pos + off                        \
 146                 : NULL)
 147
 148 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 149  * off must be >=0; args should be vars rather than expressions */
 150 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 151     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 152     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 153
 154 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 155     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 156     : (U8*)(pos + off))
 157 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 158
 159 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 160 #define NEXTCHR_IS_EOS (nextchr < 0)
 161
 162 #define SET_nextchr \
 163     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 164
 165 #define SET_locinput(p) \
 166     locinput = (p);  \
 167     SET_nextchr
 168
 169
 170 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 171         if (!swash_ptr) {                                                     \
 172             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 173             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 174                                          1, 0, invlist, &flags);              \
 175             assert(swash_ptr);                                                \
 176         }                                                                     \
 177     } STMT_END
 178
 179 /* If in debug mode, we test that a known character properly matches */
 180 #ifdef DEBUGGING
 181 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 182                                           property_name,                      \
 183                                           invlist,                            \
 184                                           utf8_char_in_property)              \
 185         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 186         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 187 #else
 188 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 189                                           property_name,                      \
 190                                           invlist,                            \
 191                                           utf8_char_in_property)              \
 192         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 193 #endif
 194
 195 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 196                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 197                                         "",                                   \
 198                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 199                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 200
 201 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 202 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 203
 204 /* for use after a quantifier and before an EXACT-like node -- japhy */
 205 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 206  *
 207  * NOTE that *nothing* that affects backtracking should be in here, specifically
 208  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 209  * node that is in between two EXACT like nodes when ascertaining what the required
 210  * "follow" character is. This should probably be moved to regex compile time
 211  * although it may be done at run time beause of the REF possibility - more
 212  * investigation required. -- demerphq
 213 */
 214 #define JUMPABLE(rn) (                                                             \
 215     OP(rn) == OPEN ||                                                              \
 216     (OP(rn) == CLOSE &&                                                            \
 217      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 218     OP(rn) == EVAL ||                                                              \
 219     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 220     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 221     OP(rn) == KEEPS ||                                                             \
 222     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 223 )
 224 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 225
 226 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 227
 228 #if 0
 229 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 230    we don't need this definition.  XXX These are now out-of-sync*/
 231 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 232 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 233 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 234
 235 #else
 236 /* ... so we use this as its faster. */
 237 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 238 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 239 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 240 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 241
 242 #endif
 243
 244 /*
 245   Search for mandatory following text node; for lookahead, the text must
 246   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 247 */
 248 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 249     while (JUMPABLE(rn)) { \
 250         const OPCODE type = OP(rn); \
 251         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 252             rn = NEXTOPER(NEXTOPER(rn)); \
 253         else if (type == PLUS) \
 254             rn = NEXTOPER(rn); \
 255         else if (type == IFMATCH) \
 256             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 257         else rn += NEXT_OFF(rn); \
 258     } \
 259 } STMT_END
 260
 261 #define SLAB_FIRST(s) (&(s)->states[0])
 262 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 263
 264 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 265 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 266 static regmatch_state * S_push_slab(pTHX);
 267
 268 #define REGCP_PAREN_ELEMS 3
 269 #define REGCP_OTHER_ELEMS 3
 270 #define REGCP_FRAME_ELEMS 1
 271 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 272  * are needed for the regexp context stack bookkeeping. */
 273
 274 STATIC CHECKPOINT
 275 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 276 {
 277     const int retval = PL_savestack_ix;
 278     const int paren_elems_to_push =
 279                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 280     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 281     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 282     I32 p;
 283     GET_RE_DEBUG_FLAGS_DECL;
 284
 285     PERL_ARGS_ASSERT_REGCPPUSH;
 286
 287     if (paren_elems_to_push < 0)
 288         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 289                    (int)paren_elems_to_push, (int)maxopenparen,
 290                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 291
 292     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 293         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 294                    " out of range (%lu-%ld)",
 295                    total_elems,
 296                    (unsigned long)maxopenparen,
 297                    (long)parenfloor);
 298
 299     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 300
 301     DEBUG_BUFFERS_r(
 302         if ((int)maxopenparen > (int)parenfloor)
 303             Perl_re_exec_indentf( aTHX_
 304                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 305                 depth,
 306                 PTR2UV(rex),
 307                 PTR2UV(rex->offs)
 308             );
 309     );
 310     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 311 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 312         SSPUSHIV(rex->offs[p].end);
 313         SSPUSHIV(rex->offs[p].start);
 314         SSPUSHINT(rex->offs[p].start_tmp);
 315         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 316             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 317             depth,
 318             (UV)p,
 319             (IV)rex->offs[p].start,
 320             (IV)rex->offs[p].start_tmp,
 321             (IV)rex->offs[p].end
 322         ));
 323     }
 324 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 325     SSPUSHINT(maxopenparen);
 326     SSPUSHINT(rex->lastparen);
 327     SSPUSHINT(rex->lastcloseparen);
 328     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 329
 330     return retval;
 331 }
 332
 333 /* These are needed since we do not localize EVAL nodes: */
 334 #define REGCP_SET(cp)                                           \
 335     DEBUG_STATE_r(                                              \
 336         Perl_re_exec_indentf( aTHX_                                         \
 337             "Setting an EVAL scope, savestack=%"IVdf",\n",      \
 338             depth, (IV)PL_savestack_ix                          \
 339         )                                                       \
 340     );                                                          \
 341     cp = PL_savestack_ix
 342
 343 #define REGCP_UNWIND(cp)                                        \
 344     DEBUG_STATE_r(                                              \
 345         if (cp != PL_savestack_ix)                              \
 346             Perl_re_exec_indentf( aTHX_                                     \
 347                 "Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n",\
 348                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 349             )                                                   \
 350     );                                                          \
 351     regcpblow(cp)
 352
 353 #define UNWIND_PAREN(lp, lcp)               \
 354     for (n = rex->lastparen; n > lp; n--)   \
 355         rex->offs[n].end = -1;              \
 356     rex->lastparen = n;                     \
 357     rex->lastcloseparen = lcp;
 358
 359
 360 STATIC void
 361 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 362 {
 363     UV i;
 364     U32 paren;
 365     GET_RE_DEBUG_FLAGS_DECL;
 366
 367     PERL_ARGS_ASSERT_REGCPPOP;
 368
 369     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 370     i = SSPOPUV;
 371     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 372     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 373     rex->lastcloseparen = SSPOPINT;
 374     rex->lastparen = SSPOPINT;
 375     *maxopenparen_p = SSPOPINT;
 376
 377     i -= REGCP_OTHER_ELEMS;
 378     /* Now restore the parentheses context. */
 379     DEBUG_BUFFERS_r(
 380         if (i || rex->lastparen + 1 <= rex->nparens)
 381             Perl_re_exec_indentf( aTHX_
 382                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 383                 depth,
 384                 PTR2UV(rex),
 385                 PTR2UV(rex->offs)
 386             );
 387     );
 388     paren = *maxopenparen_p;
 389     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 390         SSize_t tmps;
 391         rex->offs[paren].start_tmp = SSPOPINT;
 392         rex->offs[paren].start = SSPOPIV;
 393         tmps = SSPOPIV;
 394         if (paren <= rex->lastparen)
 395             rex->offs[paren].end = tmps;
 396         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 397             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 398             depth,
 399             (UV)paren,
 400             (IV)rex->offs[paren].start,
 401             (IV)rex->offs[paren].start_tmp,
 402             (IV)rex->offs[paren].end,
 403             (paren > rex->lastparen ? "(skipped)" : ""));
 404         );
 405         paren--;
 406     }
 407 #if 1
 408     /* It would seem that the similar code in regtry()
 409      * already takes care of this, and in fact it is in
 410      * a better location to since this code can #if 0-ed out
 411      * but the code in regtry() is needed or otherwise tests
 412      * requiring null fields (pat.t#187 and split.t#{13,14}
 413      * (as of patchlevel 7877)  will fail.  Then again,
 414      * this code seems to be necessary or otherwise
 415      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 416      * --jhi updated by dapm */
 417     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 418         if (i > *maxopenparen_p)
 419             rex->offs[i].start = -1;
 420         rex->offs[i].end = -1;
 421         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 422             "    \\%"UVuf": %s   ..-1 undeffing\n",
 423             depth,
 424             (UV)i,
 425             (i > *maxopenparen_p) ? "-1" : "  "
 426         ));
 427     }
 428 #endif
 429 }
 430
 431 /* restore the parens and associated vars at savestack position ix,
 432  * but without popping the stack */
 433
 434 STATIC void
 435 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 436 {
 437     I32 tmpix = PL_savestack_ix;
 438     PERL_ARGS_ASSERT_REGCP_RESTORE;
 439
 440     PL_savestack_ix = ix;
 441     regcppop(rex, maxopenparen_p);
 442     PL_savestack_ix = tmpix;
 443 }
 444
 445 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 446
 447 STATIC bool
 448 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 449 {
 450     /* Returns a boolean as to whether or not 'character' is a member of the
 451      * Posix character class given by 'classnum' that should be equivalent to a
 452      * value in the typedef '_char_class_number'.
 453      *
 454      * Ideally this could be replaced by a just an array of function pointers
 455      * to the C library functions that implement the macros this calls.
 456      * However, to compile, the precise function signatures are required, and
 457      * these may vary from platform to to platform.  To avoid having to figure
 458      * out what those all are on each platform, I (khw) am using this method,
 459      * which adds an extra layer of function call overhead (unless the C
 460      * optimizer strips it away).  But we don't particularly care about
 461      * performance with locales anyway. */
 462
 463     switch ((_char_class_number) classnum) {
 464         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 465         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 466         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 467         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 468         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 469                                         || isUPPER_LC(character);
 470         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 471         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 472         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 473         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 474         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 475         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 476         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 477         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 478         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 479         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 480         default:    /* VERTSPACE should never occur in locales */
 481             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 482     }
 483
 484     NOT_REACHED; /* NOTREACHED */
 485     return FALSE;
 486 }
 487
 488 STATIC bool
 489 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 490 {
 491     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 492      * 'character' is a member of the Posix character class given by 'classnum'
 493      * that should be equivalent to a value in the typedef
 494      * '_char_class_number'.
 495      *
 496      * This just calls isFOO_lc on the code point for the character if it is in
 497      * the range 0-255.  Outside that range, all characters use Unicode
 498      * rules, ignoring any locale.  So use the Unicode function if this class
 499      * requires a swash, and use the Unicode macro otherwise. */
 500
 501     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 502
 503     if (UTF8_IS_INVARIANT(*character)) {
 504         return isFOO_lc(classnum, *character);
 505     }
 506     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 507         return isFOO_lc(classnum,
 508                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 509     }
 510
 511     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 512
 513     if (classnum < _FIRST_NON_SWASH_CC) {
 514
 515         /* Initialize the swash unless done already */
 516         if (! PL_utf8_swash_ptrs[classnum]) {
 517             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 518             PL_utf8_swash_ptrs[classnum] =
 519                     _core_swash_init("utf8",
 520                                      "",
 521                                      &PL_sv_undef, 1, 0,
 522                                      PL_XPosix_ptrs[classnum], &flags);
 523         }
 524
 525         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 526                                  character,
 527                                  TRUE /* is UTF */ ));
 528     }
 529
 530     switch ((_char_class_number) classnum) {
 531         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 532         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 533         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 534         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 535         default:                 break;
 536     }
 537
 538     return FALSE; /* Things like CNTRL are always below 256 */
 539 }
 540
 541 /*
 542  * pregexec and friends
 543  */
 544
 545 #ifndef PERL_IN_XSUB_RE
 546 /*
 547  - pregexec - match a regexp against a string
 548  */
 549 I32
 550 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 551          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 552 /* stringarg: the point in the string at which to begin matching */
 553 /* strend:    pointer to null at end of string */
 554 /* strbeg:    real beginning of string */
 555 /* minend:    end of match must be >= minend bytes after stringarg. */
 556 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 557  *            itself is accessed via the pointers above */
 558 /* nosave:    For optimizations. */
 559 {
 560     PERL_ARGS_ASSERT_PREGEXEC;
 561
 562     return
 563         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 564                       nosave ? 0 : REXEC_COPY_STR);
 565 }
 566 #endif
 567
 568
 569
 570 /* re_intuit_start():
 571  *
 572  * Based on some optimiser hints, try to find the earliest position in the
 573  * string where the regex could match.
 574  *
 575  *   rx:     the regex to match against
 576  *   sv:     the SV being matched: only used for utf8 flag; the string
 577  *           itself is accessed via the pointers below. Note that on
 578  *           something like an overloaded SV, SvPOK(sv) may be false
 579  *           and the string pointers may point to something unrelated to
 580  *           the SV itself.
 581  *   strbeg: real beginning of string
 582  *   strpos: the point in the string at which to begin matching
 583  *   strend: pointer to the byte following the last char of the string
 584  *   flags   currently unused; set to 0
 585  *   data:   currently unused; set to NULL
 586  *
 587  * The basic idea of re_intuit_start() is to use some known information
 588  * about the pattern, namely:
 589  *
 590  *   a) the longest known anchored substring (i.e. one that's at a
 591  *      constant offset from the beginning of the pattern; but not
 592  *      necessarily at a fixed offset from the beginning of the
 593  *      string);
 594  *   b) the longest floating substring (i.e. one that's not at a constant
 595  *      offset from the beginning of the pattern);
 596  *   c) Whether the pattern is anchored to the string; either
 597  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 598  *      or anchored to pos(): /\G/;
 599  *   d) A start class: a real or synthetic character class which
 600  *      represents which characters are legal at the start of the pattern;
 601  *
 602  * to either quickly reject the match, or to find the earliest position
 603  * within the string at which the pattern might match, thus avoiding
 604  * running the full NFA engine at those earlier locations, only to
 605  * eventually fail and retry further along.
 606  *
 607  * Returns NULL if the pattern can't match, or returns the address within
 608  * the string which is the earliest place the match could occur.
 609  *
 610  * The longest of the anchored and floating substrings is called 'check'
 611  * and is checked first. The other is called 'other' and is checked
 612  * second. The 'other' substring may not be present.  For example,
 613  *
 614  *    /(abc|xyz)ABC\d{0,3}DEFG/
 615  *
 616  * will have
 617  *
 618  *   check substr (float)    = "DEFG", offset 6..9 chars
 619  *   other substr (anchored) = "ABC",  offset 3..3 chars
 620  *   stclass = [ax]
 621  *
 622  * Be aware that during the course of this function, sometimes 'anchored'
 623  * refers to a substring being anchored relative to the start of the
 624  * pattern, and sometimes to the pattern itself being anchored relative to
 625  * the string. For example:
 626  *
 627  *   /\dabc/:   "abc" is anchored to the pattern;
 628  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 629  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 630  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 631  *                    but the pattern is anchored to the string.
 632  */
 633
 634 char *
 635 Perl_re_intuit_start(pTHX_
 636                     REGEXP * const rx,
 637                     SV *sv,
 638                     const char * const strbeg,
 639                     char *strpos,
 640                     char *strend,
 641                     const U32 flags,
 642                     re_scream_pos_data *data)
 643 {
 644     struct regexp *const prog = ReANY(rx);
 645     SSize_t start_shift = prog->check_offset_min;
 646     /* Should be nonnegative! */
 647     SSize_t end_shift   = 0;
 648     /* current lowest pos in string where the regex can start matching */
 649     char *rx_origin = strpos;
 650     SV *check;
 651     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 652     U8   other_ix = 1 - prog->substrs->check_ix;
 653     bool ml_anch = 0;
 654     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 655     char *check_at = NULL;              /* check substr found at this pos */
 656     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 657     RXi_GET_DECL(prog,progi);
 658     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 659     regmatch_info *const reginfo = &reginfo_buf;
 660     GET_RE_DEBUG_FLAGS_DECL;
 661
 662     PERL_ARGS_ASSERT_RE_INTUIT_START;
 663     PERL_UNUSED_ARG(flags);
 664     PERL_UNUSED_ARG(data);
 665
 666     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 667                 "Intuit: trying to determine minimum start position...\n"));
 668
 669     /* for now, assume that all substr offsets are positive. If at some point
 670      * in the future someone wants to do clever things with lookbehind and
 671      * -ve offsets, they'll need to fix up any code in this function
 672      * which uses these offsets. See the thread beginning
 673      * <20140113145929.GF27210@iabyn.com>
 674      */
 675     assert(prog->substrs->data[0].min_offset >= 0);
 676     assert(prog->substrs->data[0].max_offset >= 0);
 677     assert(prog->substrs->data[1].min_offset >= 0);
 678     assert(prog->substrs->data[1].max_offset >= 0);
 679     assert(prog->substrs->data[2].min_offset >= 0);
 680     assert(prog->substrs->data[2].max_offset >= 0);
 681
 682     /* for now, assume that if both present, that the floating substring
 683      * doesn't start before the anchored substring.
 684      * If you break this assumption (e.g. doing better optimisations
 685      * with lookahead/behind), then you'll need to audit the code in this
 686      * function carefully first
 687      */
 688     assert(
 689             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 690               && (prog->float_utf8    || prog->float_substr))
 691            || (prog->float_min_offset >= prog->anchored_offset));
 692
 693     /* byte rather than char calculation for efficiency. It fails
 694      * to quickly reject some cases that can't match, but will reject
 695      * them later after doing full char arithmetic */
 696     if (prog->minlen > strend - strpos) {
 697         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 698                               "  String too short...\n"));
 699         goto fail;
 700     }
 701
 702     RX_MATCH_UTF8_set(rx,utf8_target);
 703     reginfo->is_utf8_target = cBOOL(utf8_target);
 704     reginfo->info_aux = NULL;
 705     reginfo->strbeg = strbeg;
 706     reginfo->strend = strend;
 707     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 708     reginfo->intuit = 1;
 709     /* not actually used within intuit, but zero for safety anyway */
 710     reginfo->poscache_maxiter = 0;
 711
 712     if (utf8_target) {
 713         if ((!prog->anchored_utf8 && prog->anchored_substr)
 714                 || (!prog->float_utf8 && prog->float_substr))
 715             to_utf8_substr(prog);
 716         check = prog->check_utf8;
 717     } else {
 718         if (!prog->check_substr && prog->check_utf8) {
 719             if (! to_byte_substr(prog)) {
 720                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 721             }
 722         }
 723         check = prog->check_substr;
 724     }
 725
 726     /* dump the various substring data */
 727     DEBUG_OPTIMISE_MORE_r({
 728         int i;
 729         for (i=0; i<=2; i++) {
 730             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 731                                   : prog->substrs->data[i].substr);
 732             if (!sv)
 733                 continue;
 734
 735             Perl_re_printf( aTHX_
 736                 "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 737                 " useful=%"IVdf" utf8=%d [%s]\n",
 738                 i,
 739                 (IV)prog->substrs->data[i].min_offset,
 740                 (IV)prog->substrs->data[i].max_offset,
 741                 (IV)prog->substrs->data[i].end_shift,
 742                 BmUSEFUL(sv),
 743                 utf8_target ? 1 : 0,
 744                 SvPEEK(sv));
 745         }
 746     });
 747
 748     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 749
 750         /* ml_anch: check after \n?
 751          *
 752          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 753          * with /.*.../, these flags will have been added by the
 754          * compiler:
 755          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 756          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 757          */
 758         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 759                    && !(prog->intflags & PREGf_IMPLICIT);
 760
 761         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 762             /* we are only allowed to match at BOS or \G */
 763
 764             /* trivially reject if there's a BOS anchor and we're not at BOS.
 765              *
 766              * Note that we don't try to do a similar quick reject for
 767              * \G, since generally the caller will have calculated strpos
 768              * based on pos() and gofs, so the string is already correctly
 769              * anchored by definition; and handling the exceptions would
 770              * be too fiddly (e.g. REXEC_IGNOREPOS).
 771              */
 772             if (   strpos != strbeg
 773                 && (prog->intflags & PREGf_ANCH_SBOL))
 774             {
 775                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 776                                 "  Not at start...\n"));
 777                 goto fail;
 778             }
 779
 780             /* in the presence of an anchor, the anchored (relative to the
 781              * start of the regex) substr must also be anchored relative
 782              * to strpos. So quickly reject if substr isn't found there.
 783              * This works for \G too, because the caller will already have
 784              * subtracted gofs from pos, and gofs is the offset from the
 785              * \G to the start of the regex. For example, in /.abc\Gdef/,
 786              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 787              * caller will have set strpos=pos()-4; we look for the substr
 788              * at position pos()-4+1, which lines up with the "a" */
 789
 790             if (prog->check_offset_min == prog->check_offset_max) {
 791                 /* Substring at constant offset from beg-of-str... */
 792                 SSize_t slen = SvCUR(check);
 793                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 794
 795                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 796                     "  Looking for check substr at fixed offset %"IVdf"...\n",
 797                     (IV)prog->check_offset_min));
 798
 799                 if (SvTAIL(check)) {
 800                     /* In this case, the regex is anchored at the end too.
 801                      * Unless it's a multiline match, the lengths must match
 802                      * exactly, give or take a \n.  NB: slen >= 1 since
 803                      * the last char of check is \n */
 804                     if (!multiline
 805                         && (   strend - s > slen
 806                             || strend - s < slen - 1
 807                             || (strend - s == slen && strend[-1] != '\n')))
 808                     {
 809                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 810                                             "  String too long...\n"));
 811                         goto fail_finish;
 812                     }
 813                     /* Now should match s[0..slen-2] */
 814                     slen--;
 815                 }
 816                 if (slen && (strend - s < slen
 817                     || *SvPVX_const(check) != *s
 818                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
 819                 {
 820                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 821                                     "  String not equal...\n"));
 822                     goto fail_finish;
 823                 }
 824
 825                 check_at = s;
 826                 goto success_at_start;
 827             }
 828         }
 829     }
 830
 831     end_shift = prog->check_end_shift;
 832
 833 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 834     if (end_shift < 0)
 835         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 836                    (IV)end_shift, RX_PRECOMP(prog));
 837 #endif
 838
 839   restart:
 840
 841     /* This is the (re)entry point of the main loop in this function.
 842      * The goal of this loop is to:
 843      * 1) find the "check" substring in the region rx_origin..strend
 844      *    (adjusted by start_shift / end_shift). If not found, reject
 845      *    immediately.
 846      * 2) If it exists, look for the "other" substr too if defined; for
 847      *    example, if the check substr maps to the anchored substr, then
 848      *    check the floating substr, and vice-versa. If not found, go
 849      *    back to (1) with rx_origin suitably incremented.
 850      * 3) If we find an rx_origin position that doesn't contradict
 851      *    either of the substrings, then check the possible additional
 852      *    constraints on rx_origin of /^.../m or a known start class.
 853      *    If these fail, then depending on which constraints fail, jump
 854      *    back to here, or to various other re-entry points further along
 855      *    that skip some of the first steps.
 856      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 857      *    substring. If the start position was determined to be at the
 858      *    beginning of the string  - so, not rejected, but not optimised,
 859      *    since we have to run regmatch from position 0 - decrement the
 860      *    BmUSEFUL() count. Otherwise increment it.
 861      */
 862
 863
 864     /* first, look for the 'check' substring */
 865
 866     {
 867         U8* start_point;
 868         U8* end_point;
 869
 870         DEBUG_OPTIMISE_MORE_r({
 871             Perl_re_printf( aTHX_
 872                 "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 873                 " Start shift: %"IVdf" End shift %"IVdf
 874                 " Real end Shift: %"IVdf"\n",
 875                 (IV)(rx_origin - strbeg),
 876                 (IV)prog->check_offset_min,
 877                 (IV)start_shift,
 878                 (IV)end_shift,
 879                 (IV)prog->check_end_shift);
 880         });
 881
 882         end_point = HOP3(strend, -end_shift, strbeg);
 883         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 884         if (!start_point)
 885             goto fail_finish;
 886
 887
 888         /* If the regex is absolutely anchored to either the start of the
 889          * string (SBOL) or to pos() (ANCH_GPOS), then
 890          * check_offset_max represents an upper bound on the string where
 891          * the substr could start. For the ANCH_GPOS case, we assume that
 892          * the caller of intuit will have already set strpos to
 893          * pos()-gofs, so in this case strpos + offset_max will still be
 894          * an upper bound on the substr.
 895          */
 896         if (!ml_anch
 897             && prog->intflags & PREGf_ANCH
 898             && prog->check_offset_max != SSize_t_MAX)
 899         {
 900             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 901             const char * const anchor =
 902                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 903
 904             /* do a bytes rather than chars comparison. It's conservative;
 905              * so it skips doing the HOP if the result can't possibly end
 906              * up earlier than the old value of end_point.
 907              */
 908             if ((char*)end_point - anchor > prog->check_offset_max) {
 909                 end_point = HOP3lim((U8*)anchor,
 910                                 prog->check_offset_max,
 911                                 end_point -len)
 912                             + len;
 913             }
 914         }
 915
 916         check_at = fbm_instr( start_point, end_point,
 917                       check, multiline ? FBMrf_MULTILINE : 0);
 918
 919         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 920             "  doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
 921             (IV)((char*)start_point - strbeg),
 922             (IV)((char*)end_point   - strbeg),
 923             (IV)(check_at ? check_at - strbeg : -1)
 924         ));
 925
 926         /* Update the count-of-usability, remove useless subpatterns,
 927             unshift s.  */
 928
 929         DEBUG_EXECUTE_r({
 930             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 931                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 932             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
 933                               (check_at ? "Found" : "Did not find"),
 934                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 935                     ? "anchored" : "floating"),
 936                 quoted,
 937                 RE_SV_TAIL(check),
 938                 (check_at ? " at offset " : "...\n") );
 939         });
 940
 941         if (!check_at)
 942             goto fail_finish;
 943         /* set rx_origin to the minimum position where the regex could start
 944          * matching, given the constraint of the just-matched check substring.
 945          * But don't set it lower than previously.
 946          */
 947
 948         if (check_at - rx_origin > prog->check_offset_max)
 949             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 950         /* Finish the diagnostic message */
 951         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 952             "%ld (rx_origin now %"IVdf")...\n",
 953             (long)(check_at - strbeg),
 954             (IV)(rx_origin - strbeg)
 955         ));
 956     }
 957
 958
 959     /* now look for the 'other' substring if defined */
 960
 961     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 962                     : prog->substrs->data[other_ix].substr)
 963     {
 964         /* Take into account the "other" substring. */
 965         char *last, *last1;
 966         char *s;
 967         SV* must;
 968         struct reg_substr_datum *other;
 969
 970       do_other_substr:
 971         other = &prog->substrs->data[other_ix];
 972
 973         /* if "other" is anchored:
 974          * we've previously found a floating substr starting at check_at.
 975          * This means that the regex origin must lie somewhere
 976          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 977          * and max:                 HOP3(check_at, -check_offset_min)
 978          * (except that min will be >= strpos)
 979          * So the fixed  substr must lie somewhere between
 980          *  HOP3(min, anchored_offset)
 981          *  HOP3(max, anchored_offset) + SvCUR(substr)
 982          */
 983
 984         /* if "other" is floating
 985          * Calculate last1, the absolute latest point where the
 986          * floating substr could start in the string, ignoring any
 987          * constraints from the earlier fixed match. It is calculated
 988          * as follows:
 989          *
 990          * strend - prog->minlen (in chars) is the absolute latest
 991          * position within the string where the origin of the regex
 992          * could appear. The latest start point for the floating
 993          * substr is float_min_offset(*) on from the start of the
 994          * regex.  last1 simply combines thee two offsets.
 995          *
 996          * (*) You might think the latest start point should be
 997          * float_max_offset from the regex origin, and technically
 998          * you'd be correct. However, consider
 999          *    /a\d{2,4}bcd\w/
1000          * Here, float min, max are 3,5 and minlen is 7.
1001          * This can match either
1002          *    /a\d\dbcd\w/
1003          *    /a\d\d\dbcd\w/
1004          *    /a\d\d\d\dbcd\w/
1005          * In the first case, the regex matches minlen chars; in the
1006          * second, minlen+1, in the third, minlen+2.
1007          * In the first case, the floating offset is 3 (which equals
1008          * float_min), in the second, 4, and in the third, 5 (which
1009          * equals float_max). In all cases, the floating string bcd
1010          * can never start more than 4 chars from the end of the
1011          * string, which equals minlen - float_min. As the substring
1012          * starts to match more than float_min from the start of the
1013          * regex, it makes the regex match more than minlen chars,
1014          * and the two cancel each other out. So we can always use
1015          * float_min - minlen, rather than float_max - minlen for the
1016          * latest position in the string.
1017          *
1018          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1019          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1020          */
1021
1022         assert(prog->minlen >= other->min_offset);
1023         last1 = HOP3c(strend,
1024                         other->min_offset - prog->minlen, strbeg);
1025
1026         if (other_ix) {/* i.e. if (other-is-float) */
1027             /* last is the latest point where the floating substr could
1028              * start, *given* any constraints from the earlier fixed
1029              * match. This constraint is that the floating string starts
1030              * <= float_max_offset chars from the regex origin (rx_origin).
1031              * If this value is less than last1, use it instead.
1032              */
1033             assert(rx_origin <= last1);
1034             last =
1035                 /* this condition handles the offset==infinity case, and
1036                  * is a short-cut otherwise. Although it's comparing a
1037                  * byte offset to a char length, it does so in a safe way,
1038                  * since 1 char always occupies 1 or more bytes,
1039                  * so if a string range is  (last1 - rx_origin) bytes,
1040                  * it will be less than or equal to  (last1 - rx_origin)
1041                  * chars; meaning it errs towards doing the accurate HOP3
1042                  * rather than just using last1 as a short-cut */
1043                 (last1 - rx_origin) < other->max_offset
1044                     ? last1
1045                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1046         }
1047         else {
1048             assert(strpos + start_shift <= check_at);
1049             last = HOP4c(check_at, other->min_offset - start_shift,
1050                         strbeg, strend);
1051         }
1052
1053         s = HOP3c(rx_origin, other->min_offset, strend);
1054         if (s < other_last)     /* These positions already checked */
1055             s = other_last;
1056
1057         must = utf8_target ? other->utf8_substr : other->substr;
1058         assert(SvPOK(must));
1059         {
1060             char *from = s;
1061             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1062
1063             if (to > strend)
1064                 to = strend;
1065             if (from > to) {
1066                 s = NULL;
1067                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1068                     "  skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1069                     (IV)(from - strbeg),
1070                     (IV)(to   - strbeg)
1071                 ));
1072             }
1073             else {
1074                 s = fbm_instr(
1075                     (unsigned char*)from,
1076                     (unsigned char*)to,
1077                     must,
1078                     multiline ? FBMrf_MULTILINE : 0
1079                 );
1080                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1081                     "  doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1082                     (IV)(from - strbeg),
1083                     (IV)(to   - strbeg),
1084                     (IV)(s ? s - strbeg : -1)
1085                 ));
1086             }
1087         }
1088
1089         DEBUG_EXECUTE_r({
1090             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1091                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1092             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1093                 s ? "Found" : "Contradicts",
1094                 other_ix ? "floating" : "anchored",
1095                 quoted, RE_SV_TAIL(must));
1096         });
1097
1098
1099         if (!s) {
1100             /* last1 is latest possible substr location. If we didn't
1101              * find it before there, we never will */
1102             if (last >= last1) {
1103                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1104                                         "; giving up...\n"));
1105                 goto fail_finish;
1106             }
1107
1108             /* try to find the check substr again at a later
1109              * position. Maybe next time we'll find the "other" substr
1110              * in range too */
1111             other_last = HOP3c(last, 1, strend) /* highest failure */;
1112             rx_origin =
1113                 other_ix /* i.e. if other-is-float */
1114                     ? HOP3c(rx_origin, 1, strend)
1115                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1116             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1117                 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1118                 (other_ix ? "floating" : "anchored"),
1119                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1120                 (IV)(rx_origin - strbeg)
1121             ));
1122             goto restart;
1123         }
1124         else {
1125             if (other_ix) { /* if (other-is-float) */
1126                 /* other_last is set to s, not s+1, since its possible for
1127                  * a floating substr to fail first time, then succeed
1128                  * second time at the same floating position; e.g.:
1129                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1130                  * The first time round, anchored and float match at
1131                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1132                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1133                  */
1134                 other_last = s;
1135             }
1136             else {
1137                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1138                 other_last = HOP3c(s, 1, strend);
1139             }
1140             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1141                 " at offset %ld (rx_origin now %"IVdf")...\n",
1142                   (long)(s - strbeg),
1143                 (IV)(rx_origin - strbeg)
1144               ));
1145
1146         }
1147     }
1148     else {
1149         DEBUG_OPTIMISE_MORE_r(
1150             Perl_re_printf( aTHX_
1151                 "  Check-only match: offset min:%"IVdf" max:%"IVdf
1152                 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1153                 " strend:%"IVdf"\n",
1154                 (IV)prog->check_offset_min,
1155                 (IV)prog->check_offset_max,
1156                 (IV)(check_at-strbeg),
1157                 (IV)(rx_origin-strbeg),
1158                 (IV)(rx_origin-check_at),
1159                 (IV)(strend-strbeg)
1160             )
1161         );
1162     }
1163
1164   postprocess_substr_matches:
1165
1166     /* handle the extra constraint of /^.../m if present */
1167
1168     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1169         char *s;
1170
1171         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1172                         "  looking for /^/m anchor"));
1173
1174         /* we have failed the constraint of a \n before rx_origin.
1175          * Find the next \n, if any, even if it's beyond the current
1176          * anchored and/or floating substrings. Whether we should be
1177          * scanning ahead for the next \n or the next substr is debatable.
1178          * On the one hand you'd expect rare substrings to appear less
1179          * often than \n's. On the other hand, searching for \n means
1180          * we're effectively flipping between check_substr and "\n" on each
1181          * iteration as the current "rarest" string candidate, which
1182          * means for example that we'll quickly reject the whole string if
1183          * hasn't got a \n, rather than trying every substr position
1184          * first
1185          */
1186
1187         s = HOP3c(strend, - prog->minlen, strpos);
1188         if (s <= rx_origin ||
1189             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1190         {
1191             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1192                             "  Did not find /%s^%s/m...\n",
1193                             PL_colors[0], PL_colors[1]));
1194             goto fail_finish;
1195         }
1196
1197         /* earliest possible origin is 1 char after the \n.
1198          * (since *rx_origin == '\n', it's safe to ++ here rather than
1199          * HOP(rx_origin, 1)) */
1200         rx_origin++;
1201
1202         if (prog->substrs->check_ix == 0  /* check is anchored */
1203             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1204         {
1205             /* Position contradicts check-string; either because
1206              * check was anchored (and thus has no wiggle room),
1207              * or check was float and rx_origin is above the float range */
1208             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1209                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1210                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1211             goto restart;
1212         }
1213
1214         /* if we get here, the check substr must have been float,
1215          * is in range, and we may or may not have had an anchored
1216          * "other" substr which still contradicts */
1217         assert(prog->substrs->check_ix); /* check is float */
1218
1219         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1220             /* whoops, the anchored "other" substr exists, so we still
1221              * contradict. On the other hand, the float "check" substr
1222              * didn't contradict, so just retry the anchored "other"
1223              * substr */
1224             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1225                 "  Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
1226                 PL_colors[0], PL_colors[1],
1227                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1228                 (IV)(rx_origin - strbeg)
1229             ));
1230             goto do_other_substr;
1231         }
1232
1233         /* success: we don't contradict the found floating substring
1234          * (and there's no anchored substr). */
1235         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1236             "  Found /%s^%s/m with rx_origin %ld...\n",
1237             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1238     }
1239     else {
1240         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1241             "  (multiline anchor test skipped)\n"));
1242     }
1243
1244   success_at_start:
1245
1246
1247     /* if we have a starting character class, then test that extra constraint.
1248      * (trie stclasses are too expensive to use here, we are better off to
1249      * leave it to regmatch itself) */
1250
1251     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1252         const U8* const str = (U8*)STRING(progi->regstclass);
1253
1254         /* XXX this value could be pre-computed */
1255         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1256                     ?  (reginfo->is_utf8_pat
1257                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1258                         : STR_LEN(progi->regstclass))
1259                     : 1);
1260         char * endpos;
1261         char *s;
1262         /* latest pos that a matching float substr constrains rx start to */
1263         char *rx_max_float = NULL;
1264
1265         /* if the current rx_origin is anchored, either by satisfying an
1266          * anchored substring constraint, or a /^.../m constraint, then we
1267          * can reject the current origin if the start class isn't found
1268          * at the current position. If we have a float-only match, then
1269          * rx_origin is constrained to a range; so look for the start class
1270          * in that range. if neither, then look for the start class in the
1271          * whole rest of the string */
1272
1273         /* XXX DAPM it's not clear what the minlen test is for, and why
1274          * it's not used in the floating case. Nothing in the test suite
1275          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1276          * Here are some old comments, which may or may not be correct:
1277          *
1278          *   minlen == 0 is possible if regstclass is \b or \B,
1279          *   and the fixed substr is ''$.
1280          *   Since minlen is already taken into account, rx_origin+1 is
1281          *   before strend; accidentally, minlen >= 1 guaranties no false
1282          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1283          *   0) below assumes that regstclass does not come from lookahead...
1284          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1285          *   This leaves EXACTF-ish only, which are dealt with in
1286          *   find_byclass().
1287          */
1288
1289         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1290             endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1291         else if (prog->float_substr || prog->float_utf8) {
1292             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1293             endpos= HOP3c(rx_max_float, cl_l, strend);
1294         }
1295         else
1296             endpos= strend;
1297
1298         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1299             "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1300             " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1301               (IV)start_shift, (IV)(check_at - strbeg),
1302               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1303
1304         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1305                             reginfo);
1306         if (!s) {
1307             if (endpos == strend) {
1308                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1309                                 "  Could not match STCLASS...\n") );
1310                 goto fail;
1311             }
1312             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1313                                "  This position contradicts STCLASS...\n") );
1314             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1315                         && !(prog->intflags & PREGf_IMPLICIT))
1316                 goto fail;
1317
1318             /* Contradict one of substrings */
1319             if (prog->anchored_substr || prog->anchored_utf8) {
1320                 if (prog->substrs->check_ix == 1) { /* check is float */
1321                     /* Have both, check_string is floating */
1322                     assert(rx_origin + start_shift <= check_at);
1323                     if (rx_origin + start_shift != check_at) {
1324                         /* not at latest position float substr could match:
1325                          * Recheck anchored substring, but not floating.
1326                          * The condition above is in bytes rather than
1327                          * chars for efficiency. It's conservative, in
1328                          * that it errs on the side of doing 'goto
1329                          * do_other_substr'. In this case, at worst,
1330                          * an extra anchored search may get done, but in
1331                          * practice the extra fbm_instr() is likely to
1332                          * get skipped anyway. */
1333                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1334                             "  about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1335                             (long)(other_last - strbeg),
1336                             (IV)(rx_origin - strbeg)
1337                         ));
1338                         goto do_other_substr;
1339                     }
1340                 }
1341             }
1342             else {
1343                 /* float-only */
1344
1345                 if (ml_anch) {
1346                     /* In the presence of ml_anch, we might be able to
1347                      * find another \n without breaking the current float
1348                      * constraint. */
1349
1350                     /* strictly speaking this should be HOP3c(..., 1, ...),
1351                      * but since we goto a block of code that's going to
1352                      * search for the next \n if any, its safe here */
1353                     rx_origin++;
1354                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1355                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1356                               PL_colors[0], PL_colors[1],
1357                               (long)(rx_origin - strbeg)) );
1358                     goto postprocess_substr_matches;
1359                 }
1360
1361                 /* strictly speaking this can never be true; but might
1362                  * be if we ever allow intuit without substrings */
1363                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1364                     goto fail;
1365
1366                 rx_origin = rx_max_float;
1367             }
1368
1369             /* at this point, any matching substrings have been
1370              * contradicted. Start again... */
1371
1372             rx_origin = HOP3c(rx_origin, 1, strend);
1373
1374             /* uses bytes rather than char calculations for efficiency.
1375              * It's conservative: it errs on the side of doing 'goto restart',
1376              * where there is code that does a proper char-based test */
1377             if (rx_origin + start_shift + end_shift > strend) {
1378                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1379                                        "  Could not match STCLASS...\n") );
1380                 goto fail;
1381             }
1382             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1383                 "  about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1384                 (prog->substrs->check_ix ? "floating" : "anchored"),
1385                 (long)(rx_origin + start_shift - strbeg),
1386                 (IV)(rx_origin - strbeg)
1387             ));
1388             goto restart;
1389         }
1390
1391         /* Success !!! */
1392
1393         if (rx_origin != s) {
1394             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1395                         "  By STCLASS: moving %ld --> %ld\n",
1396                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1397                    );
1398         }
1399         else {
1400             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1401                                   "  Does not contradict STCLASS...\n");
1402                    );
1403         }
1404     }
1405
1406     /* Decide whether using the substrings helped */
1407
1408     if (rx_origin != strpos) {
1409         /* Fixed substring is found far enough so that the match
1410            cannot start at strpos. */
1411
1412         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1413         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1414     }
1415     else {
1416         /* The found rx_origin position does not prohibit matching at
1417          * strpos, so calling intuit didn't gain us anything. Decrement
1418          * the BmUSEFUL() count on the check substring, and if we reach
1419          * zero, free it.  */
1420         if (!(prog->intflags & PREGf_NAUGHTY)
1421             && (utf8_target ? (
1422                 prog->check_utf8                /* Could be deleted already */
1423                 && --BmUSEFUL(prog->check_utf8) < 0
1424                 && (prog->check_utf8 == prog->float_utf8)
1425             ) : (
1426                 prog->check_substr              /* Could be deleted already */
1427                 && --BmUSEFUL(prog->check_substr) < 0
1428                 && (prog->check_substr == prog->float_substr)
1429             )))
1430         {
1431             /* If flags & SOMETHING - do not do it many times on the same match */
1432             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1433             /* XXX Does the destruction order has to change with utf8_target? */
1434             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1435             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1436             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1437             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1438             check = NULL;                       /* abort */
1439             /* XXXX This is a remnant of the old implementation.  It
1440                     looks wasteful, since now INTUIT can use many
1441                     other heuristics. */
1442             prog->extflags &= ~RXf_USE_INTUIT;
1443         }
1444     }
1445
1446     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1447             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1448              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1449
1450     return rx_origin;
1451
1452   fail_finish:                          /* Substring not found */
1453     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1454         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1455   fail:
1456     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1457                           PL_colors[4], PL_colors[5]));
1458     return NULL;
1459 }
1460
1461
1462 #define DECL_TRIE_TYPE(scan) \
1463     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1464                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1465                  trie_utf8l, trie_flu8 }                                            \
1466                     trie_type = ((scan->flags == EXACT)                             \
1467                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1468                                  : (scan->flags == EXACTL)                          \
1469                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1470                                     : (scan->flags == EXACTFA)                      \
1471                                       ? (utf8_target                                \
1472                                          ? trie_utf8_exactfa_fold                   \
1473                                          : trie_latin_utf8_exactfa_fold)            \
1474                                       : (scan->flags == EXACTFLU8                   \
1475                                          ? trie_flu8                                \
1476                                          : (utf8_target                             \
1477                                            ? trie_utf8_fold                         \
1478                                            :   trie_latin_utf8_fold)))
1479
1480 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1481 STMT_START {                                                                        \
1482     STRLEN skiplen;                                                                 \
1483     U8 flags = FOLD_FLAGS_FULL;                                                     \
1484     switch (trie_type) {                                                            \
1485     case trie_flu8:                                                                 \
1486         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1487         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1488             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1489         }                                                                           \
1490         goto do_trie_utf8_fold;                                                     \
1491     case trie_utf8_exactfa_fold:                                                    \
1492         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1493         /* FALLTHROUGH */                                                           \
1494     case trie_utf8_fold:                                                            \
1495       do_trie_utf8_fold:                                                            \
1496         if ( foldlen>0 ) {                                                          \
1497             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1498             foldlen -= len;                                                         \
1499             uscan += len;                                                           \
1500             len=0;                                                                  \
1501         } else {                                                                    \
1502             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1503             len = UTF8SKIP(uc);                                                     \
1504             skiplen = UVCHR_SKIP( uvc );                                            \
1505             foldlen -= skiplen;                                                     \
1506             uscan = foldbuf + skiplen;                                              \
1507         }                                                                           \
1508         break;                                                                      \
1509     case trie_latin_utf8_exactfa_fold:                                              \
1510         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1511         /* FALLTHROUGH */                                                           \
1512     case trie_latin_utf8_fold:                                                      \
1513         if ( foldlen>0 ) {                                                          \
1514             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1515             foldlen -= len;                                                         \
1516             uscan += len;                                                           \
1517             len=0;                                                                  \
1518         } else {                                                                    \
1519             len = 1;                                                                \
1520             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1521             skiplen = UVCHR_SKIP( uvc );                                            \
1522             foldlen -= skiplen;                                                     \
1523             uscan = foldbuf + skiplen;                                              \
1524         }                                                                           \
1525         break;                                                                      \
1526     case trie_utf8l:                                                                \
1527         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1528         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1529             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1530         }                                                                           \
1531         /* FALLTHROUGH */                                                           \
1532     case trie_utf8:                                                                 \
1533         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1534         break;                                                                      \
1535     case trie_plain:                                                                \
1536         uvc = (UV)*uc;                                                              \
1537         len = 1;                                                                    \
1538     }                                                                               \
1539     if (uvc < 256) {                                                                \
1540         charid = trie->charmap[ uvc ];                                              \
1541     }                                                                               \
1542     else {                                                                          \
1543         charid = 0;                                                                 \
1544         if (widecharmap) {                                                          \
1545             SV** const svpp = hv_fetch(widecharmap,                                 \
1546                         (char*)&uvc, sizeof(UV), 0);                                \
1547             if (svpp)                                                               \
1548                 charid = (U16)SvIV(*svpp);                                          \
1549         }                                                                           \
1550     }                                                                               \
1551 } STMT_END
1552
1553 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1554     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1555                 startpos, doutf8, depth)
1556
1557 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1558 STMT_START {                                              \
1559     while (s <= e) {                                      \
1560         if ( (COND)                                       \
1561              && (ln == 1 || folder(s, pat_string, ln))    \
1562              && (reginfo->intuit || regtry(reginfo, &s)) )\
1563             goto got_it;                                  \
1564         s++;                                              \
1565     }                                                     \
1566 } STMT_END
1567
1568 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1569 STMT_START {                                          \
1570     while (s < strend) {                              \
1571         CODE                                          \
1572         s += UTF8SKIP(s);                             \
1573     }                                                 \
1574 } STMT_END
1575
1576 #define REXEC_FBC_SCAN(CODE)                          \
1577 STMT_START {                                          \
1578     while (s < strend) {                              \
1579         CODE                                          \
1580         s++;                                          \
1581     }                                                 \
1582 } STMT_END
1583
1584 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1585 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1586     if (COND) {                                                \
1587         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1588             goto got_it;                                       \
1589         else                                                   \
1590             tmp = doevery;                                     \
1591     }                                                          \
1592     else                                                       \
1593         tmp = 1;                                               \
1594 )
1595
1596 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1597 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1598     if (COND) {                                                \
1599         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1600             goto got_it;                                       \
1601         else                                                   \
1602             tmp = doevery;                                     \
1603     }                                                          \
1604     else                                                       \
1605         tmp = 1;                                               \
1606 )
1607
1608 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1609     if (utf8_target) {                                         \
1610         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1611     }                                                          \
1612     else {                                                     \
1613         REXEC_FBC_CLASS_SCAN(COND);                            \
1614     }
1615
1616 /* The three macros below are slightly different versions of the same logic.
1617  *
1618  * The first is for /a and /aa when the target string is UTF-8.  This can only
1619  * match ascii, but it must advance based on UTF-8.   The other two handle the
1620  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1621  * for the boundary (or non-boundary) between a word and non-word character.
1622  * The utf8 and non-utf8 cases have the same logic, but the details must be
1623  * different.  Find the "wordness" of the character just prior to this one, and
1624  * compare it with the wordness of this one.  If they differ, we have a
1625  * boundary.  At the beginning of the string, pretend that the previous
1626  * character was a new-line.
1627  *
1628  * All these macros uncleanly have side-effects with each other and outside
1629  * variables.  So far it's been too much trouble to clean-up
1630  *
1631  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1632  *               a word character or not.
1633  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1634  *               word/non-word
1635  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1636  *
1637  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1638  * are looking for a boundary or for a non-boundary.  If we are looking for a
1639  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1640  * see if this tentative match actually works, and if so, to quit the loop
1641  * here.  And vice-versa if we are looking for a non-boundary.
1642  *
1643  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1644  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1645  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1646  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1647  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1648  * complement.  But in that branch we complement tmp, meaning that at the
1649  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1650  * which means at the top of the loop in the next iteration, it is
1651  * TEST_NON_UTF8(s-1) */
1652 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1653     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1654     tmp = TEST_NON_UTF8(tmp);                                                  \
1655     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1656         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1657             tmp = !tmp;                                                        \
1658             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1659         }                                                                      \
1660         else {                                                                 \
1661             IF_FAIL;                                                           \
1662         }                                                                      \
1663     );                                                                         \
1664
1665 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1666  * TEST_UTF8 is a macro that for the same input code points returns identically
1667  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1668 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1669     if (s == reginfo->strbeg) {                                                \
1670         tmp = '\n';                                                            \
1671     }                                                                          \
1672     else { /* Back-up to the start of the previous character */                \
1673         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1674         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1675                                                        0, UTF8_ALLOW_DEFAULT); \
1676     }                                                                          \
1677     tmp = TEST_UV(tmp);                                                        \
1678     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1679     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1680         if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1681             tmp = !tmp;                                                        \
1682             IF_SUCCESS;                                                        \
1683         }                                                                      \
1684         else {                                                                 \
1685             IF_FAIL;                                                           \
1686         }                                                                      \
1687     );
1688
1689 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1690  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1691  * macros below */
1692 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1693     if (utf8_target) {                                                         \
1694         UTF8_CODE                                                              \
1695     }                                                                          \
1696     else {  /* Not utf8 */                                                     \
1697         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1698         tmp = TEST_NON_UTF8(tmp);                                              \
1699         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1700             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1701                 IF_SUCCESS;                                                    \
1702                 tmp = !tmp;                                                    \
1703             }                                                                  \
1704             else {                                                             \
1705                 IF_FAIL;                                                       \
1706             }                                                                  \
1707         );                                                                     \
1708     }                                                                          \
1709     /* Here, things have been set up by the previous code so that tmp is the   \
1710      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1711      * utf8ness of the target).  We also have to check if this matches against \
1712      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1713      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1714      * string */                                                               \
1715     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1716         IF_SUCCESS;                                                            \
1717     }                                                                          \
1718     else {                                                                     \
1719         IF_FAIL;                                                               \
1720     }
1721
1722 /* This is the macro to use when we want to see if something that looks like it
1723  * could match, actually does, and if so exits the loop */
1724 #define REXEC_FBC_TRYIT                            \
1725     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1726         goto got_it
1727
1728 /* The only difference between the BOUND and NBOUND cases is that
1729  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1730  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1731  * with the other one being empty (PLACEHOLDER is defined as empty).
1732  *
1733  * The TEST_FOO parameters are for operating on different forms of input, but
1734  * all should be ones that return identically for the same underlying code
1735  * points */
1736 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1737     FBC_BOUND_COMMON(                                                          \
1738           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1739           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1740
1741 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1742     FBC_BOUND_COMMON(                                                          \
1743             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1744             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1745
1746 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1747     FBC_BOUND_COMMON(                                                          \
1748           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1749           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1750
1751 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1752     FBC_BOUND_COMMON(                                                          \
1753             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1754             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1755
1756 #ifdef DEBUGGING
1757 static IV
1758 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1759   IV cp_out = Perl__invlist_search(invlist, cp_in);
1760   assert(cp_out >= 0);
1761   return cp_out;
1762 }
1763 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1764         invmap[S_get_break_val_cp_checked(invlist, cp)]
1765 #else
1766 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1767         invmap[_invlist_search(invlist, cp)]
1768 #endif
1769
1770 /* Takes a pointer to an inversion list, a pointer to its corresponding
1771  * inversion map, and a code point, and returns the code point's value
1772  * according to the two arrays.  It assumes that all code points have a value.
1773  * This is used as the base macro for macros for particular properties */
1774 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1775         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1776
1777 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1778  * of a code point, returning the value for the first code point in the string.
1779  * And it takes the particular macro name that finds the desired value given a
1780  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1781 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1782              (__ASSERT_(pos < strend)                                          \
1783                  /* Note assumes is valid UTF-8 */                             \
1784              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1785
1786 /* Returns the GCB value for the input code point */
1787 #define getGCB_VAL_CP(cp)                                                      \
1788           _generic_GET_BREAK_VAL_CP(                                           \
1789                                     PL_GCB_invlist,                            \
1790                                     _Perl_GCB_invmap,                          \
1791                                     (cp))
1792
1793 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1794  * bounded by pos and strend */
1795 #define getGCB_VAL_UTF8(pos, strend)                                           \
1796     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1797
1798 /* Returns the LB value for the input code point */
1799 #define getLB_VAL_CP(cp)                                                       \
1800           _generic_GET_BREAK_VAL_CP(                                           \
1801                                     PL_LB_invlist,                             \
1802                                     _Perl_LB_invmap,                           \
1803                                     (cp))
1804
1805 /* Returns the LB value for the first code point in the UTF-8 encoded string
1806  * bounded by pos and strend */
1807 #define getLB_VAL_UTF8(pos, strend)                                            \
1808     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1809
1810
1811 /* Returns the SB value for the input code point */
1812 #define getSB_VAL_CP(cp)                                                       \
1813           _generic_GET_BREAK_VAL_CP(                                           \
1814                                     PL_SB_invlist,                             \
1815                                     _Perl_SB_invmap,                     \
1816                                     (cp))
1817
1818 /* Returns the SB value for the first code point in the UTF-8 encoded string
1819  * bounded by pos and strend */
1820 #define getSB_VAL_UTF8(pos, strend)                                            \
1821     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1822
1823 /* Returns the WB value for the input code point */
1824 #define getWB_VAL_CP(cp)                                                       \
1825           _generic_GET_BREAK_VAL_CP(                                           \
1826                                     PL_WB_invlist,                             \
1827                                     _Perl_WB_invmap,                         \
1828                                     (cp))
1829
1830 /* Returns the WB value for the first code point in the UTF-8 encoded string
1831  * bounded by pos and strend */
1832 #define getWB_VAL_UTF8(pos, strend)                                            \
1833     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1834
1835 /* We know what class REx starts with.  Try to find this position... */
1836 /* if reginfo->intuit, its a dryrun */
1837 /* annoyingly all the vars in this routine have different names from their counterparts
1838    in regmatch. /grrr */
1839 STATIC char *
1840 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1841     const char *strend, regmatch_info *reginfo)
1842 {
1843     dVAR;
1844     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1845     char *pat_string;   /* The pattern's exactish string */
1846     char *pat_end;          /* ptr to end char of pat_string */
1847     re_fold_t folder;   /* Function for computing non-utf8 folds */
1848     const U8 *fold_array;   /* array for folding ords < 256 */
1849     STRLEN ln;
1850     STRLEN lnc;
1851     U8 c1;
1852     U8 c2;
1853     char *e;
1854     I32 tmp = 1;        /* Scratch variable? */
1855     const bool utf8_target = reginfo->is_utf8_target;
1856     UV utf8_fold_flags = 0;
1857     const bool is_utf8_pat = reginfo->is_utf8_pat;
1858     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1859                                    with a result inverts that result, as 0^1 =
1860                                    1 and 1^1 = 0 */
1861     _char_class_number classnum;
1862
1863     RXi_GET_DECL(prog,progi);
1864
1865     PERL_ARGS_ASSERT_FIND_BYCLASS;
1866
1867     /* We know what class it must start with. */
1868     switch (OP(c)) {
1869     case ANYOFL:
1870         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1871
1872         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
1873             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1874         }
1875
1876         /* FALLTHROUGH */
1877     case ANYOFD:
1878     case ANYOF:
1879         if (utf8_target) {
1880             REXEC_FBC_UTF8_CLASS_SCAN(
1881                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1882         }
1883         else {
1884             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s, 0));
1885         }
1886         break;
1887
1888     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1889         assert(! is_utf8_pat);
1890         /* FALLTHROUGH */
1891     case EXACTFA:
1892         if (is_utf8_pat || utf8_target) {
1893             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1894             goto do_exactf_utf8;
1895         }
1896         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1897         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1898         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1899
1900     case EXACTF:   /* This node only generated for non-utf8 patterns */
1901         assert(! is_utf8_pat);
1902         if (utf8_target) {
1903             utf8_fold_flags = 0;
1904             goto do_exactf_utf8;
1905         }
1906         fold_array = PL_fold;
1907         folder = foldEQ;
1908         goto do_exactf_non_utf8;
1909
1910     case EXACTFL:
1911         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1912         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1913             utf8_fold_flags = FOLDEQ_LOCALE;
1914             goto do_exactf_utf8;
1915         }
1916         fold_array = PL_fold_locale;
1917         folder = foldEQ_locale;
1918         goto do_exactf_non_utf8;
1919
1920     case EXACTFU_SS:
1921         if (is_utf8_pat) {
1922             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1923         }
1924         goto do_exactf_utf8;
1925
1926     case EXACTFLU8:
1927             if (! utf8_target) {    /* All code points in this node require
1928                                        UTF-8 to express.  */
1929                 break;
1930             }
1931             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1932                                              | FOLDEQ_S2_FOLDS_SANE;
1933             goto do_exactf_utf8;
1934
1935     case EXACTFU:
1936         if (is_utf8_pat || utf8_target) {
1937             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1938             goto do_exactf_utf8;
1939         }
1940
1941         /* Any 'ss' in the pattern should have been replaced by regcomp,
1942          * so we don't have to worry here about this single special case
1943          * in the Latin1 range */
1944         fold_array = PL_fold_latin1;
1945         folder = foldEQ_latin1;
1946
1947         /* FALLTHROUGH */
1948
1949       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1950                            are no glitches with fold-length differences
1951                            between the target string and pattern */
1952
1953         /* The idea in the non-utf8 EXACTF* cases is to first find the
1954          * first character of the EXACTF* node and then, if necessary,
1955          * case-insensitively compare the full text of the node.  c1 is the
1956          * first character.  c2 is its fold.  This logic will not work for
1957          * Unicode semantics and the german sharp ss, which hence should
1958          * not be compiled into a node that gets here. */
1959         pat_string = STRING(c);
1960         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1961
1962         /* We know that we have to match at least 'ln' bytes (which is the
1963          * same as characters, since not utf8).  If we have to match 3
1964          * characters, and there are only 2 availabe, we know without
1965          * trying that it will fail; so don't start a match past the
1966          * required minimum number from the far end */
1967         e = HOP3c(strend, -((SSize_t)ln), s);
1968
1969         if (reginfo->intuit && e < s) {
1970             e = s;                      /* Due to minlen logic of intuit() */
1971         }
1972
1973         c1 = *pat_string;
1974         c2 = fold_array[c1];
1975         if (c1 == c2) { /* If char and fold are the same */
1976             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1977         }
1978         else {
1979             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1980         }
1981         break;
1982
1983       do_exactf_utf8:
1984       {
1985         unsigned expansion;
1986
1987         /* If one of the operands is in utf8, we can't use the simpler folding
1988          * above, due to the fact that many different characters can have the
1989          * same fold, or portion of a fold, or different- length fold */
1990         pat_string = STRING(c);
1991         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1992         pat_end = pat_string + ln;
1993         lnc = is_utf8_pat       /* length to match in characters */
1994                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1995                 : ln;
1996
1997         /* We have 'lnc' characters to match in the pattern, but because of
1998          * multi-character folding, each character in the target can match
1999          * up to 3 characters (Unicode guarantees it will never exceed
2000          * this) if it is utf8-encoded; and up to 2 if not (based on the
2001          * fact that the Latin 1 folds are already determined, and the
2002          * only multi-char fold in that range is the sharp-s folding to
2003          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2004          * string character.  Adjust lnc accordingly, rounding up, so that
2005          * if we need to match at least 4+1/3 chars, that really is 5. */
2006         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2007         lnc = (lnc + expansion - 1) / expansion;
2008
2009         /* As in the non-UTF8 case, if we have to match 3 characters, and
2010          * only 2 are left, it's guaranteed to fail, so don't start a
2011          * match that would require us to go beyond the end of the string
2012          */
2013         e = HOP3c(strend, -((SSize_t)lnc), s);
2014
2015         if (reginfo->intuit && e < s) {
2016             e = s;                      /* Due to minlen logic of intuit() */
2017         }
2018
2019         /* XXX Note that we could recalculate e to stop the loop earlier,
2020          * as the worst case expansion above will rarely be met, and as we
2021          * go along we would usually find that e moves further to the left.
2022          * This would happen only after we reached the point in the loop
2023          * where if there were no expansion we should fail.  Unclear if
2024          * worth the expense */
2025
2026         while (s <= e) {
2027             char *my_strend= (char *)strend;
2028             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2029                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2030                 && (reginfo->intuit || regtry(reginfo, &s)) )
2031             {
2032                 goto got_it;
2033             }
2034             s += (utf8_target) ? UTF8SKIP(s) : 1;
2035         }
2036         break;
2037     }
2038
2039     case BOUNDL:
2040         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2041         if (FLAGS(c) != TRADITIONAL_BOUND) {
2042             if (! IN_UTF8_CTYPE_LOCALE) {
2043                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2044                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2045             }
2046             goto do_boundu;
2047         }
2048
2049         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2050         break;
2051
2052     case NBOUNDL:
2053         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2054         if (FLAGS(c) != TRADITIONAL_BOUND) {
2055             if (! IN_UTF8_CTYPE_LOCALE) {
2056                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2057                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2058             }
2059             goto do_nboundu;
2060         }
2061
2062         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2063         break;
2064
2065     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2066                    meaning */
2067         assert(FLAGS(c) == TRADITIONAL_BOUND);
2068
2069         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2070         break;
2071
2072     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2073                    meaning */
2074         assert(FLAGS(c) == TRADITIONAL_BOUND);
2075
2076         FBC_BOUND_A(isWORDCHAR_A);
2077         break;
2078
2079     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2080                    meaning */
2081         assert(FLAGS(c) == TRADITIONAL_BOUND);
2082
2083         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2084         break;
2085
2086     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2087                    meaning */
2088         assert(FLAGS(c) == TRADITIONAL_BOUND);
2089
2090         FBC_NBOUND_A(isWORDCHAR_A);
2091         break;
2092
2093     case NBOUNDU:
2094         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2095             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2096             break;
2097         }
2098
2099       do_nboundu:
2100
2101         to_complement = 1;
2102         /* FALLTHROUGH */
2103
2104     case BOUNDU:
2105       do_boundu:
2106         switch((bound_type) FLAGS(c)) {
2107             case TRADITIONAL_BOUND:
2108                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2109                 break;
2110             case GCB_BOUND:
2111                 if (s == reginfo->strbeg) {
2112                     if (reginfo->intuit || regtry(reginfo, &s))
2113                     {
2114                         goto got_it;
2115                     }
2116
2117                     /* Didn't match.  Try at the next position (if there is one) */
2118                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2119                     if (UNLIKELY(s >= reginfo->strend)) {
2120                         break;
2121                     }
2122                 }
2123
2124                 if (utf8_target) {
2125                     GCB_enum before = getGCB_VAL_UTF8(
2126                                                reghop3((U8*)s, -1,
2127                                                        (U8*)(reginfo->strbeg)),
2128                                                (U8*) reginfo->strend);
2129                     while (s < strend) {
2130                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2131                                                         (U8*) reginfo->strend);
2132                         if (   (to_complement ^ isGCB(before,
2133                                                       after,
2134                                                       (U8*) reginfo->strbeg,
2135                                                       (U8*) s,
2136                                                       utf8_target))
2137                             && (reginfo->intuit || regtry(reginfo, &s)))
2138                         {
2139                             goto got_it;
2140                         }
2141                         before = after;
2142                         s += UTF8SKIP(s);
2143                     }
2144                 }
2145                 else {  /* Not utf8.  Everything is a GCB except between CR and
2146                            LF */
2147                     while (s < strend) {
2148                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2149                                               || UCHARAT(s) != '\n'))
2150                             && (reginfo->intuit || regtry(reginfo, &s)))
2151                         {
2152                             goto got_it;
2153                         }
2154                         s++;
2155                     }
2156                 }
2157
2158                 /* And, since this is a bound, it can match after the final
2159                  * character in the string */
2160                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2161                     goto got_it;
2162                 }
2163                 break;
2164
2165             case LB_BOUND:
2166                 if (s == reginfo->strbeg) {
2167                     if (reginfo->intuit || regtry(reginfo, &s)) {
2168                         goto got_it;
2169                     }
2170                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2171                     if (UNLIKELY(s >= reginfo->strend)) {
2172                         break;
2173                     }
2174                 }
2175
2176                 if (utf8_target) {
2177                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2178                                                                -1,
2179                                                                (U8*)(reginfo->strbeg)),
2180                                                        (U8*) reginfo->strend);
2181                     while (s < strend) {
2182                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2183                         if (to_complement ^ isLB(before,
2184                                                  after,
2185                                                  (U8*) reginfo->strbeg,
2186                                                  (U8*) s,
2187                                                  (U8*) reginfo->strend,
2188                                                  utf8_target)
2189                             && (reginfo->intuit || regtry(reginfo, &s)))
2190                         {
2191                             goto got_it;
2192                         }
2193                         before = after;
2194                         s += UTF8SKIP(s);
2195                     }
2196                 }
2197                 else {  /* Not utf8. */
2198                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2199                     while (s < strend) {
2200                         LB_enum after = getLB_VAL_CP((U8) *s);
2201                         if (to_complement ^ isLB(before,
2202                                                  after,
2203                                                  (U8*) reginfo->strbeg,
2204                                                  (U8*) s,
2205                                                  (U8*) reginfo->strend,
2206                                                  utf8_target)
2207                             && (reginfo->intuit || regtry(reginfo, &s)))
2208                         {
2209                             goto got_it;
2210                         }
2211                         before = after;
2212                         s++;
2213                     }
2214                 }
2215
2216                 if (reginfo->intuit || regtry(reginfo, &s)) {
2217                     goto got_it;
2218                 }
2219
2220                 break;
2221
2222             case SB_BOUND:
2223                 if (s == reginfo->strbeg) {
2224                     if (reginfo->intuit || regtry(reginfo, &s)) {
2225                         goto got_it;
2226                     }
2227                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2228                     if (UNLIKELY(s >= reginfo->strend)) {
2229                         break;
2230                     }
2231                 }
2232
2233                 if (utf8_target) {
2234                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2235                                                         -1,
2236                                                         (U8*)(reginfo->strbeg)),
2237                                                       (U8*) reginfo->strend);
2238                     while (s < strend) {
2239                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2240                                                          (U8*) reginfo->strend);
2241                         if ((to_complement ^ isSB(before,
2242                                                   after,
2243                                                   (U8*) reginfo->strbeg,
2244                                                   (U8*) s,
2245                                                   (U8*) reginfo->strend,
2246                                                   utf8_target))
2247                             && (reginfo->intuit || regtry(reginfo, &s)))
2248                         {
2249                             goto got_it;
2250                         }
2251                         before = after;
2252                         s += UTF8SKIP(s);
2253                     }
2254                 }
2255                 else {  /* Not utf8. */
2256                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2257                     while (s < strend) {
2258                         SB_enum after = getSB_VAL_CP((U8) *s);
2259                         if ((to_complement ^ isSB(before,
2260                                                   after,
2261                                                   (U8*) reginfo->strbeg,
2262                                                   (U8*) s,
2263                                                   (U8*) reginfo->strend,
2264                                                   utf8_target))
2265                             && (reginfo->intuit || regtry(reginfo, &s)))
2266                         {
2267                             goto got_it;
2268                         }
2269                         before = after;
2270                         s++;
2271                     }
2272                 }
2273
2274                 /* Here are at the final position in the target string.  The SB
2275                  * value is always true here, so matches, depending on other
2276                  * constraints */
2277                 if (reginfo->intuit || regtry(reginfo, &s)) {
2278                     goto got_it;
2279                 }
2280
2281                 break;
2282
2283             case WB_BOUND:
2284                 if (s == reginfo->strbeg) {
2285                     if (reginfo->intuit || regtry(reginfo, &s)) {
2286                         goto got_it;
2287                     }
2288                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2289                     if (UNLIKELY(s >= reginfo->strend)) {
2290                         break;
2291                     }
2292                 }
2293
2294                 if (utf8_target) {
2295                     /* We are at a boundary between char_sub_0 and char_sub_1.
2296                      * We also keep track of the value for char_sub_-1 as we
2297                      * loop through the line.   Context may be needed to make a
2298                      * determination, and if so, this can save having to
2299                      * recalculate it */
2300                     WB_enum previous = WB_UNKNOWN;
2301                     WB_enum before = getWB_VAL_UTF8(
2302                                               reghop3((U8*)s,
2303                                                       -1,
2304                                                       (U8*)(reginfo->strbeg)),
2305                                               (U8*) reginfo->strend);
2306                     while (s < strend) {
2307                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2308                                                         (U8*) reginfo->strend);
2309                         if ((to_complement ^ isWB(previous,
2310                                                   before,
2311                                                   after,
2312                                                   (U8*) reginfo->strbeg,
2313                                                   (U8*) s,
2314                                                   (U8*) reginfo->strend,
2315                                                   utf8_target))
2316                             && (reginfo->intuit || regtry(reginfo, &s)))
2317                         {
2318                             goto got_it;
2319                         }
2320                         previous = before;
2321                         before = after;
2322                         s += UTF8SKIP(s);
2323                     }
2324                 }
2325                 else {  /* Not utf8. */
2326                     WB_enum previous = WB_UNKNOWN;
2327                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2328                     while (s < strend) {
2329                         WB_enum after = getWB_VAL_CP((U8) *s);
2330                         if ((to_complement ^ isWB(previous,
2331                                                   before,
2332                                                   after,
2333                                                   (U8*) reginfo->strbeg,
2334                                                   (U8*) s,
2335                                                   (U8*) reginfo->strend,
2336                                                   utf8_target))
2337                             && (reginfo->intuit || regtry(reginfo, &s)))
2338                         {
2339                             goto got_it;
2340                         }
2341                         previous = before;
2342                         before = after;
2343                         s++;
2344                     }
2345                 }
2346
2347                 if (reginfo->intuit || regtry(reginfo, &s)) {
2348                     goto got_it;
2349                 }
2350         }
2351         break;
2352
2353     case LNBREAK:
2354         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2355                         is_LNBREAK_latin1_safe(s, strend)
2356         );
2357         break;
2358
2359     /* The argument to all the POSIX node types is the class number to pass to
2360      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2361
2362     case NPOSIXL:
2363         to_complement = 1;
2364         /* FALLTHROUGH */
2365
2366     case POSIXL:
2367         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2368         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2369                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2370         break;
2371
2372     case NPOSIXD:
2373         to_complement = 1;
2374         /* FALLTHROUGH */
2375
2376     case POSIXD:
2377         if (utf8_target) {
2378             goto posix_utf8;
2379         }
2380         goto posixa;
2381
2382     case NPOSIXA:
2383         if (utf8_target) {
2384             /* The complement of something that matches only ASCII matches all
2385              * non-ASCII, plus everything in ASCII that isn't in the class. */
2386             REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2387                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2388             break;
2389         }
2390
2391         to_complement = 1;
2392         /* FALLTHROUGH */
2393
2394     case POSIXA:
2395       posixa:
2396         /* Don't need to worry about utf8, as it can match only a single
2397          * byte invariant character. */
2398         REXEC_FBC_CLASS_SCAN(
2399                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2400         break;
2401
2402     case NPOSIXU:
2403         to_complement = 1;
2404         /* FALLTHROUGH */
2405
2406     case POSIXU:
2407         if (! utf8_target) {
2408             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2409                                                                     FLAGS(c))));
2410         }
2411         else {
2412
2413           posix_utf8:
2414             classnum = (_char_class_number) FLAGS(c);
2415             if (classnum < _FIRST_NON_SWASH_CC) {
2416                 while (s < strend) {
2417
2418                     /* We avoid loading in the swash as long as possible, but
2419                      * should we have to, we jump to a separate loop.  This
2420                      * extra 'if' statement is what keeps this code from being
2421                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2422                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2423                         goto found_above_latin1;
2424                     }
2425                     if ((UTF8_IS_INVARIANT(*s)
2426                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2427                                                                 classnum)))
2428                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
2429                             && to_complement ^ cBOOL(
2430                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2431                                                                       *(s + 1)),
2432                                               classnum))))
2433                     {
2434                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2435                             goto got_it;
2436                         else {
2437                             tmp = doevery;
2438                         }
2439                     }
2440                     else {
2441                         tmp = 1;
2442                     }
2443                     s += UTF8SKIP(s);
2444                 }
2445             }
2446             else switch (classnum) {    /* These classes are implemented as
2447                                            macros */
2448                 case _CC_ENUM_SPACE:
2449                     REXEC_FBC_UTF8_CLASS_SCAN(
2450                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
2451                     break;
2452
2453                 case _CC_ENUM_BLANK:
2454                     REXEC_FBC_UTF8_CLASS_SCAN(
2455                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
2456                     break;
2457
2458                 case _CC_ENUM_XDIGIT:
2459                     REXEC_FBC_UTF8_CLASS_SCAN(
2460                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2461                     break;
2462
2463                 case _CC_ENUM_VERTSPACE:
2464                     REXEC_FBC_UTF8_CLASS_SCAN(
2465                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
2466                     break;
2467
2468                 case _CC_ENUM_CNTRL:
2469                     REXEC_FBC_UTF8_CLASS_SCAN(
2470                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
2471                     break;
2472
2473                 default:
2474                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2475                     NOT_REACHED; /* NOTREACHED */
2476             }
2477         }
2478         break;
2479
2480       found_above_latin1:   /* Here we have to load a swash to get the result
2481                                for the current code point */
2482         if (! PL_utf8_swash_ptrs[classnum]) {
2483             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2484             PL_utf8_swash_ptrs[classnum] =
2485                     _core_swash_init("utf8",
2486                                      "",
2487                                      &PL_sv_undef, 1, 0,
2488                                      PL_XPosix_ptrs[classnum], &flags);
2489         }
2490
2491         /* This is a copy of the loop above for swash classes, though using the
2492          * FBC macro instead of being expanded out.  Since we've loaded the
2493          * swash, we don't have to check for that each time through the loop */
2494         REXEC_FBC_UTF8_CLASS_SCAN(
2495                 to_complement ^ cBOOL(_generic_utf8(
2496                                       classnum,
2497                                       s,
2498                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2499                                                   (U8 *) s, TRUE))));
2500         break;
2501
2502     case AHOCORASICKC:
2503     case AHOCORASICK:
2504         {
2505             DECL_TRIE_TYPE(c);
2506             /* what trie are we using right now */
2507             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2508             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2509             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2510
2511             const char *last_start = strend - trie->minlen;
2512 #ifdef DEBUGGING
2513             const char *real_start = s;
2514 #endif
2515             STRLEN maxlen = trie->maxlen;
2516             SV *sv_points;
2517             U8 **points; /* map of where we were in the input string
2518                             when reading a given char. For ASCII this
2519                             is unnecessary overhead as the relationship
2520                             is always 1:1, but for Unicode, especially
2521                             case folded Unicode this is not true. */
2522             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2523             U8 *bitmap=NULL;
2524
2525
2526             GET_RE_DEBUG_FLAGS_DECL;
2527
2528             /* We can't just allocate points here. We need to wrap it in
2529              * an SV so it gets freed properly if there is a croak while
2530              * running the match */
2531             ENTER;
2532             SAVETMPS;
2533             sv_points=newSV(maxlen * sizeof(U8 *));
2534             SvCUR_set(sv_points,
2535                 maxlen * sizeof(U8 *));
2536             SvPOK_on(sv_points);
2537             sv_2mortal(sv_points);
2538             points=(U8**)SvPV_nolen(sv_points );
2539             if ( trie_type != trie_utf8_fold
2540                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2541             {
2542                 if (trie->bitmap)
2543                     bitmap=(U8*)trie->bitmap;
2544                 else
2545                     bitmap=(U8*)ANYOF_BITMAP(c);
2546             }
2547             /* this is the Aho-Corasick algorithm modified a touch
2548                to include special handling for long "unknown char" sequences.
2549                The basic idea being that we use AC as long as we are dealing
2550                with a possible matching char, when we encounter an unknown char
2551                (and we have not encountered an accepting state) we scan forward
2552                until we find a legal starting char.
2553                AC matching is basically that of trie matching, except that when
2554                we encounter a failing transition, we fall back to the current
2555                states "fail state", and try the current char again, a process
2556                we repeat until we reach the root state, state 1, or a legal
2557                transition. If we fail on the root state then we can either
2558                terminate if we have reached an accepting state previously, or
2559                restart the entire process from the beginning if we have not.
2560
2561              */
2562             while (s <= last_start) {
2563                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2564                 U8 *uc = (U8*)s;
2565                 U16 charid = 0;
2566                 U32 base = 1;
2567                 U32 state = 1;
2568                 UV uvc = 0;
2569                 STRLEN len = 0;
2570                 STRLEN foldlen = 0;
2571                 U8 *uscan = (U8*)NULL;
2572                 U8 *leftmost = NULL;
2573 #ifdef DEBUGGING
2574                 U32 accepted_word= 0;
2575 #endif
2576                 U32 pointpos = 0;
2577
2578                 while ( state && uc <= (U8*)strend ) {
2579                     int failed=0;
2580                     U32 word = aho->states[ state ].wordnum;
2581
2582                     if( state==1 ) {
2583                         if ( bitmap ) {
2584                             DEBUG_TRIE_EXECUTE_r(
2585                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2586                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2587                                         (char *)uc, utf8_target, 0 );
2588                                     Perl_re_printf( aTHX_
2589                                         " Scanning for legal start char...\n");
2590                                 }
2591                             );
2592                             if (utf8_target) {
2593                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2594                                     uc += UTF8SKIP(uc);
2595                                 }
2596                             } else {
2597                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2598                                     uc++;
2599                                 }
2600                             }
2601                             s= (char *)uc;
2602                         }
2603                         if (uc >(U8*)last_start) break;
2604                     }
2605
2606                     if ( word ) {
2607                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2608                         if (!leftmost || lpos < leftmost) {
2609                             DEBUG_r(accepted_word=word);
2610                             leftmost= lpos;
2611                         }
2612                         if (base==0) break;
2613
2614                     }
2615                     points[pointpos++ % maxlen]= uc;
2616                     if (foldlen || uc < (U8*)strend) {
2617                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2618                                          widecharmap, uc,
2619                                          uscan, len, uvc, charid, foldlen,
2620                                          foldbuf, uniflags);
2621                         DEBUG_TRIE_EXECUTE_r({
2622                             dump_exec_pos( (char *)uc, c, strend,
2623                                         real_start, s, utf8_target, 0);
2624                             Perl_re_printf( aTHX_
2625                                 " Charid:%3u CP:%4"UVxf" ",
2626                                  charid, uvc);
2627                         });
2628                     }
2629                     else {
2630                         len = 0;
2631                         charid = 0;
2632                     }
2633
2634
2635                     do {
2636 #ifdef DEBUGGING
2637                         word = aho->states[ state ].wordnum;
2638 #endif
2639                         base = aho->states[ state ].trans.base;
2640
2641                         DEBUG_TRIE_EXECUTE_r({
2642                             if (failed)
2643                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2644                                     s,   utf8_target, 0 );
2645                             Perl_re_printf( aTHX_
2646                                 "%sState: %4"UVxf", word=%"UVxf,
2647                                 failed ? " Fail transition to " : "",
2648                                 (UV)state, (UV)word);
2649                         });
2650                         if ( base ) {
2651                             U32 tmp;
2652                             I32 offset;
2653                             if (charid &&
2654                                  ( ((offset = base + charid
2655                                     - 1 - trie->uniquecharcount)) >= 0)
2656                                  && ((U32)offset < trie->lasttrans)
2657                                  && trie->trans[offset].check == state
2658                                  && (tmp=trie->trans[offset].next))
2659                             {
2660                                 DEBUG_TRIE_EXECUTE_r(
2661                                     Perl_re_printf( aTHX_ " - legal\n"));
2662                                 state = tmp;
2663                                 break;
2664                             }
2665                             else {
2666                                 DEBUG_TRIE_EXECUTE_r(
2667                                     Perl_re_printf( aTHX_ " - fail\n"));
2668                                 failed = 1;
2669                                 state = aho->fail[state];
2670                             }
2671                         }
2672                         else {
2673                             /* we must be accepting here */
2674                             DEBUG_TRIE_EXECUTE_r(
2675                                     Perl_re_printf( aTHX_ " - accepting\n"));
2676                             failed = 1;
2677                             break;
2678                         }
2679                     } while(state);
2680                     uc += len;
2681                     if (failed) {
2682                         if (leftmost)
2683                             break;
2684                         if (!state) state = 1;
2685                     }
2686                 }
2687                 if ( aho->states[ state ].wordnum ) {
2688                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2689                     if (!leftmost || lpos < leftmost) {
2690                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2691                         leftmost = lpos;
2692                     }
2693                 }
2694                 if (leftmost) {
2695                     s = (char*)leftmost;
2696                     DEBUG_TRIE_EXECUTE_r({
2697                         Perl_re_printf( aTHX_  "Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2698                             (UV)accepted_word, (IV)(s - real_start)
2699                         );
2700                     });
2701                     if (reginfo->intuit || regtry(reginfo, &s)) {
2702                         FREETMPS;
2703                         LEAVE;
2704                         goto got_it;
2705                     }
2706                     s = HOPc(s,1);
2707                     DEBUG_TRIE_EXECUTE_r({
2708                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2709                     });
2710                 } else {
2711                     DEBUG_TRIE_EXECUTE_r(
2712                         Perl_re_printf( aTHX_ "No match.\n"));
2713                     break;
2714                 }
2715             }
2716             FREETMPS;
2717             LEAVE;
2718         }
2719         break;
2720     default:
2721         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2722     }
2723     return 0;
2724   got_it:
2725     return s;
2726 }
2727
2728 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2729  * flags have same meanings as with regexec_flags() */
2730
2731 static void
2732 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2733                             char *strbeg,
2734                             char *strend,
2735                             SV *sv,
2736                             U32 flags,
2737                             bool utf8_target)
2738 {
2739     struct regexp *const prog = ReANY(rx);
2740
2741     if (flags & REXEC_COPY_STR) {
2742 #ifdef PERL_ANY_COW
2743         if (SvCANCOW(sv)) {
2744             DEBUG_C(Perl_re_printf( aTHX_
2745                               "Copy on write: regexp capture, type %d\n",
2746                                     (int) SvTYPE(sv)));
2747             /* Create a new COW SV to share the match string and store
2748              * in saved_copy, unless the current COW SV in saved_copy
2749              * is valid and suitable for our purpose */
2750             if ((   prog->saved_copy
2751                  && SvIsCOW(prog->saved_copy)
2752                  && SvPOKp(prog->saved_copy)
2753                  && SvIsCOW(sv)
2754                  && SvPOKp(sv)
2755                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2756             {
2757                 /* just reuse saved_copy SV */
2758                 if (RXp_MATCH_COPIED(prog)) {
2759                     Safefree(prog->subbeg);
2760                     RXp_MATCH_COPIED_off(prog);
2761                 }
2762             }
2763             else {
2764                 /* create new COW SV to share string */
2765                 RX_MATCH_COPY_FREE(rx);
2766                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2767             }
2768             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2769             assert (SvPOKp(prog->saved_copy));
2770             prog->sublen  = strend - strbeg;
2771             prog->suboffset = 0;
2772             prog->subcoffset = 0;
2773         } else
2774 #endif
2775         {
2776             SSize_t min = 0;
2777             SSize_t max = strend - strbeg;
2778             SSize_t sublen;
2779
2780             if (    (flags & REXEC_COPY_SKIP_POST)
2781                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2782                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2783             ) { /* don't copy $' part of string */
2784                 U32 n = 0;
2785                 max = -1;
2786                 /* calculate the right-most part of the string covered
2787                  * by a capture. Due to lookahead, this may be to
2788                  * the right of $&, so we have to scan all captures */
2789                 while (n <= prog->lastparen) {
2790                     if (prog->offs[n].end > max)
2791                         max = prog->offs[n].end;
2792                     n++;
2793                 }
2794                 if (max == -1)
2795                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2796                             ? prog->offs[0].start
2797                             : 0;
2798                 assert(max >= 0 && max <= strend - strbeg);
2799             }
2800
2801             if (    (flags & REXEC_COPY_SKIP_PRE)
2802                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2803                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2804             ) { /* don't copy $` part of string */
2805                 U32 n = 0;
2806                 min = max;
2807                 /* calculate the left-most part of the string covered
2808                  * by a capture. Due to lookbehind, this may be to
2809                  * the left of $&, so we have to scan all captures */
2810                 while (min && n <= prog->lastparen) {
2811                     if (   prog->offs[n].start != -1
2812                         && prog->offs[n].start < min)
2813                     {
2814                         min = prog->offs[n].start;
2815                     }
2816                     n++;
2817                 }
2818                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2819                     && min >  prog->offs[0].end
2820                 )
2821                     min = prog->offs[0].end;
2822
2823             }
2824
2825             assert(min >= 0 && min <= max && min <= strend - strbeg);
2826             sublen = max - min;
2827
2828             if (RX_MATCH_COPIED(rx)) {
2829                 if (sublen > prog->sublen)
2830                     prog->subbeg =
2831                             (char*)saferealloc(prog->subbeg, sublen+1);
2832             }
2833             else
2834                 prog->subbeg = (char*)safemalloc(sublen+1);
2835             Copy(strbeg + min, prog->subbeg, sublen, char);
2836             prog->subbeg[sublen] = '\0';
2837             prog->suboffset = min;
2838             prog->sublen = sublen;
2839             RX_MATCH_COPIED_on(rx);
2840         }
2841         prog->subcoffset = prog->suboffset;
2842         if (prog->suboffset && utf8_target) {
2843             /* Convert byte offset to chars.
2844              * XXX ideally should only compute this if @-/@+
2845              * has been seen, a la PL_sawampersand ??? */
2846
2847             /* If there's a direct correspondence between the
2848              * string which we're matching and the original SV,
2849              * then we can use the utf8 len cache associated with
2850              * the SV. In particular, it means that under //g,
2851              * sv_pos_b2u() will use the previously cached
2852              * position to speed up working out the new length of
2853              * subcoffset, rather than counting from the start of
2854              * the string each time. This stops
2855              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2856              * from going quadratic */
2857             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2858                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2859                                                 SV_GMAGIC|SV_CONST_RETURN);
2860             else
2861                 prog->subcoffset = utf8_length((U8*)strbeg,
2862                                     (U8*)(strbeg+prog->suboffset));
2863         }
2864     }
2865     else {
2866         RX_MATCH_COPY_FREE(rx);
2867         prog->subbeg = strbeg;
2868         prog->suboffset = 0;
2869         prog->subcoffset = 0;
2870         prog->sublen = strend - strbeg;
2871     }
2872 }
2873
2874
2875
2876
2877 /*
2878  - regexec_flags - match a regexp against a string
2879  */
2880 I32
2881 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2882               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2883 /* stringarg: the point in the string at which to begin matching */
2884 /* strend:    pointer to null at end of string */
2885 /* strbeg:    real beginning of string */
2886 /* minend:    end of match must be >= minend bytes after stringarg. */
2887 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2888  *            itself is accessed via the pointers above */
2889 /* data:      May be used for some additional optimizations.
2890               Currently unused. */
2891 /* flags:     For optimizations. See REXEC_* in regexp.h */
2892
2893 {
2894     struct regexp *const prog = ReANY(rx);
2895     char *s;
2896     regnode *c;
2897     char *startpos;
2898     SSize_t minlen;             /* must match at least this many chars */
2899     SSize_t dontbother = 0;     /* how many characters not to try at end */
2900     const bool utf8_target = cBOOL(DO_UTF8(sv));
2901     I32 multiline;
2902     RXi_GET_DECL(prog,progi);
2903     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2904     regmatch_info *const reginfo = &reginfo_buf;
2905     regexp_paren_pair *swap = NULL;
2906     I32 oldsave;
2907     GET_RE_DEBUG_FLAGS_DECL;
2908
2909     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2910     PERL_UNUSED_ARG(data);
2911
2912     /* Be paranoid... */
2913     if (prog == NULL) {
2914         Perl_croak(aTHX_ "NULL regexp parameter");
2915     }
2916
2917     DEBUG_EXECUTE_r(
2918         debug_start_match(rx, utf8_target, stringarg, strend,
2919         "Matching");
2920     );
2921
2922     startpos = stringarg;
2923
2924     /* set these early as they may be used by the HOP macros below */
2925     reginfo->strbeg = strbeg;
2926     reginfo->strend = strend;
2927     reginfo->is_utf8_target = cBOOL(utf8_target);
2928
2929     if (prog->intflags & PREGf_GPOS_SEEN) {
2930         MAGIC *mg;
2931
2932         /* set reginfo->ganch, the position where \G can match */
2933
2934         reginfo->ganch =
2935             (flags & REXEC_IGNOREPOS)
2936             ? stringarg /* use start pos rather than pos() */
2937             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2938               /* Defined pos(): */
2939             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2940             : strbeg; /* pos() not defined; use start of string */
2941
2942         DEBUG_GPOS_r(Perl_re_printf( aTHX_
2943             "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2944
2945         /* in the presence of \G, we may need to start looking earlier in
2946          * the string than the suggested start point of stringarg:
2947          * if prog->gofs is set, then that's a known, fixed minimum
2948          * offset, such as
2949          * /..\G/:   gofs = 2
2950          * /ab|c\G/: gofs = 1
2951          * or if the minimum offset isn't known, then we have to go back
2952          * to the start of the string, e.g. /w+\G/
2953          */
2954
2955         if (prog->intflags & PREGf_ANCH_GPOS) {
2956             if (prog->gofs) {
2957                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
2958                 if (!startpos ||
2959                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
2960                 {
2961                     DEBUG_r(Perl_re_printf( aTHX_
2962                             "fail: ganch-gofs before earliest possible start\n"));
2963                     return 0;
2964                 }
2965             }
2966             else
2967                 startpos = reginfo->ganch;
2968         }
2969         else if (prog->gofs) {
2970             startpos = HOPBACKc(startpos, prog->gofs);
2971             if (!startpos)
2972                 startpos = strbeg;
2973         }
2974         else if (prog->intflags & PREGf_GPOS_FLOAT)
2975             startpos = strbeg;
2976     }
2977
2978     minlen = prog->minlen;
2979     if ((startpos + minlen) > strend || startpos < strbeg) {
2980         DEBUG_r(Perl_re_printf( aTHX_
2981                     "Regex match can't succeed, so not even tried\n"));
2982         return 0;
2983     }
2984
2985     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2986      * which will call destuctors to reset PL_regmatch_state, free higher
2987      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2988      * regmatch_info_aux_eval */
2989
2990     oldsave = PL_savestack_ix;
2991
2992     s = startpos;
2993
2994     if ((prog->extflags & RXf_USE_INTUIT)
2995         && !(flags & REXEC_CHECKED))
2996     {
2997         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2998                                     flags, NULL);
2999         if (!s)
3000             return 0;
3001
3002         if (prog->extflags & RXf_CHECK_ALL) {
3003             /* we can match based purely on the result of INTUIT.
3004              * Set up captures etc just for $& and $-[0]
3005              * (an intuit-only match wont have $1,$2,..) */
3006             assert(!prog->nparens);
3007
3008             /* s/// doesn't like it if $& is earlier than where we asked it to
3009              * start searching (which can happen on something like /.\G/) */
3010             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3011                     && (s < stringarg))
3012             {
3013                 /* this should only be possible under \G */
3014                 assert(prog->intflags & PREGf_GPOS_SEEN);
3015                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3016                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3017                 goto phooey;
3018             }
3019
3020             /* match via INTUIT shouldn't have any captures.
3021              * Let @-, @+, $^N know */
3022             prog->lastparen = prog->lastcloseparen = 0;
3023             RX_MATCH_UTF8_set(rx, utf8_target);
3024             prog->offs[0].start = s - strbeg;
3025             prog->offs[0].end = utf8_target
3026                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3027                 : s - strbeg + prog->minlenret;
3028             if ( !(flags & REXEC_NOT_FIRST) )
3029                 S_reg_set_capture_string(aTHX_ rx,
3030                                         strbeg, strend,
3031                                         sv, flags, utf8_target);
3032
3033             return 1;
3034         }
3035     }
3036
3037     multiline = prog->extflags & RXf_PMf_MULTILINE;
3038
3039     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3040         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3041                               "String too short [regexec_flags]...\n"));
3042         goto phooey;
3043     }
3044
3045     /* Check validity of program. */
3046     if (UCHARAT(progi->program) != REG_MAGIC) {
3047         Perl_croak(aTHX_ "corrupted regexp program");
3048     }
3049
3050     RX_MATCH_TAINTED_off(rx);
3051     RX_MATCH_UTF8_set(rx, utf8_target);
3052
3053     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3054     reginfo->intuit = 0;
3055     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3056     reginfo->warned = FALSE;
3057     reginfo->sv = sv;
3058     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3059     /* see how far we have to get to not match where we matched before */
3060     reginfo->till = stringarg + minend;
3061
3062     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3063         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3064            S_cleanup_regmatch_info_aux has executed (registered by
3065            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3066            magic belonging to this SV.
3067            Not newSVsv, either, as it does not COW.
3068         */
3069         reginfo->sv = newSV(0);
3070         SvSetSV_nosteal(reginfo->sv, sv);
3071         SAVEFREESV(reginfo->sv);
3072     }
3073
3074     /* reserve next 2 or 3 slots in PL_regmatch_state:
3075      * slot N+0: may currently be in use: skip it
3076      * slot N+1: use for regmatch_info_aux struct
3077      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3078      * slot N+3: ready for use by regmatch()
3079      */
3080
3081     {
3082         regmatch_state *old_regmatch_state;
3083         regmatch_slab  *old_regmatch_slab;
3084         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3085
3086         /* on first ever match, allocate first slab */
3087         if (!PL_regmatch_slab) {
3088             Newx(PL_regmatch_slab, 1, regmatch_slab);
3089             PL_regmatch_slab->prev = NULL;
3090             PL_regmatch_slab->next = NULL;
3091             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3092         }
3093
3094         old_regmatch_state = PL_regmatch_state;
3095         old_regmatch_slab  = PL_regmatch_slab;
3096
3097         for (i=0; i <= max; i++) {
3098             if (i == 1)
3099                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3100             else if (i ==2)
3101                 reginfo->info_aux_eval =
3102                 reginfo->info_aux->info_aux_eval =
3103                             &(PL_regmatch_state->u.info_aux_eval);
3104
3105             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3106                 PL_regmatch_state = S_push_slab(aTHX);
3107         }
3108
3109         /* note initial PL_regmatch_state position; at end of match we'll
3110          * pop back to there and free any higher slabs */
3111
3112         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3113         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3114         reginfo->info_aux->poscache = NULL;
3115
3116         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3117
3118         if ((prog->extflags & RXf_EVAL_SEEN))
3119             S_setup_eval_state(aTHX_ reginfo);
3120         else
3121             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3122     }
3123
3124     /* If there is a "must appear" string, look for it. */
3125
3126     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3127         /* We have to be careful. If the previous successful match
3128            was from this regex we don't want a subsequent partially
3129            successful match to clobber the old results.
3130            So when we detect this possibility we add a swap buffer
3131            to the re, and switch the buffer each match. If we fail,
3132            we switch it back; otherwise we leave it swapped.
3133         */
3134         swap = prog->offs;
3135         /* do we need a save destructor here for eval dies? */
3136         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3137         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3138             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3139             0,
3140             PTR2UV(prog),
3141             PTR2UV(swap),
3142             PTR2UV(prog->offs)
3143         ));
3144     }
3145
3146     if (prog->recurse_locinput)
3147         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3148
3149     /* Simplest case: anchored match need be tried only once, or with
3150      * MBOL, only at the beginning of each line.
3151      *
3152      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3153      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3154      * match at the start of the string then it won't match anywhere else
3155      * either; while with /.*.../, if it doesn't match at the beginning,
3156      * the earliest it could match is at the start of the next line */
3157
3158     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3159         char *end;
3160
3161         if (regtry(reginfo, &s))
3162             goto got_it;
3163
3164         if (!(prog->intflags & PREGf_ANCH_MBOL))
3165             goto phooey;
3166
3167         /* didn't match at start, try at other newline positions */
3168
3169         if (minlen)
3170             dontbother = minlen - 1;
3171         end = HOP3c(strend, -dontbother, strbeg) - 1;
3172
3173         /* skip to next newline */
3174
3175         while (s <= end) { /* note it could be possible to match at the end of the string */
3176             /* NB: newlines are the same in unicode as they are in latin */
3177             if (*s++ != '\n')
3178                 continue;
3179             if (prog->check_substr || prog->check_utf8) {
3180             /* note that with PREGf_IMPLICIT, intuit can only fail
3181              * or return the start position, so it's of limited utility.
3182              * Nevertheless, I made the decision that the potential for
3183              * quick fail was still worth it - DAPM */
3184                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3185                 if (!s)
3186                     goto phooey;
3187             }
3188             if (regtry(reginfo, &s))
3189                 goto got_it;
3190         }
3191         goto phooey;
3192     } /* end anchored search */
3193
3194     if (prog->intflags & PREGf_ANCH_GPOS)
3195     {
3196         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3197         assert(prog->intflags & PREGf_GPOS_SEEN);
3198         /* For anchored \G, the only position it can match from is
3199          * (ganch-gofs); we already set startpos to this above; if intuit
3200          * moved us on from there, we can't possibly succeed */
3201         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3202         if (s == startpos && regtry(reginfo, &s))
3203             goto got_it;
3204         goto phooey;
3205     }
3206
3207     /* Messy cases:  unanchored match. */
3208     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3209         /* we have /x+whatever/ */
3210         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3211         char ch;
3212 #ifdef DEBUGGING
3213         int did_match = 0;
3214 #endif
3215         if (utf8_target) {
3216             if (! prog->anchored_utf8) {
3217                 to_utf8_substr(prog);
3218             }
3219             ch = SvPVX_const(prog->anchored_utf8)[0];
3220             REXEC_FBC_SCAN(
3221                 if (*s == ch) {
3222                     DEBUG_EXECUTE_r( did_match = 1 );
3223                     if (regtry(reginfo, &s)) goto got_it;
3224                     s += UTF8SKIP(s);
3225                     while (s < strend && *s == ch)
3226                         s += UTF8SKIP(s);
3227                 }
3228             );
3229
3230         }
3231         else {
3232             if (! prog->anchored_substr) {
3233                 if (! to_byte_substr(prog)) {
3234                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3235                 }
3236             }
3237             ch = SvPVX_const(prog->anchored_substr)[0];
3238             REXEC_FBC_SCAN(
3239                 if (*s == ch) {
3240                     DEBUG_EXECUTE_r( did_match = 1 );
3241                     if (regtry(reginfo, &s)) goto got_it;
3242                     s++;
3243                     while (s < strend && *s == ch)
3244                         s++;
3245                 }
3246             );
3247         }
3248         DEBUG_EXECUTE_r(if (!did_match)
3249                 Perl_re_printf( aTHX_
3250                                   "Did not find anchored character...\n")
3251                );
3252     }
3253     else if (prog->anchored_substr != NULL
3254               || prog->anchored_utf8 != NULL
3255               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3256                   && prog->float_max_offset < strend - s)) {
3257         SV *must;
3258         SSize_t back_max;
3259         SSize_t back_min;
3260         char *last;
3261         char *last1;            /* Last position checked before */
3262 #ifdef DEBUGGING
3263         int did_match = 0;
3264 #endif
3265         if (prog->anchored_substr || prog->anchored_utf8) {
3266             if (utf8_target) {
3267                 if (! prog->anchored_utf8) {
3268                     to_utf8_substr(prog);
3269                 }
3270                 must = prog->anchored_utf8;
3271             }
3272             else {
3273                 if (! prog->anchored_substr) {
3274                     if (! to_byte_substr(prog)) {
3275                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3276                     }
3277                 }
3278                 must = prog->anchored_substr;
3279             }
3280             back_max = back_min = prog->anchored_offset;
3281         } else {
3282             if (utf8_target) {
3283                 if (! prog->float_utf8) {
3284                     to_utf8_substr(prog);
3285                 }
3286                 must = prog->float_utf8;
3287             }
3288             else {
3289                 if (! prog->float_substr) {
3290                     if (! to_byte_substr(prog)) {
3291                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3292                     }
3293                 }
3294                 must = prog->float_substr;
3295             }
3296             back_max = prog->float_max_offset;
3297             back_min = prog->float_min_offset;
3298         }
3299
3300         if (back_min<0) {
3301             last = strend;
3302         } else {
3303             last = HOP3c(strend,        /* Cannot start after this */
3304                   -(SSize_t)(CHR_SVLEN(must)
3305                          - (SvTAIL(must) != 0) + back_min), strbeg);
3306         }
3307         if (s > reginfo->strbeg)
3308             last1 = HOPc(s, -1);
3309         else
3310             last1 = s - 1;      /* bogus */
3311
3312         /* XXXX check_substr already used to find "s", can optimize if
3313            check_substr==must. */
3314         dontbother = 0;
3315         strend = HOPc(strend, -dontbother);
3316         while ( (s <= last) &&
3317                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3318                                   (unsigned char*)strend, must,
3319                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3320             DEBUG_EXECUTE_r( did_match = 1 );
3321             if (HOPc(s, -back_max) > last1) {
3322                 last1 = HOPc(s, -back_min);
3323                 s = HOPc(s, -back_max);
3324             }
3325             else {
3326                 char * const t = (last1 >= reginfo->strbeg)
3327                                     ? HOPc(last1, 1) : last1 + 1;
3328
3329                 last1 = HOPc(s, -back_min);
3330                 s = t;
3331             }
3332             if (utf8_target) {
3333                 while (s <= last1) {
3334                     if (regtry(reginfo, &s))
3335                         goto got_it;
3336                     if (s >= last1) {
3337                         s++; /* to break out of outer loop */
3338                         break;
3339                     }
3340                     s += UTF8SKIP(s);
3341                 }
3342             }
3343             else {
3344                 while (s <= last1) {
3345                     if (regtry(reginfo, &s))
3346                         goto got_it;
3347                     s++;
3348                 }
3349             }
3350         }
3351         DEBUG_EXECUTE_r(if (!did_match) {
3352             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3353                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3354             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3355                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3356                                ? "anchored" : "floating"),
3357                 quoted, RE_SV_TAIL(must));
3358         });
3359         goto phooey;
3360     }
3361     else if ( (c = progi->regstclass) ) {
3362         if (minlen) {
3363             const OPCODE op = OP(progi->regstclass);
3364             /* don't bother with what can't match */
3365             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3366                 strend = HOPc(strend, -(minlen - 1));
3367         }
3368         DEBUG_EXECUTE_r({
3369             SV * const prop = sv_newmortal();
3370             regprop(prog, prop, c, reginfo, NULL);
3371             {
3372                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3373                     s,strend-s,60);
3374                 Perl_re_printf( aTHX_
3375                     "Matching stclass %.*s against %s (%d bytes)\n",
3376                     (int)SvCUR(prop), SvPVX_const(prop),
3377                      quoted, (int)(strend - s));
3378             }
3379         });
3380         if (find_byclass(prog, c, s, strend, reginfo))
3381             goto got_it;
3382         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3383     }
3384     else {
3385         dontbother = 0;
3386         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3387             /* Trim the end. */
3388             char *last= NULL;
3389             SV* float_real;
3390             STRLEN len;
3391             const char *little;
3392
3393             if (utf8_target) {
3394                 if (! prog->float_utf8) {
3395                     to_utf8_substr(prog);
3396                 }
3397                 float_real = prog->float_utf8;
3398             }
3399             else {
3400                 if (! prog->float_substr) {
3401                     if (! to_byte_substr(prog)) {
3402                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3403                     }
3404                 }
3405                 float_real = prog->float_substr;
3406             }
3407
3408             little = SvPV_const(float_real, len);
3409             if (SvTAIL(float_real)) {
3410                     /* This means that float_real contains an artificial \n on
3411                      * the end due to the presence of something like this:
3412                      * /foo$/ where we can match both "foo" and "foo\n" at the
3413                      * end of the string.  So we have to compare the end of the
3414                      * string first against the float_real without the \n and
3415                      * then against the full float_real with the string.  We
3416                      * have to watch out for cases where the string might be
3417                      * smaller than the float_real or the float_real without
3418                      * the \n. */
3419                     char *checkpos= strend - len;
3420                     DEBUG_OPTIMISE_r(
3421                         Perl_re_printf( aTHX_
3422                             "%sChecking for float_real.%s\n",
3423                             PL_colors[4], PL_colors[5]));
3424                     if (checkpos + 1 < strbeg) {
3425                         /* can't match, even if we remove the trailing \n
3426                          * string is too short to match */
3427                         DEBUG_EXECUTE_r(
3428                             Perl_re_printf( aTHX_
3429                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3430                                 PL_colors[4], PL_colors[5]));
3431                         goto phooey;
3432                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3433                         /* can match, the end of the string matches without the
3434                          * "\n" */
3435                         last = checkpos + 1;
3436                     } else if (checkpos < strbeg) {
3437                         /* cant match, string is too short when the "\n" is
3438                          * included */
3439                         DEBUG_EXECUTE_r(
3440                             Perl_re_printf( aTHX_
3441                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3442                                 PL_colors[4], PL_colors[5]));
3443                         goto phooey;
3444                     } else if (!multiline) {
3445                         /* non multiline match, so compare with the "\n" at the
3446                          * end of the string */
3447                         if (memEQ(checkpos, little, len)) {
3448                             last= checkpos;
3449                         } else {
3450                             DEBUG_EXECUTE_r(
3451                                 Perl_re_printf( aTHX_
3452                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3453                                     PL_colors[4], PL_colors[5]));
3454                             goto phooey;
3455                         }
3456                     } else {
3457                         /* multiline match, so we have to search for a place
3458                          * where the full string is located */
3459                         goto find_last;
3460                     }
3461             } else {
3462                   find_last:
3463                     if (len)
3464                         last = rninstr(s, strend, little, little + len);
3465                     else
3466                         last = strend;  /* matching "$" */
3467             }
3468             if (!last) {
3469                 /* at one point this block contained a comment which was
3470                  * probably incorrect, which said that this was a "should not
3471                  * happen" case.  Even if it was true when it was written I am
3472                  * pretty sure it is not anymore, so I have removed the comment
3473                  * and replaced it with this one. Yves */
3474                 DEBUG_EXECUTE_r(
3475                     Perl_re_printf( aTHX_
3476                         "%sString does not contain required substring, cannot match.%s\n",
3477                         PL_colors[4], PL_colors[5]
3478                     ));
3479                 goto phooey;
3480             }
3481             dontbother = strend - last + prog->float_min_offset;
3482         }
3483         if (minlen && (dontbother < minlen))
3484             dontbother = minlen - 1;
3485         strend -= dontbother;              /* this one's always in bytes! */
3486         /* We don't know much -- general case. */
3487         if (utf8_target) {
3488             for (;;) {
3489                 if (regtry(reginfo, &s))
3490                     goto got_it;
3491                 if (s >= strend)
3492                     break;
3493                 s += UTF8SKIP(s);
3494             };
3495         }
3496         else {
3497             do {
3498                 if (regtry(reginfo, &s))
3499                     goto got_it;
3500             } while (s++ < strend);
3501         }
3502     }
3503
3504     /* Failure. */
3505     goto phooey;
3506
3507   got_it:
3508     /* s/// doesn't like it if $& is earlier than where we asked it to
3509      * start searching (which can happen on something like /.\G/) */
3510     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3511             && (prog->offs[0].start < stringarg - strbeg))
3512     {
3513         /* this should only be possible under \G */
3514         assert(prog->intflags & PREGf_GPOS_SEEN);
3515         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3516             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3517         goto phooey;
3518     }
3519
3520     DEBUG_BUFFERS_r(
3521         if (swap)
3522             Perl_re_exec_indentf( aTHX_
3523                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3524                 0,
3525                 PTR2UV(prog),
3526                 PTR2UV(swap)
3527             );
3528     );
3529     Safefree(swap);
3530
3531     /* clean up; this will trigger destructors that will free all slabs
3532      * above the current one, and cleanup the regmatch_info_aux
3533      * and regmatch_info_aux_eval sructs */
3534
3535     LEAVE_SCOPE(oldsave);
3536
3537     if (RXp_PAREN_NAMES(prog))
3538         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3539
3540     /* make sure $`, $&, $', and $digit will work later */
3541     if ( !(flags & REXEC_NOT_FIRST) )
3542         S_reg_set_capture_string(aTHX_ rx,
3543                                     strbeg, reginfo->strend,
3544                                     sv, flags, utf8_target);
3545
3546     return 1;
3547
3548   phooey:
3549     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3550                           PL_colors[4], PL_colors[5]));
3551
3552     /* clean up; this will trigger destructors that will free all slabs
3553      * above the current one, and cleanup the regmatch_info_aux
3554      * and regmatch_info_aux_eval sructs */
3555
3556     LEAVE_SCOPE(oldsave);
3557
3558     if (swap) {
3559         /* we failed :-( roll it back */
3560         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3561             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3562             0,
3563             PTR2UV(prog),
3564             PTR2UV(prog->offs),
3565             PTR2UV(swap)
3566         ));
3567         Safefree(prog->offs);
3568         prog->offs = swap;
3569     }
3570     return 0;
3571 }
3572
3573
3574 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3575  * Do inc before dec, in case old and new rex are the same */
3576 #define SET_reg_curpm(Re2)                          \
3577     if (reginfo->info_aux_eval) {                   \
3578         (void)ReREFCNT_inc(Re2);                    \
3579         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3580         PM_SETRE((PL_reg_curpm), (Re2));            \
3581     }
3582
3583
3584 /*
3585  - regtry - try match at specific point
3586  */
3587 STATIC bool                     /* 0 failure, 1 success */
3588 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3589 {
3590     CHECKPOINT lastcp;
3591     REGEXP *const rx = reginfo->prog;
3592     regexp *const prog = ReANY(rx);
3593     SSize_t result;
3594 #ifdef DEBUGGING
3595     U32 depth = 0; /* used by REGCP_SET */
3596 #endif
3597     RXi_GET_DECL(prog,progi);
3598     GET_RE_DEBUG_FLAGS_DECL;
3599
3600     PERL_ARGS_ASSERT_REGTRY;
3601
3602     reginfo->cutpoint=NULL;
3603
3604     prog->offs[0].start = *startposp - reginfo->strbeg;
3605     prog->lastparen = 0;
3606     prog->lastcloseparen = 0;
3607
3608     /* XXXX What this code is doing here?!!!  There should be no need
3609        to do this again and again, prog->lastparen should take care of
3610        this!  --ilya*/
3611
3612     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3613      * Actually, the code in regcppop() (which Ilya may be meaning by
3614      * prog->lastparen), is not needed at all by the test suite
3615      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3616      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3617      * Meanwhile, this code *is* needed for the
3618      * above-mentioned test suite tests to succeed.  The common theme
3619      * on those tests seems to be returning null fields from matches.
3620      * --jhi updated by dapm */
3621
3622     /* After encountering a variant of the issue mentioned above I think
3623      * the point Ilya was making is that if we properly unwind whenever
3624      * we set lastparen to a smaller value then we should not need to do
3625      * this every time, only when needed. So if we have tests that fail if
3626      * we remove this, then it suggests somewhere else we are improperly
3627      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3628      * places it is called, and related regcp() routines. - Yves */
3629 #if 1
3630     if (prog->nparens) {
3631         regexp_paren_pair *pp = prog->offs;
3632         I32 i;
3633         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3634             ++pp;
3635             pp->start = -1;
3636             pp->end = -1;
3637         }
3638     }
3639 #endif
3640     REGCP_SET(lastcp);
3641     result = regmatch(reginfo, *startposp, progi->program + 1);
3642     if (result != -1) {
3643         prog->offs[0].end = result;
3644         return 1;
3645     }
3646     if (reginfo->cutpoint)
3647         *startposp= reginfo->cutpoint;
3648     REGCP_UNWIND(lastcp);
3649     return 0;
3650 }
3651
3652
3653 #define sayYES goto yes
3654 #define sayNO goto no
3655 #define sayNO_SILENT goto no_silent
3656
3657 /* we dont use STMT_START/END here because it leads to
3658    "unreachable code" warnings, which are bogus, but distracting. */
3659 #define CACHEsayNO \
3660     if (ST.cache_mask) \
3661        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3662     sayNO
3663
3664 /* this is used to determine how far from the left messages like
3665    'failed...' are printed in regexec.c. It should be set such that
3666    messages are inline with the regop output that created them.
3667 */
3668 #define REPORT_CODE_OFF 29
3669 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3670 #ifdef DEBUGGING
3671 int
3672 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3673 {
3674     va_list ap;
3675     int result;
3676     PerlIO *f= Perl_debug_log;
3677     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3678     va_start(ap, depth);
3679     PerlIO_printf(f, "%*s|%4"UVuf"| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3680     result = PerlIO_vprintf(f, fmt, ap);
3681     va_end(ap);
3682     return result;
3683 }
3684 #endif /* DEBUGGING */
3685
3686
3687 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3688 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3689 #define CHRTEST_NOT_A_CP_1 -999
3690 #define CHRTEST_NOT_A_CP_2 -998
3691
3692 /* grab a new slab and return the first slot in it */
3693
3694 STATIC regmatch_state *
3695 S_push_slab(pTHX)
3696 {
3697 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3698     dMY_CXT;
3699 #endif
3700     regmatch_slab *s = PL_regmatch_slab->next;
3701     if (!s) {
3702         Newx(s, 1, regmatch_slab);
3703         s->prev = PL_regmatch_slab;
3704         s->next = NULL;
3705         PL_regmatch_slab->next = s;
3706     }
3707     PL_regmatch_slab = s;
3708     return SLAB_FIRST(s);
3709 }
3710
3711
3712 /* push a new state then goto it */
3713
3714 #define PUSH_STATE_GOTO(state, node, input) \
3715     pushinput = input; \
3716     scan = node; \
3717     st->resume_state = state; \
3718     goto push_state;
3719
3720 /* push a new state with success backtracking, then goto it */
3721
3722 #define PUSH_YES_STATE_GOTO(state, node, input) \
3723     pushinput = input; \
3724     scan = node; \
3725     st->resume_state = state; \
3726     goto push_yes_state;
3727
3728
3729
3730
3731 /*
3732
3733 regmatch() - main matching routine
3734
3735 This is basically one big switch statement in a loop. We execute an op,
3736 set 'next' to point the next op, and continue. If we come to a point which
3737 we may need to backtrack to on failure such as (A|B|C), we push a
3738 backtrack state onto the backtrack stack. On failure, we pop the top
3739 state, and re-enter the loop at the state indicated. If there are no more
3740 states to pop, we return failure.
3741
3742 Sometimes we also need to backtrack on success; for example /A+/, where
3743 after successfully matching one A, we need to go back and try to
3744 match another one; similarly for lookahead assertions: if the assertion
3745 completes successfully, we backtrack to the state just before the assertion
3746 and then carry on.  In these cases, the pushed state is marked as
3747 'backtrack on success too'. This marking is in fact done by a chain of
3748 pointers, each pointing to the previous 'yes' state. On success, we pop to
3749 the nearest yes state, discarding any intermediate failure-only states.
3750 Sometimes a yes state is pushed just to force some cleanup code to be
3751 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3752 it to free the inner regex.
3753
3754 Note that failure backtracking rewinds the cursor position, while
3755 success backtracking leaves it alone.
3756
3757 A pattern is complete when the END op is executed, while a subpattern
3758 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3759 ops trigger the "pop to last yes state if any, otherwise return true"
3760 behaviour.
3761
3762 A common convention in this function is to use A and B to refer to the two
3763 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3764 the subpattern to be matched possibly multiple times, while B is the entire
3765 rest of the pattern. Variable and state names reflect this convention.
3766
3767 The states in the main switch are the union of ops and failure/success of
3768 substates associated with with that op.  For example, IFMATCH is the op
3769 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3770 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3771 successfully matched A and IFMATCH_A_fail is a state saying that we have
3772 just failed to match A. Resume states always come in pairs. The backtrack
3773 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3774 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3775 on success or failure.
3776
3777 The struct that holds a backtracking state is actually a big union, with
3778 one variant for each major type of op. The variable st points to the
3779 top-most backtrack struct. To make the code clearer, within each
3780 block of code we #define ST to alias the relevant union.
3781
3782 Here's a concrete example of a (vastly oversimplified) IFMATCH
3783 implementation:
3784
3785     switch (state) {
3786     ....
3787
3788 #define ST st->u.ifmatch
3789
3790     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3791         ST.foo = ...; // some state we wish to save
3792         ...
3793         // push a yes backtrack state with a resume value of
3794         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3795         // first node of A:
3796         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3797         // NOTREACHED
3798
3799     case IFMATCH_A: // we have successfully executed A; now continue with B
3800         next = B;
3801         bar = ST.foo; // do something with the preserved value
3802         break;
3803
3804     case IFMATCH_A_fail: // A failed, so the assertion failed
3805         ...;   // do some housekeeping, then ...
3806         sayNO; // propagate the failure
3807
3808 #undef ST
3809
3810     ...
3811     }
3812
3813 For any old-timers reading this who are familiar with the old recursive
3814 approach, the code above is equivalent to:
3815
3816     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3817     {
3818         int foo = ...
3819         ...
3820         if (regmatch(A)) {
3821             next = B;
3822             bar = foo;
3823             break;
3824         }
3825         ...;   // do some housekeeping, then ...
3826         sayNO; // propagate the failure
3827     }
3828
3829 The topmost backtrack state, pointed to by st, is usually free. If you
3830 want to claim it, populate any ST.foo fields in it with values you wish to
3831 save, then do one of
3832
3833         PUSH_STATE_GOTO(resume_state, node, newinput);
3834         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3835
3836 which sets that backtrack state's resume value to 'resume_state', pushes a
3837 new free entry to the top of the backtrack stack, then goes to 'node'.
3838 On backtracking, the free slot is popped, and the saved state becomes the
3839 new free state. An ST.foo field in this new top state can be temporarily
3840 accessed to retrieve values, but once the main loop is re-entered, it
3841 becomes available for reuse.
3842
3843 Note that the depth of the backtrack stack constantly increases during the
3844 left-to-right execution of the pattern, rather than going up and down with
3845 the pattern nesting. For example the stack is at its maximum at Z at the
3846 end of the pattern, rather than at X in the following:
3847
3848     /(((X)+)+)+....(Y)+....Z/
3849
3850 The only exceptions to this are lookahead/behind assertions and the cut,
3851 (?>A), which pop all the backtrack states associated with A before
3852 continuing.
3853
3854 Backtrack state structs are allocated in slabs of about 4K in size.
3855 PL_regmatch_state and st always point to the currently active state,
3856 and PL_regmatch_slab points to the slab currently containing
3857 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3858 allocated, and is never freed until interpreter destruction. When the slab
3859 is full, a new one is allocated and chained to the end. At exit from
3860 regmatch(), slabs allocated since entry are freed.
3861
3862 */
3863
3864
3865 #define DEBUG_STATE_pp(pp)                                  \
3866     DEBUG_STATE_r({                                         \
3867         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
3868         Perl_re_printf( aTHX_                                           \
3869             "%*s" pp " %s%s%s%s%s\n",                       \
3870             INDENT_CHARS(depth), "",                        \
3871             PL_reg_name[st->resume_state],                  \
3872             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3873             ((st==yes_state) ? "Y" : ""),                   \
3874             ((st==mark_state) ? "M" : ""),                  \
3875             ((st==yes_state||st==mark_state) ? "]" : "")    \
3876         );                                                  \
3877     });
3878
3879
3880 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3881
3882 #ifdef DEBUGGING
3883
3884 STATIC void
3885 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3886     const char *start, const char *end, const char *blurb)
3887 {
3888     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3889
3890     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3891
3892     if (!PL_colorset)
3893             reginitcolors();
3894     {
3895         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3896             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3897
3898         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3899             start, end - start, 60);
3900
3901         Perl_re_printf( aTHX_
3902             "%s%s REx%s %s against %s\n",
3903                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3904
3905         if (utf8_target||utf8_pat)
3906             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
3907                 utf8_pat ? "pattern" : "",
3908                 utf8_pat && utf8_target ? " and " : "",
3909                 utf8_target ? "string" : ""
3910             );
3911     }
3912 }
3913
3914 STATIC void
3915 S_dump_exec_pos(pTHX_ const char *locinput,
3916                       const regnode *scan,
3917                       const char *loc_regeol,
3918                       const char *loc_bostr,
3919                       const char *loc_reg_starttry,
3920                       const bool utf8_target,
3921                       const U32 depth
3922                 )
3923 {
3924     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3925     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3926     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3927     /* The part of the string before starttry has one color
3928        (pref0_len chars), between starttry and current
3929        position another one (pref_len - pref0_len chars),
3930        after the current position the third one.
3931        We assume that pref0_len <= pref_len, otherwise we
3932        decrease pref0_len.  */
3933     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3934         ? (5 + taill) - l : locinput - loc_bostr;
3935     int pref0_len;
3936
3937     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3938
3939     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3940         pref_len++;
3941     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3942     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3943         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3944               ? (5 + taill) - pref_len : loc_regeol - locinput);
3945     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3946         l--;
3947     if (pref0_len < 0)
3948         pref0_len = 0;
3949     if (pref0_len > pref_len)
3950         pref0_len = pref_len;
3951     {
3952         const int is_uni = utf8_target ? 1 : 0;
3953
3954         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3955             (locinput - pref_len),pref0_len, 60, 4, 5);
3956
3957         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3958                     (locinput - pref_len + pref0_len),
3959                     pref_len - pref0_len, 60, 2, 3);
3960
3961         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3962                     locinput, loc_regeol - locinput, 10, 0, 1);
3963
3964         const STRLEN tlen=len0+len1+len2;
3965         Perl_re_printf( aTHX_
3966                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|%4u| ",
3967                     (IV)(locinput - loc_bostr),
3968                     len0, s0,
3969                     len1, s1,
3970                     (docolor ? "" : "> <"),
3971                     len2, s2,
3972                     (int)(tlen > 19 ? 0 :  19 - tlen),
3973                     "",
3974                     depth);
3975     }
3976 }
3977
3978 #endif
3979
3980 /* reg_check_named_buff_matched()
3981  * Checks to see if a named buffer has matched. The data array of
3982  * buffer numbers corresponding to the buffer is expected to reside
3983  * in the regexp->data->data array in the slot stored in the ARG() of
3984  * node involved. Note that this routine doesn't actually care about the
3985  * name, that information is not preserved from compilation to execution.
3986  * Returns the index of the leftmost defined buffer with the given name
3987  * or 0 if non of the buffers matched.
3988  */
3989 STATIC I32
3990 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3991 {
3992     I32 n;
3993     RXi_GET_DECL(rex,rexi);
3994     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3995     I32 *nums=(I32*)SvPVX(sv_dat);
3996
3997     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3998
3999     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4000         if ((I32)rex->lastparen >= nums[n] &&
4001             rex->offs[nums[n]].end != -1)
4002         {
4003             return nums[n];
4004         }
4005     }
4006     return 0;
4007 }
4008
4009
4010 static bool
4011 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4012         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4013 {
4014     /* This function determines if there are one or two characters that match
4015      * the first character of the passed-in EXACTish node <text_node>, and if
4016      * so, returns them in the passed-in pointers.
4017      *
4018      * If it determines that no possible character in the target string can
4019      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4020      * the first character in <text_node> requires UTF-8 to represent, and the
4021      * target string isn't in UTF-8.)
4022      *
4023      * If there are more than two characters that could match the beginning of
4024      * <text_node>, or if more context is required to determine a match or not,
4025      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4026      *
4027      * The motiviation behind this function is to allow the caller to set up
4028      * tight loops for matching.  If <text_node> is of type EXACT, there is
4029      * only one possible character that can match its first character, and so
4030      * the situation is quite simple.  But things get much more complicated if
4031      * folding is involved.  It may be that the first character of an EXACTFish
4032      * node doesn't participate in any possible fold, e.g., punctuation, so it
4033      * can be matched only by itself.  The vast majority of characters that are
4034      * in folds match just two things, their lower and upper-case equivalents.
4035      * But not all are like that; some have multiple possible matches, or match
4036      * sequences of more than one character.  This function sorts all that out.
4037      *
4038      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4039      * loop of trying to match A*, we know we can't exit where the thing
4040      * following it isn't a B.  And something can't be a B unless it is the
4041      * beginning of B.  By putting a quick test for that beginning in a tight
4042      * loop, we can rule out things that can't possibly be B without having to
4043      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4044      * character, we can make a tight loop matching A*, using the outputs of
4045      * this function.
4046      *
4047      * If the target string to match isn't in UTF-8, and there aren't
4048      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4049      * the one or two possible octets (which are characters in this situation)
4050      * that can match.  In all cases, if there is only one character that can
4051      * match, *<c1p> and *<c2p> will be identical.
4052      *
4053      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4054      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4055      * can match the beginning of <text_node>.  They should be declared with at
4056      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4057      * undefined what these contain.)  If one or both of the buffers are
4058      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4059      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4060      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4061      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4062      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4063
4064     const bool utf8_target = reginfo->is_utf8_target;
4065
4066     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4067     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4068     bool use_chrtest_void = FALSE;
4069     const bool is_utf8_pat = reginfo->is_utf8_pat;
4070
4071     /* Used when we have both utf8 input and utf8 output, to avoid converting
4072      * to/from code points */
4073     bool utf8_has_been_setup = FALSE;
4074
4075     dVAR;
4076
4077     U8 *pat = (U8*)STRING(text_node);
4078     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4079
4080     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4081
4082         /* In an exact node, only one thing can be matched, that first
4083          * character.  If both the pat and the target are UTF-8, we can just
4084          * copy the input to the output, avoiding finding the code point of
4085          * that character */
4086         if (!is_utf8_pat) {
4087             c2 = c1 = *pat;
4088         }
4089         else if (utf8_target) {
4090             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4091             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4092             utf8_has_been_setup = TRUE;
4093         }
4094         else {
4095             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4096         }
4097     }
4098     else { /* an EXACTFish node */
4099         U8 *pat_end = pat + STR_LEN(text_node);
4100
4101         /* An EXACTFL node has at least some characters unfolded, because what
4102          * they match is not known until now.  So, now is the time to fold
4103          * the first few of them, as many as are needed to determine 'c1' and
4104          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4105          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4106          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4107          * need to fold as many characters as a single character can fold to,
4108          * so that later we can check if the first ones are such a multi-char
4109          * fold.  But, in such a pattern only locale-problematic characters
4110          * aren't folded, so we can skip this completely if the first character
4111          * in the node isn't one of the tricky ones */
4112         if (OP(text_node) == EXACTFL) {
4113
4114             if (! is_utf8_pat) {
4115                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4116                 {
4117                     folded[0] = folded[1] = 's';
4118                     pat = folded;
4119                     pat_end = folded + 2;
4120                 }
4121             }
4122             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4123                 U8 *s = pat;
4124                 U8 *d = folded;
4125                 int i;
4126
4127                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4128                     if (isASCII(*s)) {
4129                         *(d++) = (U8) toFOLD_LC(*s);
4130                         s++;
4131                     }
4132                     else {
4133                         STRLEN len;
4134                         _to_utf8_fold_flags(s,
4135                                             d,
4136                                             &len,
4137                                             FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4138                         d += len;
4139                         s += UTF8SKIP(s);
4140                     }
4141                 }
4142
4143                 pat = folded;
4144                 pat_end = d;
4145             }
4146         }
4147
4148         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4149              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4150         {
4151             /* Multi-character folds require more context to sort out.  Also
4152              * PL_utf8_foldclosures used below doesn't handle them, so have to
4153              * be handled outside this routine */
4154             use_chrtest_void = TRUE;
4155         }
4156         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4157             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4158             if (c1 > 255) {
4159                 /* Load the folds hash, if not already done */
4160                 SV** listp;
4161                 if (! PL_utf8_foldclosures) {
4162                     _load_PL_utf8_foldclosures();
4163                 }
4164
4165                 /* The fold closures data structure is a hash with the keys
4166                  * being the UTF-8 of every character that is folded to, like
4167                  * 'k', and the values each an array of all code points that
4168                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4169                  * Multi-character folds are not included */
4170                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4171                                         (char *) pat,
4172                                         UTF8SKIP(pat),
4173                                         FALSE))))
4174                 {
4175                     /* Not found in the hash, therefore there are no folds
4176                     * containing it, so there is only a single character that
4177                     * could match */
4178                     c2 = c1;
4179                 }
4180                 else {  /* Does participate in folds */
4181                     AV* list = (AV*) *listp;
4182                     if (av_tindex_nomg(list) != 1) {
4183
4184                         /* If there aren't exactly two folds to this, it is
4185                          * outside the scope of this function */
4186                         use_chrtest_void = TRUE;
4187                     }
4188                     else {  /* There are two.  Get them */
4189                         SV** c_p = av_fetch(list, 0, FALSE);
4190                         if (c_p == NULL) {
4191                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4192                         }
4193                         c1 = SvUV(*c_p);
4194
4195                         c_p = av_fetch(list, 1, FALSE);
4196                         if (c_p == NULL) {
4197                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4198                         }
4199                         c2 = SvUV(*c_p);
4200
4201                         /* Folds that cross the 255/256 boundary are forbidden
4202                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4203                          * one is ASCIII.  Since the pattern character is above
4204                          * 255, and its only other match is below 256, the only
4205                          * legal match will be to itself.  We have thrown away
4206                          * the original, so have to compute which is the one
4207                          * above 255. */
4208                         if ((c1 < 256) != (c2 < 256)) {
4209                             if ((OP(text_node) == EXACTFL
4210                                  && ! IN_UTF8_CTYPE_LOCALE)
4211                                 || ((OP(text_node) == EXACTFA
4212                                     || OP(text_node) == EXACTFA_NO_TRIE)
4213                                     && (isASCII(c1) || isASCII(c2))))
4214                             {
4215                                 if (c1 < 256) {
4216                                     c1 = c2;
4217                                 }
4218                                 else {
4219                                     c2 = c1;
4220                                 }
4221                             }
4222                         }
4223                     }
4224                 }
4225             }
4226             else /* Here, c1 is <= 255 */
4227                 if (utf8_target
4228                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4229                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4230                     && ((OP(text_node) != EXACTFA
4231                         && OP(text_node) != EXACTFA_NO_TRIE)
4232                         || ! isASCII(c1)))
4233             {
4234                 /* Here, there could be something above Latin1 in the target
4235                  * which folds to this character in the pattern.  All such
4236                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4237                  * than two characters involved in their folds, so are outside
4238                  * the scope of this function */
4239                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4240                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4241                 }
4242                 else {
4243                     use_chrtest_void = TRUE;
4244                 }
4245             }
4246             else { /* Here nothing above Latin1 can fold to the pattern
4247                       character */
4248                 switch (OP(text_node)) {
4249
4250                     case EXACTFL:   /* /l rules */
4251                         c2 = PL_fold_locale[c1];
4252                         break;
4253
4254                     case EXACTF:   /* This node only generated for non-utf8
4255                                     patterns */
4256                         assert(! is_utf8_pat);
4257                         if (! utf8_target) {    /* /d rules */
4258                             c2 = PL_fold[c1];
4259                             break;
4260                         }
4261                         /* FALLTHROUGH */
4262                         /* /u rules for all these.  This happens to work for
4263                         * EXACTFA as nothing in Latin1 folds to ASCII */
4264                     case EXACTFA_NO_TRIE:   /* This node only generated for
4265                                             non-utf8 patterns */
4266                         assert(! is_utf8_pat);
4267                         /* FALLTHROUGH */
4268                     case EXACTFA:
4269                     case EXACTFU_SS:
4270                     case EXACTFU:
4271                         c2 = PL_fold_latin1[c1];
4272                         break;
4273
4274                     default:
4275                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4276                         NOT_REACHED; /* NOTREACHED */
4277                 }
4278             }
4279         }
4280     }
4281
4282     /* Here have figured things out.  Set up the returns */
4283     if (use_chrtest_void) {
4284         *c2p = *c1p = CHRTEST_VOID;
4285     }
4286     else if (utf8_target) {
4287         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4288             uvchr_to_utf8(c1_utf8, c1);
4289             uvchr_to_utf8(c2_utf8, c2);
4290         }
4291
4292         /* Invariants are stored in both the utf8 and byte outputs; Use
4293          * negative numbers otherwise for the byte ones.  Make sure that the
4294          * byte ones are the same iff the utf8 ones are the same */
4295         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4296         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4297                 ? *c2_utf8
4298                 : (c1 == c2)
4299                   ? CHRTEST_NOT_A_CP_1
4300                   : CHRTEST_NOT_A_CP_2;
4301     }
4302     else if (c1 > 255) {
4303        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4304                            can represent */
4305            return FALSE;
4306        }
4307
4308        *c1p = *c2p = c2;    /* c2 is the only representable value */
4309     }
4310     else {  /* c1 is representable; see about c2 */
4311        *c1p = c1;
4312        *c2p = (c2 < 256) ? c2 : c1;
4313     }
4314
4315     return TRUE;
4316 }
4317
4318 STATIC bool
4319 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4320 {
4321     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4322      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4323
4324     PERL_ARGS_ASSERT_ISGCB;
4325
4326     switch (GCB_table[before][after]) {
4327         case GCB_BREAKABLE:
4328             return TRUE;
4329
4330         case GCB_NOBREAK:
4331             return FALSE;
4332
4333         case GCB_RI_then_RI:
4334             {
4335                 int RI_count = 1;
4336                 U8 * temp_pos = (U8 *) curpos;
4337
4338                 /* Do not break within emoji flag sequences. That is, do not
4339                  * break between regional indicator (RI) symbols if there is an
4340                  * odd number of RI characters before the break point.
4341                  *  GB12     ^ (RI RI)* RI × RI
4342                  *  GB13 [^RI] (RI RI)* RI × RI */
4343
4344                 while (backup_one_GCB(strbeg,
4345                                     &temp_pos,
4346                                     utf8_target) == GCB_Regional_Indicator)
4347                 {
4348                     RI_count++;
4349                 }
4350
4351                 return RI_count % 2 != 1;
4352             }
4353
4354         case GCB_EX_then_EM:
4355
4356             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4357             {
4358                 U8 * temp_pos = (U8 *) curpos;
4359                 GCB_enum prev;
4360
4361                 do {
4362                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4363                 }
4364                 while (prev == GCB_Extend);
4365
4366                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4367             }
4368
4369         default:
4370             break;
4371     }
4372
4373 #ifdef DEBUGGING
4374     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4375                                   before, after, GCB_table[before][after]);
4376     assert(0);
4377 #endif
4378     return TRUE;
4379 }
4380
4381 STATIC GCB_enum
4382 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4383 {
4384     GCB_enum gcb;
4385
4386     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4387
4388     if (*curpos < strbeg) {
4389         return GCB_EDGE;
4390     }
4391
4392     if (utf8_target) {
4393         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4394         U8 * prev_prev_char_pos;
4395
4396         if (! prev_char_pos) {
4397             return GCB_EDGE;
4398         }
4399
4400         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4401             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4402             *curpos = prev_char_pos;
4403             prev_char_pos = prev_prev_char_pos;
4404         }
4405         else {
4406             *curpos = (U8 *) strbeg;
4407             return GCB_EDGE;
4408         }
4409     }
4410     else {
4411         if (*curpos - 2 < strbeg) {
4412             *curpos = (U8 *) strbeg;
4413             return GCB_EDGE;
4414         }
4415         (*curpos)--;
4416         gcb = getGCB_VAL_CP(*(*curpos - 1));
4417     }
4418
4419     return gcb;
4420 }
4421
4422 /* Combining marks attach to most classes that precede them, but this defines
4423  * the exceptions (from TR14) */
4424 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4425                                      || prev == LB_Mandatory_Break      \
4426                                      || prev == LB_Carriage_Return      \
4427                                      || prev == LB_Line_Feed            \
4428                                      || prev == LB_Next_Line            \
4429                                      || prev == LB_Space                \
4430                                      || prev == LB_ZWSpace))
4431
4432 STATIC bool
4433 S_isLB(pTHX_ LB_enum before,
4434              LB_enum after,
4435              const U8 * const strbeg,
4436              const U8 * const curpos,
4437              const U8 * const strend,
4438              const bool utf8_target)
4439 {
4440     U8 * temp_pos = (U8 *) curpos;
4441     LB_enum prev = before;
4442
4443     /* Is the boundary between 'before' and 'after' line-breakable?
4444      * Most of this is just a table lookup of a generated table from Unicode
4445      * rules.  But some rules require context to decide, and so have to be
4446      * implemented in code */
4447
4448     PERL_ARGS_ASSERT_ISLB;
4449
4450     /* Rule numbers in the comments below are as of Unicode 9.0 */
4451
4452   redo:
4453     before = prev;
4454     switch (LB_table[before][after]) {
4455         case LB_BREAKABLE:
4456             return TRUE;
4457
4458         case LB_NOBREAK:
4459         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4460             return FALSE;
4461
4462         case LB_SP_foo + LB_BREAKABLE:
4463         case LB_SP_foo + LB_NOBREAK:
4464         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4465
4466             /* When we have something following a SP, we have to look at the
4467              * context in order to know what to do.
4468              *
4469              * SP SP should not reach here because LB7: Do not break before
4470              * spaces.  (For two spaces in a row there is nothing that
4471              * overrides that) */
4472             assert(after != LB_Space);
4473
4474             /* Here we have a space followed by a non-space.  Mostly this is a
4475              * case of LB18: "Break after spaces".  But there are complications
4476              * as the handling of spaces is somewhat tricky.  They are in a
4477              * number of rules, which have to be applied in priority order, but
4478              * something earlier in the string can cause a rule to be skipped
4479              * and a lower priority rule invoked.  A prime example is LB7 which
4480              * says don't break before a space.  But rule LB8 (lower priority)
4481              * says that the first break opportunity after a ZW is after any
4482              * span of spaces immediately after it.  If a ZW comes before a SP
4483              * in the input, rule LB8 applies, and not LB7.  Other such rules
4484              * involve combining marks which are rules 9 and 10, but they may
4485              * override higher priority rules if they come earlier in the
4486              * string.  Since we're doing random access into the middle of the
4487              * string, we have to look for rules that should get applied based
4488              * on both string position and priority.  Combining marks do not
4489              * attach to either ZW nor SP, so we don't have to consider them
4490              * until later.
4491              *
4492              * To check for LB8, we have to find the first non-space character
4493              * before this span of spaces */
4494             do {
4495                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4496             }
4497             while (prev == LB_Space);
4498
4499             /* LB8 Break before any character following a zero-width space,
4500              * even if one or more spaces intervene.
4501              *      ZW SP* ÷
4502              * So if we have a ZW just before this span, and to get here this
4503              * is the final space in the span. */
4504             if (prev == LB_ZWSpace) {
4505                 return TRUE;
4506             }
4507
4508             /* Here, not ZW SP+.  There are several rules that have higher
4509              * priority than LB18 and can be resolved now, as they don't depend
4510              * on anything earlier in the string (except ZW, which we have
4511              * already handled).  One of these rules is LB11 Do not break
4512              * before Word joiner, but we have specially encoded that in the
4513              * lookup table so it is caught by the single test below which
4514              * catches the other ones. */
4515             if (LB_table[LB_Space][after] - LB_SP_foo
4516                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4517             {
4518                 return FALSE;
4519             }
4520
4521             /* If we get here, we have to XXX consider combining marks. */
4522             if (prev == LB_Combining_Mark) {
4523
4524                 /* What happens with these depends on the character they
4525                  * follow.  */
4526                 do {
4527                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4528                 }
4529                 while (prev == LB_Combining_Mark);
4530
4531                 /* Most times these attach to and inherit the characteristics
4532                  * of that character, but not always, and when not, they are to
4533                  * be treated as AL by rule LB10. */
4534                 if (! LB_CM_ATTACHES_TO(prev)) {
4535                     prev = LB_Alphabetic;
4536                 }
4537             }
4538
4539             /* Here, we have the character preceding the span of spaces all set
4540              * up.  We follow LB18: "Break after spaces" unless the table shows
4541              * that is overriden */
4542             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4543
4544         case LB_CM_ZWJ_foo:
4545
4546             /* We don't know how to treat the CM except by looking at the first
4547              * non-CM character preceding it.  ZWJ is treated as CM */
4548             do {
4549                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4550             }
4551             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4552
4553             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4554              * attatches to it, then it inherits the behavior of 'prev'.  If it
4555              * doesn't attach, it is to be treated as an AL */
4556             if (! LB_CM_ATTACHES_TO(prev)) {
4557                 prev = LB_Alphabetic;
4558             }
4559
4560             goto redo;
4561
4562         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4563         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4564
4565             /* LB21a Don't break after Hebrew + Hyphen.
4566              * HL (HY | BA) × */
4567
4568             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4569                                                           == LB_Hebrew_Letter)
4570             {
4571                 return FALSE;
4572             }
4573
4574             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4575
4576         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4577         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4578
4579             /* LB25a (PR | PO) × ( OP | HY )? NU */
4580             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4581                 return FALSE;
4582             }
4583
4584             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4585                                                                 == LB_BREAKABLE;
4586
4587         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4588         case LB_SY_or_IS_then_various + LB_NOBREAK:
4589         {
4590             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4591
4592             LB_enum temp = prev;
4593             do {
4594                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4595             }
4596             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4597             if (temp == LB_Numeric) {
4598                 return FALSE;
4599             }
4600
4601             return LB_table[prev][after] - LB_SY_or_IS_then_various
4602                                                                == LB_BREAKABLE;
4603         }
4604
4605         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4606         case LB_various_then_PO_or_PR + LB_NOBREAK:
4607         {
4608             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4609
4610             LB_enum temp = prev;
4611             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4612             {
4613                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4614             }
4615             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4616                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4617             }
4618             if (temp == LB_Numeric) {
4619                 return FALSE;
4620             }
4621             return LB_various_then_PO_or_PR;
4622         }
4623
4624         case LB_RI_then_RI + LB_NOBREAK:
4625         case LB_RI_then_RI + LB_BREAKABLE:
4626             {
4627                 int RI_count = 1;
4628
4629                 /* LB30a Break between two regional indicator symbols if and
4630                  * only if there are an even number of regional indicators
4631                  * preceding the position of the break.
4632                  *
4633                  *  sot (RI RI)* RI × RI
4634                  *  [^RI] (RI RI)* RI × RI */
4635
4636                 while (backup_one_LB(strbeg,
4637                                      &temp_pos,
4638                                      utf8_target) == LB_Regional_Indicator)
4639                 {
4640                     RI_count++;
4641                 }
4642
4643                 return RI_count % 2 == 0;
4644             }
4645
4646         default:
4647             break;
4648     }
4649
4650 #ifdef DEBUGGING
4651     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4652                                   before, after, LB_table[before][after]);
4653     assert(0);
4654 #endif
4655     return TRUE;
4656 }
4657
4658 STATIC LB_enum
4659 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4660 {
4661     LB_enum lb;
4662
4663     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4664
4665     if (*curpos >= strend) {
4666         return LB_EDGE;
4667     }
4668
4669     if (utf8_target) {
4670         *curpos += UTF8SKIP(*curpos);
4671         if (*curpos >= strend) {
4672             return LB_EDGE;
4673         }
4674         lb = getLB_VAL_UTF8(*curpos, strend);
4675     }
4676     else {
4677         (*curpos)++;
4678         if (*curpos >= strend) {
4679             return LB_EDGE;
4680         }
4681         lb = getLB_VAL_CP(**curpos);
4682     }
4683
4684     return lb;
4685 }
4686
4687 STATIC LB_enum
4688 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4689 {
4690     LB_enum lb;
4691
4692     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4693
4694     if (*curpos < strbeg) {
4695         return LB_EDGE;
4696     }
4697
4698     if (utf8_target) {
4699         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4700         U8 * prev_prev_char_pos;
4701
4702         if (! prev_char_pos) {
4703             return LB_EDGE;
4704         }
4705
4706         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4707             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4708             *curpos = prev_char_pos;
4709             prev_char_pos = prev_prev_char_pos;
4710         }
4711         else {
4712             *curpos = (U8 *) strbeg;
4713             return LB_EDGE;
4714         }
4715     }
4716     else {
4717         if (*curpos - 2 < strbeg) {
4718             *curpos = (U8 *) strbeg;
4719             return LB_EDGE;
4720         }
4721         (*curpos)--;
4722         lb = getLB_VAL_CP(*(*curpos - 1));
4723     }
4724
4725     return lb;
4726 }
4727
4728 STATIC bool
4729 S_isSB(pTHX_ SB_enum before,
4730              SB_enum after,
4731              const U8 * const strbeg,
4732              const U8 * const curpos,
4733              const U8 * const strend,
4734              const bool utf8_target)
4735 {
4736     /* returns a boolean indicating if there is a Sentence Boundary Break
4737      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4738
4739     U8 * lpos = (U8 *) curpos;
4740     bool has_para_sep = FALSE;
4741     bool has_sp = FALSE;
4742
4743     PERL_ARGS_ASSERT_ISSB;
4744
4745     /* Break at the start and end of text.
4746         SB1.  sot  ÷
4747         SB2.  ÷  eot
4748       But unstated in Unicode is don't break if the text is empty */
4749     if (before == SB_EDGE || after == SB_EDGE) {
4750         return before != after;
4751     }
4752
4753     /* SB 3: Do not break within CRLF. */
4754     if (before == SB_CR && after == SB_LF) {
4755         return FALSE;
4756     }
4757
4758     /* Break after paragraph separators.  CR and LF are considered
4759      * so because Unicode views text as like word processing text where there
4760      * are no newlines except between paragraphs, and the word processor takes
4761      * care of wrapping without there being hard line-breaks in the text *./
4762        SB4.  Sep | CR | LF  ÷ */
4763     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4764         return TRUE;
4765     }
4766
4767     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4768      * (See Section 6.2, Replacing Ignore Rules.)
4769         SB5.  X (Extend | Format)*  →  X */
4770     if (after == SB_Extend || after == SB_Format) {
4771
4772         /* Implied is that the these characters attach to everything
4773          * immediately prior to them except for those separator-type
4774          * characters.  And the rules earlier have already handled the case
4775          * when one of those immediately precedes the extend char */
4776         return FALSE;
4777     }
4778
4779     if (before == SB_Extend || before == SB_Format) {
4780         U8 * temp_pos = lpos;
4781         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4782         if (   backup != SB_EDGE
4783             && backup != SB_Sep
4784             && backup != SB_CR
4785             && backup != SB_LF)
4786         {
4787             before = backup;
4788             lpos = temp_pos;
4789         }
4790
4791         /* Here, both 'before' and 'backup' are these types; implied is that we
4792          * don't break between them */
4793         if (backup == SB_Extend || backup == SB_Format) {
4794             return FALSE;
4795         }
4796     }
4797
4798     /* Do not break after ambiguous terminators like period, if they are
4799      * immediately followed by a number or lowercase letter, if they are
4800      * between uppercase letters, if the first following letter (optionally
4801      * after certain punctuation) is lowercase, or if they are followed by
4802      * "continuation" punctuation such as comma, colon, or semicolon. For
4803      * example, a period may be an abbreviation or numeric period, and thus may
4804      * not mark the end of a sentence.
4805
4806      * SB6. ATerm  ×  Numeric */
4807     if (before == SB_ATerm && after == SB_Numeric) {
4808         return FALSE;
4809     }
4810
4811     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4812     if (before == SB_ATerm && after == SB_Upper) {
4813         U8 * temp_pos = lpos;
4814         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4815         if (backup == SB_Upper || backup == SB_Lower) {
4816             return FALSE;
4817         }
4818     }
4819
4820     /* The remaining rules that aren't the final one, all require an STerm or
4821      * an ATerm after having backed up over some Close* Sp*, and in one case an
4822      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4823      * So do that backup now, setting flags if either Sp or a paragraph
4824      * separator are found */
4825
4826     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4827         has_para_sep = TRUE;
4828         before = backup_one_SB(strbeg, &lpos, utf8_target);
4829     }
4830
4831     if (before == SB_Sp) {
4832         has_sp = TRUE;
4833         do {
4834             before = backup_one_SB(strbeg, &lpos, utf8_target);
4835         }
4836         while (before == SB_Sp);
4837     }
4838
4839     while (before == SB_Close) {
4840         before = backup_one_SB(strbeg, &lpos, utf8_target);
4841     }
4842
4843     /* The next few rules apply only when the backed-up-to is an ATerm, and in
4844      * most cases an STerm */
4845     if (before == SB_STerm || before == SB_ATerm) {
4846
4847         /* So, here the lhs matches
4848          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
4849          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
4850          * The rules that apply here are:
4851          *
4852          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
4853                                            | LF | STerm | ATerm) )* Lower
4854            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4855            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
4856            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
4857            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
4858          */
4859
4860         /* And all but SB11 forbid having seen a paragraph separator */
4861         if (! has_para_sep) {
4862             if (before == SB_ATerm) {          /* SB8 */
4863                 U8 * rpos = (U8 *) curpos;
4864                 SB_enum later = after;
4865
4866                 while (    later != SB_OLetter
4867                         && later != SB_Upper
4868                         && later != SB_Lower
4869                         && later != SB_Sep
4870                         && later != SB_CR
4871                         && later != SB_LF
4872                         && later != SB_STerm
4873                         && later != SB_ATerm
4874                         && later != SB_EDGE)
4875                 {
4876                     later = advance_one_SB(&rpos, strend, utf8_target);
4877                 }
4878                 if (later == SB_Lower) {
4879                     return FALSE;
4880                 }
4881             }
4882
4883             if (   after == SB_SContinue    /* SB8a */
4884                 || after == SB_STerm
4885                 || after == SB_ATerm)
4886             {
4887                 return FALSE;
4888             }
4889
4890             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
4891                 if (   after == SB_Close
4892                     || after == SB_Sp
4893                     || after == SB_Sep
4894                     || after == SB_CR
4895                     || after == SB_LF)
4896                 {
4897                     return FALSE;
4898                 }
4899             }
4900
4901             /* SB10.  This and SB9 could probably be combined some way, but khw
4902              * has decided to follow the Unicode rule book precisely for
4903              * simplified maintenance */
4904             if (   after == SB_Sp
4905                 || after == SB_Sep
4906                 || after == SB_CR
4907                 || after == SB_LF)
4908             {
4909                 return FALSE;
4910             }
4911         }
4912
4913         /* SB11.  */
4914         return TRUE;
4915     }
4916
4917     /* Otherwise, do not break.
4918     SB12.  Any  ×  Any */
4919
4920     return FALSE;
4921 }
4922
4923 STATIC SB_enum
4924 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4925 {
4926     SB_enum sb;
4927
4928     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4929
4930     if (*curpos >= strend) {
4931         return SB_EDGE;
4932     }
4933
4934     if (utf8_target) {
4935         do {
4936             *curpos += UTF8SKIP(*curpos);
4937             if (*curpos >= strend) {
4938                 return SB_EDGE;
4939             }
4940             sb = getSB_VAL_UTF8(*curpos, strend);
4941         } while (sb == SB_Extend || sb == SB_Format);
4942     }
4943     else {
4944         do {
4945             (*curpos)++;
4946             if (*curpos >= strend) {
4947                 return SB_EDGE;
4948             }
4949             sb = getSB_VAL_CP(**curpos);
4950         } while (sb == SB_Extend || sb == SB_Format);
4951     }
4952
4953     return sb;
4954 }
4955
4956 STATIC SB_enum
4957 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4958 {
4959     SB_enum sb;
4960
4961     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4962
4963     if (*curpos < strbeg) {
4964         return SB_EDGE;
4965     }
4966
4967     if (utf8_target) {
4968         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4969         if (! prev_char_pos) {
4970             return SB_EDGE;
4971         }
4972
4973         /* Back up over Extend and Format.  curpos is always just to the right
4974          * of the characater whose value we are getting */
4975         do {
4976             U8 * prev_prev_char_pos;
4977             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4978                                                                       strbeg)))
4979             {
4980                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4981                 *curpos = prev_char_pos;
4982                 prev_char_pos = prev_prev_char_pos;
4983             }
4984             else {
4985                 *curpos = (U8 *) strbeg;
4986                 return SB_EDGE;
4987             }
4988         } while (sb == SB_Extend || sb == SB_Format);
4989     }
4990     else {
4991         do {
4992             if (*curpos - 2 < strbeg) {
4993                 *curpos = (U8 *) strbeg;
4994                 return SB_EDGE;
4995             }
4996             (*curpos)--;
4997             sb = getSB_VAL_CP(*(*curpos - 1));
4998         } while (sb == SB_Extend || sb == SB_Format);
4999     }
5000
5001     return sb;
5002 }
5003
5004 STATIC bool
5005 S_isWB(pTHX_ WB_enum previous,
5006              WB_enum before,
5007              WB_enum after,
5008              const U8 * const strbeg,
5009              const U8 * const curpos,
5010              const U8 * const strend,
5011              const bool utf8_target)
5012 {
5013     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5014      *  a Unicode word break, using their published algorithm, but tailored for
5015      *  Perl by treating spans of white space as one unit.  Context may be
5016      *  needed to make this determination.  If the value for the character
5017      *  before 'before' is known, it is passed as 'previous'; otherwise that
5018      *  should be set to WB_UNKNOWN.  The other input parameters give the
5019      *  boundaries and current position in the matching of the string.  That
5020      *  is, 'curpos' marks the position where the character whose wb value is
5021      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5022
5023     U8 * before_pos = (U8 *) curpos;
5024     U8 * after_pos = (U8 *) curpos;
5025     WB_enum prev = before;
5026     WB_enum next;
5027
5028     PERL_ARGS_ASSERT_ISWB;
5029
5030     /* Rule numbers in the comments below are as of Unicode 9.0 */
5031
5032   redo:
5033     before = prev;
5034     switch (WB_table[before][after]) {
5035         case WB_BREAKABLE:
5036             return TRUE;
5037
5038         case WB_NOBREAK:
5039             return FALSE;
5040
5041         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5042             next = advance_one_WB(&after_pos, strend, utf8_target,
5043                                  FALSE /* Don't skip Extend nor Format */ );
5044             /* A space immediately preceeding an Extend or Format is attached
5045              * to by them, and hence gets separated from previous spaces.
5046              * Otherwise don't break between horizontal white space */
5047             return next == WB_Extend || next == WB_Format;
5048
5049         /* WB4 Ignore Format and Extend characters, except when they appear at
5050          * the beginning of a region of text.  This code currently isn't
5051          * general purpose, but it works as the rules are currently and likely
5052          * to be laid out.  The reason it works is that when 'they appear at
5053          * the beginning of a region of text', the rule is to break before
5054          * them, just like any other character.  Therefore, the default rule
5055          * applies and we don't have to look in more depth.  Should this ever
5056          * change, we would have to have 2 'case' statements, like in the rules
5057          * below, and backup a single character (not spacing over the extend
5058          * ones) and then see if that is one of the region-end characters and
5059          * go from there */
5060         case WB_Ex_or_FO_or_ZWJ_then_foo:
5061             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5062             goto redo;
5063
5064         case WB_DQ_then_HL + WB_BREAKABLE:
5065         case WB_DQ_then_HL + WB_NOBREAK:
5066
5067             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5068
5069             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5070                                                             == WB_Hebrew_Letter)
5071             {
5072                 return FALSE;
5073             }
5074
5075              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5076
5077         case WB_HL_then_DQ + WB_BREAKABLE:
5078         case WB_HL_then_DQ + WB_NOBREAK:
5079
5080             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5081
5082             if (advance_one_WB(&after_pos, strend, utf8_target,
5083                                        TRUE /* Do skip Extend and Format */ )
5084                                                             == WB_Hebrew_Letter)
5085             {
5086                 return FALSE;
5087             }
5088
5089             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5090
5091         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5092         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5093
5094             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5095              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5096
5097             next = advance_one_WB(&after_pos, strend, utf8_target,
5098                                        TRUE /* Do skip Extend and Format */ );
5099
5100             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5101             {
5102                 return FALSE;
5103             }
5104
5105             return WB_table[before][after]
5106                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5107
5108         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5109         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5110
5111             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5112              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5113
5114             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5115             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5116             {
5117                 return FALSE;
5118             }
5119
5120             return WB_table[before][after]
5121                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5122
5123         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5124         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5125
5126             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5127              * */
5128
5129             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5130                                                             == WB_Numeric)
5131             {
5132                 return FALSE;
5133             }
5134
5135             return WB_table[before][after]
5136                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5137
5138         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5139         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5140
5141             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5142
5143             if (advance_one_WB(&after_pos, strend, utf8_target,
5144                                        TRUE /* Do skip Extend and Format */ )
5145                                                             == WB_Numeric)
5146             {
5147                 return FALSE;
5148             }
5149
5150             return WB_table[before][after]
5151                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5152
5153         case WB_RI_then_RI + WB_NOBREAK:
5154         case WB_RI_then_RI + WB_BREAKABLE:
5155             {
5156                 int RI_count = 1;
5157
5158                 /* Do not break within emoji flag sequences. That is, do not
5159                  * break between regional indicator (RI) symbols if there is an
5160                  * odd number of RI characters before the potential break
5161                  * point.
5162                  *
5163                  * WB15     ^ (RI RI)* RI × RI
5164                  * WB16 [^RI] (RI RI)* RI × RI */
5165
5166                 while (backup_one_WB(&previous,
5167                                      strbeg,
5168                                      &before_pos,
5169                                      utf8_target) == WB_Regional_Indicator)
5170                 {
5171                     RI_count++;
5172                 }
5173
5174                 return RI_count % 2 != 1;
5175             }
5176
5177         default:
5178             break;
5179     }
5180
5181 #ifdef DEBUGGING
5182     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5183                                   before, after, WB_table[before][after]);
5184     assert(0);
5185 #endif
5186     return TRUE;
5187 }
5188
5189 STATIC WB_enum
5190 S_advance_one_WB(pTHX_ U8 ** curpos,
5191                        const U8 * const strend,
5192                        const bool utf8_target,
5193                        const bool skip_Extend_Format)
5194 {
5195     WB_enum wb;
5196
5197     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5198
5199     if (*curpos >= strend) {
5200         return WB_EDGE;
5201     }
5202
5203     if (utf8_target) {
5204
5205         /* Advance over Extend and Format */
5206         do {
5207             *curpos += UTF8SKIP(*curpos);
5208             if (*curpos >= strend) {
5209                 return WB_EDGE;
5210             }
5211             wb = getWB_VAL_UTF8(*curpos, strend);
5212         } while (    skip_Extend_Format
5213                  && (wb == WB_Extend || wb == WB_Format));
5214     }
5215     else {
5216         do {
5217             (*curpos)++;
5218             if (*curpos >= strend) {
5219                 return WB_EDGE;
5220             }
5221             wb = getWB_VAL_CP(**curpos);
5222         } while (    skip_Extend_Format
5223                  && (wb == WB_Extend || wb == WB_Format));
5224     }
5225
5226     return wb;
5227 }
5228
5229 STATIC WB_enum
5230 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5231 {
5232     WB_enum wb;
5233
5234     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5235
5236     /* If we know what the previous character's break value is, don't have
5237         * to look it up */
5238     if (*previous != WB_UNKNOWN) {
5239         wb = *previous;
5240
5241         /* But we need to move backwards by one */
5242         if (utf8_target) {
5243             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5244             if (! *curpos) {
5245                 *previous = WB_EDGE;
5246                 *curpos = (U8 *) strbeg;
5247             }
5248             else {
5249                 *previous = WB_UNKNOWN;
5250             }
5251         }
5252         else {
5253             (*curpos)--;
5254             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5255         }
5256
5257         /* And we always back up over these three types */
5258         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5259             return wb;
5260         }
5261     }
5262
5263     if (*curpos < strbeg) {
5264         return WB_EDGE;
5265     }
5266
5267     if (utf8_target) {
5268         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5269         if (! prev_char_pos) {
5270             return WB_EDGE;
5271         }
5272
5273         /* Back up over Extend and Format.  curpos is always just to the right
5274          * of the characater whose value we are getting */
5275         do {
5276             U8 * prev_prev_char_pos;
5277             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5278                                                    -1,
5279                                                    strbeg)))
5280             {
5281                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5282                 *curpos = prev_char_pos;
5283                 prev_char_pos = prev_prev_char_pos;
5284             }
5285             else {
5286                 *curpos = (U8 *) strbeg;
5287                 return WB_EDGE;
5288             }
5289         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5290     }
5291     else {
5292         do {
5293             if (*curpos - 2 < strbeg) {
5294                 *curpos = (U8 *) strbeg;
5295                 return WB_EDGE;
5296             }
5297             (*curpos)--;
5298             wb = getWB_VAL_CP(*(*curpos - 1));
5299         } while (wb == WB_Extend || wb == WB_Format);
5300     }
5301
5302     return wb;
5303 }
5304
5305 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5306 (                                                           \
5307     (   ( st )                                         ) && \
5308     (   ( st )->u.eval.close_paren                     ) && \
5309     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5310 )
5311
5312 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5313 (                                                           \
5314     (   ( st )                                         ) && \
5315     (   ( st )->u.eval.close_paren                     ) && \
5316     (   ( expr )                                       ) && \
5317     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5318 )
5319
5320
5321 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5322     (st)->u.eval.close_paren = ( (expr) + 1 )
5323
5324 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5325     (st)->u.eval.close_paren = 0
5326
5327 /* returns -1 on failure, $+[0] on success */
5328 STATIC SSize_t
5329 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5330 {
5331
5332 #if PERL_VERSION < 9 && !defined(PERL_CORE)
5333     dMY_CXT;
5334 #endif
5335     dVAR;
5336     const bool utf8_target = reginfo->is_utf8_target;
5337     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5338     REGEXP *rex_sv = reginfo->prog;
5339     regexp *rex = ReANY(rex_sv);
5340     RXi_GET_DECL(rex,rexi);
5341     /* the current state. This is a cached copy of PL_regmatch_state */
5342     regmatch_state *st;
5343     /* cache heavy used fields of st in registers */
5344     regnode *scan;
5345     regnode *next;
5346     U32 n = 0;  /* general value; init to avoid compiler warning */
5347     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5348     char *locinput = startpos;
5349     char *pushinput; /* where to continue after a PUSH */
5350     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5351
5352     bool result = 0;        /* return value of S_regmatch */
5353     U32 depth = 0;            /* depth of backtrack stack */
5354     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5355     const U32 max_nochange_depth =
5356         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5357         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5358     regmatch_state *yes_state = NULL; /* state to pop to on success of
5359                                                             subpattern */
5360     /* mark_state piggy backs on the yes_state logic so that when we unwind
5361        the stack on success we can update the mark_state as we go */
5362     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5363     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5364     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5365     U32 state_num;
5366     bool no_final = 0;      /* prevent failure from backtracking? */
5367     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5368     char *startpoint = locinput;
5369     SV *popmark = NULL;     /* are we looking for a mark? */
5370     SV *sv_commit = NULL;   /* last mark name seen in failure */
5371     SV *sv_yes_mark = NULL; /* last mark name we have seen
5372                                during a successful match */
5373     U32 lastopen = 0;       /* last open we saw */
5374     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
5375     SV* const oreplsv = GvSVn(PL_replgv);
5376     /* these three flags are set by various ops to signal information to
5377      * the very next op. They have a useful lifetime of exactly one loop
5378      * iteration, and are not preserved or restored by state pushes/pops
5379      */
5380     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5381     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5382     int logical = 0;        /* the following EVAL is:
5383                                 0: (?{...})
5384                                 1: (?(?{...})X|Y)
5385                                 2: (??{...})
5386                                or the following IFMATCH/UNLESSM is:
5387                                 false: plain (?=foo)
5388                                 true:  used as a condition: (?(?=foo))
5389                             */
5390     PAD* last_pad = NULL;
5391     dMULTICALL;
5392     U8 gimme = G_SCALAR;
5393     CV *caller_cv = NULL;       /* who called us */
5394     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5395     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
5396     U32 maxopenparen = 0;       /* max '(' index seen so far */
5397     int to_complement;  /* Invert the result? */
5398     _char_class_number classnum;
5399     bool is_utf8_pat = reginfo->is_utf8_pat;
5400     bool match = FALSE;
5401
5402 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5403 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5404 #  define SOLARIS_BAD_OPTIMIZER
5405     const U32 *pl_charclass_dup = PL_charclass;
5406 #  define PL_charclass pl_charclass_dup
5407 #endif
5408
5409 #ifdef DEBUGGING
5410     GET_RE_DEBUG_FLAGS_DECL;
5411 #endif
5412
5413     /* protect against undef(*^R) */
5414     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5415
5416     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5417     multicall_oldcatch = 0;
5418     PERL_UNUSED_VAR(multicall_cop);
5419
5420     PERL_ARGS_ASSERT_REGMATCH;
5421
5422     st = PL_regmatch_state;
5423
5424     /* Note that nextchr is a byte even in UTF */
5425     SET_nextchr;
5426     scan = prog;
5427
5428     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5429             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5430             Perl_re_printf( aTHX_ "regmatch start\n" );
5431     }));
5432
5433     while (scan != NULL) {
5434
5435
5436         next = scan + NEXT_OFF(scan);
5437         if (next == scan)
5438             next = NULL;
5439         state_num = OP(scan);
5440
5441       reenter_switch:
5442         DEBUG_EXECUTE_r(
5443             if (state_num <= REGNODE_MAX) {
5444                 SV * const prop = sv_newmortal();
5445                 regnode *rnext = regnext(scan);
5446
5447                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5448                 regprop(rex, prop, scan, reginfo, NULL);
5449                 Perl_re_printf( aTHX_
5450                     "%*s%"IVdf":%s(%"IVdf")\n",
5451                     INDENT_CHARS(depth), "",
5452                     (IV)(scan - rexi->program),
5453                     SvPVX_const(prop),
5454                     (PL_regkind[OP(scan)] == END || !rnext) ?
5455                         0 : (IV)(rnext - rexi->program));
5456             }
5457         );
5458
5459         to_complement = 0;
5460
5461         SET_nextchr;
5462         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5463
5464         switch (state_num) {
5465         case SBOL: /*  /^../ and /\A../  */
5466             if (locinput == reginfo->strbeg)
5467                 break;
5468             sayNO;
5469
5470         case MBOL: /*  /^../m  */
5471             if (locinput == reginfo->strbeg ||
5472                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5473             {
5474                 break;
5475             }
5476             sayNO;
5477
5478         case GPOS: /*  \G  */
5479             if (locinput == reginfo->ganch)
5480                 break;
5481             sayNO;
5482
5483         case KEEPS: /*   \K  */
5484             /* update the startpoint */
5485             st->u.keeper.val = rex->offs[0].start;
5486             rex->offs[0].start = locinput - reginfo->strbeg;
5487             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5488             NOT_REACHED; /* NOTREACHED */
5489
5490         case KEEPS_next_fail:
5491             /* rollback the start point change */
5492             rex->offs[0].start = st->u.keeper.val;
5493             sayNO_SILENT;
5494             NOT_REACHED; /* NOTREACHED */
5495
5496         case MEOL: /* /..$/m  */
5497             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5498                 sayNO;
5499             break;
5500
5501         case SEOL: /* /..$/  */
5502             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5503                 sayNO;
5504             if (reginfo->strend - locinput > 1)
5505                 sayNO;
5506             break;
5507
5508         case EOS: /*  \z  */
5509             if (!NEXTCHR_IS_EOS)
5510                 sayNO;
5511             break;
5512
5513         case SANY: /*  /./s  */
5514             if (NEXTCHR_IS_EOS)
5515                 sayNO;
5516             goto increment_locinput;
5517
5518         case REG_ANY: /*  /./  */
5519             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5520                 sayNO;
5521             goto increment_locinput;
5522
5523
5524 #undef  ST
5525 #define ST st->u.trie
5526         case TRIEC: /* (ab|cd) with known charclass */
5527             /* In this case the charclass data is available inline so
5528                we can fail fast without a lot of extra overhead.
5529              */
5530             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5531                 DEBUG_EXECUTE_r(
5532                     Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5533                               depth, PL_colors[4], PL_colors[5])
5534                 );
5535                 sayNO_SILENT;
5536                 NOT_REACHED; /* NOTREACHED */
5537             }
5538             /* FALLTHROUGH */
5539         case TRIE:  /* (ab|cd)  */
5540             /* the basic plan of execution of the trie is:
5541              * At the beginning, run though all the states, and
5542              * find the longest-matching word. Also remember the position
5543              * of the shortest matching word. For example, this pattern:
5544              *    1  2 3 4    5
5545              *    ab|a|x|abcd|abc
5546              * when matched against the string "abcde", will generate
5547              * accept states for all words except 3, with the longest
5548              * matching word being 4, and the shortest being 2 (with
5549              * the position being after char 1 of the string).
5550              *
5551              * Then for each matching word, in word order (i.e. 1,2,4,5),
5552              * we run the remainder of the pattern; on each try setting
5553              * the current position to the character following the word,
5554              * returning to try the next word on failure.
5555              *
5556              * We avoid having to build a list of words at runtime by
5557              * using a compile-time structure, wordinfo[].prev, which
5558              * gives, for each word, the previous accepting word (if any).
5559              * In the case above it would contain the mappings 1->2, 2->0,
5560              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5561              * the longest word (4 above), a list of all words, by
5562              * following the list of prev pointers; this gives us the
5563              * unordered list 4,5,1,2. Then given the current word we have
5564              * just tried, we can go through the list and find the
5565              * next-biggest word to try (so if we just failed on word 2,
5566              * the next in the list is 4).
5567              *
5568              * Since at runtime we don't record the matching position in
5569              * the string for each word, we have to work that out for
5570              * each word we're about to process. The wordinfo table holds
5571              * the character length of each word; given that we recorded
5572              * at the start: the position of the shortest word and its
5573              * length in chars, we just need to move the pointer the
5574              * difference between the two char lengths. Depending on
5575              * Unicode status and folding, that's cheap or expensive.
5576              *
5577              * This algorithm is optimised for the case where are only a
5578              * small number of accept states, i.e. 0,1, or maybe 2.
5579              * With lots of accepts states, and having to try all of them,
5580              * it becomes quadratic on number of accept states to find all
5581              * the next words.
5582              */
5583
5584             {
5585                 /* what type of TRIE am I? (utf8 makes this contextual) */
5586                 DECL_TRIE_TYPE(scan);
5587
5588                 /* what trie are we using right now */
5589                 reg_trie_data * const trie
5590                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5591                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5592                 U32 state = trie->startstate;
5593
5594                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5595                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5596                     if (utf8_target
5597                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5598                         && scan->flags == EXACTL)
5599                     {
5600                         /* We only output for EXACTL, as we let the folder
5601                          * output this message for EXACTFLU8 to avoid
5602                          * duplication */
5603                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5604                                                                reginfo->strend);
5605                     }
5606                 }
5607                 if (   trie->bitmap
5608                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5609                 {
5610                     if (trie->states[ state ].wordnum) {
5611                          DEBUG_EXECUTE_r(
5612                             Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
5613                                           depth, PL_colors[4], PL_colors[5])
5614                         );
5615                         if (!trie->jump)
5616                             break;
5617                     } else {
5618                         DEBUG_EXECUTE_r(
5619                             Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5620                                           depth, PL_colors[4], PL_colors[5])
5621                         );
5622                         sayNO_SILENT;
5623                    }
5624                 }
5625
5626             {
5627                 U8 *uc = ( U8* )locinput;
5628
5629                 STRLEN len = 0;
5630                 STRLEN foldlen = 0;
5631                 U8 *uscan = (U8*)NULL;
5632                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5633                 U32 charcount = 0; /* how many input chars we have matched */
5634                 U32 accepted = 0; /* have we seen any accepting states? */
5635
5636                 ST.jump = trie->jump;
5637                 ST.me = scan;
5638                 ST.firstpos = NULL;
5639                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5640                 ST.nextword = 0;
5641
5642                 /* fully traverse the TRIE; note the position of the
5643                    shortest accept state and the wordnum of the longest
5644                    accept state */
5645
5646                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5647                     U32 base = trie->states[ state ].trans.base;
5648                     UV uvc = 0;
5649                     U16 charid = 0;
5650                     U16 wordnum;
5651                     wordnum = trie->states[ state ].wordnum;
5652
5653                     if (wordnum) { /* it's an accept state */
5654                         if (!accepted) {
5655                             accepted = 1;
5656                             /* record first match position */
5657                             if (ST.longfold) {
5658                                 ST.firstpos = (U8*)locinput;
5659                                 ST.firstchars = 0;
5660                             }
5661                             else {
5662                                 ST.firstpos = uc;
5663                                 ST.firstchars = charcount;
5664                             }
5665                         }
5666                         if (!ST.nextword || wordnum < ST.nextword)
5667                             ST.nextword = wordnum;
5668                         ST.topword = wordnum;
5669                     }
5670
5671                     DEBUG_TRIE_EXECUTE_r({
5672                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5673                                 /* HERE */
5674                                 PerlIO_printf( Perl_debug_log,
5675                                     "%*s%sState: %4"UVxf" Accepted: %c ",
5676                                     INDENT_CHARS(depth), "", PL_colors[4],
5677                                     (UV)state, (accepted ? 'Y' : 'N'));
5678                     });
5679
5680                     /* read a char and goto next state */
5681                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5682                         I32 offset;
5683                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5684                                              uscan, len, uvc, charid, foldlen,
5685                                              foldbuf, uniflags);
5686                         charcount++;
5687                         if (foldlen>0)
5688                             ST.longfold = TRUE;
5689                         if (charid &&
5690                              ( ((offset =
5691                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5692
5693                              && ((U32)offset < trie->lasttrans)
5694                              && trie->trans[offset].check == state)
5695                         {
5696                             state = trie->trans[offset].next;
5697                         }
5698                         else {
5699                             state = 0;
5700                         }
5701                         uc += len;
5702
5703                     }
5704                     else {
5705                         state = 0;
5706                     }
5707                     DEBUG_TRIE_EXECUTE_r(
5708                         Perl_re_printf( aTHX_
5709                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5710                             charid, uvc, (UV)state, PL_colors[5] );
5711                     );
5712                 }
5713                 if (!accepted)
5714                    sayNO;
5715
5716                 /* calculate total number of accept states */
5717                 {
5718                     U16 w = ST.topword;
5719                     accepted = 0;
5720                     while (w) {
5721                         w = trie->wordinfo[w].prev;
5722                         accepted++;
5723                     }
5724                     ST.accepted = accepted;
5725                 }
5726
5727                 DEBUG_EXECUTE_r(
5728                     Perl_re_exec_indentf( aTHX_  "%sgot %"IVdf" possible matches%s\n",
5729                         depth,
5730                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5731                 );
5732                 goto trie_first_try; /* jump into the fail handler */
5733             }}
5734             NOT_REACHED; /* NOTREACHED */
5735
5736         case TRIE_next_fail: /* we failed - try next alternative */
5737         {
5738             U8 *uc;
5739             if ( ST.jump ) {
5740                 REGCP_UNWIND(ST.cp);
5741                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5742             }
5743             if (!--ST.accepted) {
5744                 DEBUG_EXECUTE_r({
5745                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
5746                         depth,
5747                         PL_colors[4],
5748                         PL_colors[5] );
5749                 });
5750                 sayNO_SILENT;
5751             }
5752             {
5753                 /* Find next-highest word to process.  Note that this code
5754                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5755                 U16 min = 0;
5756                 U16 word;
5757                 U16 const nextword = ST.nextword;
5758                 reg_trie_wordinfo * const wordinfo
5759                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5760                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5761                     if (word > nextword && (!min || word < min))
5762                         min = word;
5763                 }
5764                 ST.nextword = min;
5765             }
5766
5767           trie_first_try:
5768             if (do_cutgroup) {
5769                 do_cutgroup = 0;
5770                 no_final = 0;
5771             }
5772
5773             if ( ST.jump ) {
5774                 ST.lastparen = rex->lastparen;
5775                 ST.lastcloseparen = rex->lastcloseparen;
5776                 REGCP_SET(ST.cp);
5777             }
5778
5779             /* find start char of end of current word */
5780             {
5781                 U32 chars; /* how many chars to skip */
5782                 reg_trie_data * const trie
5783                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5784
5785                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5786                             >=  ST.firstchars);
5787                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5788                             - ST.firstchars;
5789                 uc = ST.firstpos;
5790
5791                 if (ST.longfold) {
5792                     /* the hard option - fold each char in turn and find
5793                      * its folded length (which may be different */
5794                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5795                     STRLEN foldlen;
5796                     STRLEN len;
5797                     UV uvc;
5798                     U8 *uscan;
5799
5800                     while (chars) {
5801                         if (utf8_target) {
5802                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5803                                                     uniflags);
5804                             uc += len;
5805                         }
5806                         else {
5807                             uvc = *uc;
5808                             uc++;
5809                         }
5810                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5811                         uscan = foldbuf;
5812                         while (foldlen) {
5813                             if (!--chars)
5814                                 break;
5815                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5816                                             uniflags);
5817                             uscan += len;
5818                             foldlen -= len;
5819                         }
5820                     }
5821                 }
5822                 else {
5823                     if (utf8_target)
5824                         while (chars--)
5825                             uc += UTF8SKIP(uc);
5826                     else
5827                         uc += chars;
5828                 }
5829             }
5830
5831             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5832                             ? ST.jump[ST.nextword]
5833                             : NEXT_OFF(ST.me));
5834
5835             DEBUG_EXECUTE_r({
5836                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
5837                     depth,
5838                     PL_colors[4],
5839                     ST.nextword,
5840                     PL_colors[5]
5841                     );
5842             });
5843
5844             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
5845                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5846                 NOT_REACHED; /* NOTREACHED */
5847             }
5848             /* only one choice left - just continue */
5849             DEBUG_EXECUTE_r({
5850                 AV *const trie_words
5851                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5852                 SV ** const tmp = trie_words
5853                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5854                 SV *sv= tmp ? sv_newmortal() : NULL;
5855
5856                 Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
5857                     depth, PL_colors[4],
5858                     ST.nextword,
5859                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5860                             PL_colors[0], PL_colors[1],
5861                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5862                         )
5863                     : "not compiled under -Dr",
5864                     PL_colors[5] );
5865             });
5866
5867             locinput = (char*)uc;
5868             continue; /* execute rest of RE */
5869             /* NOTREACHED */
5870         }
5871 #undef  ST
5872
5873         case EXACTL:             /*  /abc/l       */
5874             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5875
5876             /* Complete checking would involve going through every character
5877              * matched by the string to see if any is above latin1.  But the
5878              * comparision otherwise might very well be a fast assembly
5879              * language routine, and I (khw) don't think slowing things down
5880              * just to check for this warning is worth it.  So this just checks
5881              * the first character */
5882             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5883                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5884             }
5885             /* FALLTHROUGH */
5886         case EXACT: {            /*  /abc/        */
5887             char *s = STRING(scan);
5888             ln = STR_LEN(scan);
5889             if (utf8_target != is_utf8_pat) {
5890                 /* The target and the pattern have differing utf8ness. */
5891                 char *l = locinput;
5892                 const char * const e = s + ln;
5893
5894                 if (utf8_target) {
5895                     /* The target is utf8, the pattern is not utf8.
5896                      * Above-Latin1 code points can't match the pattern;
5897                      * invariants match exactly, and the other Latin1 ones need
5898                      * to be downgraded to a single byte in order to do the
5899                      * comparison.  (If we could be confident that the target
5900                      * is not malformed, this could be refactored to have fewer
5901                      * tests by just assuming that if the first bytes match, it
5902                      * is an invariant, but there are tests in the test suite
5903                      * dealing with (??{...}) which violate this) */
5904                     while (s < e) {
5905                         if (l >= reginfo->strend
5906                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5907                         {
5908                             sayNO;
5909                         }
5910                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
5911                             if (*l != *s) {
5912                                 sayNO;
5913                             }
5914                             l++;
5915                         }
5916                         else {
5917                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5918                             {
5919                                 sayNO;
5920                             }
5921                             l += 2;
5922                         }
5923                         s++;
5924                     }
5925                 }
5926                 else {
5927                     /* The target is not utf8, the pattern is utf8. */
5928                     while (s < e) {
5929                         if (l >= reginfo->strend
5930                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5931                         {
5932                             sayNO;
5933                         }
5934                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
5935                             if (*s != *l) {
5936                                 sayNO;
5937                             }
5938                             s++;
5939                         }
5940                         else {
5941                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5942                             {
5943                                 sayNO;
5944                             }
5945                             s += 2;
5946                         }
5947                         l++;
5948                     }
5949                 }
5950                 locinput = l;
5951             }
5952             else {
5953                 /* The target and the pattern have the same utf8ness. */
5954                 /* Inline the first character, for speed. */
5955                 if (reginfo->strend - locinput < ln
5956                     || UCHARAT(s) != nextchr
5957                     || (ln > 1 && memNE(s, locinput, ln)))
5958                 {
5959                     sayNO;
5960                 }
5961                 locinput += ln;
5962             }
5963             break;
5964             }
5965
5966         case EXACTFL: {          /*  /abc/il      */
5967             re_fold_t folder;
5968             const U8 * fold_array;
5969             const char * s;
5970             U32 fold_utf8_flags;
5971
5972             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5973             folder = foldEQ_locale;
5974             fold_array = PL_fold_locale;
5975             fold_utf8_flags = FOLDEQ_LOCALE;
5976             goto do_exactf;
5977
5978         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5979                                       is effectively /u; hence to match, target
5980                                       must be UTF-8. */
5981             if (! utf8_target) {
5982                 sayNO;
5983             }
5984             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5985                                              | FOLDEQ_S1_FOLDS_SANE;
5986             folder = foldEQ_latin1;
5987             fold_array = PL_fold_latin1;
5988             goto do_exactf;
5989
5990         case EXACTFU_SS:         /*  /\x{df}/iu   */
5991         case EXACTFU:            /*  /abc/iu      */
5992             folder = foldEQ_latin1;
5993             fold_array = PL_fold_latin1;
5994             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5995             goto do_exactf;
5996
5997         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
5998                                    patterns */
5999             assert(! is_utf8_pat);
6000             /* FALLTHROUGH */
6001         case EXACTFA:            /*  /abc/iaa     */
6002             folder = foldEQ_latin1;
6003             fold_array = PL_fold_latin1;
6004             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6005             goto do_exactf;
6006
6007         case EXACTF:             /*  /abc/i    This node only generated for
6008                                                non-utf8 patterns */
6009             assert(! is_utf8_pat);
6010             folder = foldEQ;
6011             fold_array = PL_fold;
6012             fold_utf8_flags = 0;
6013
6014           do_exactf:
6015             s = STRING(scan);
6016             ln = STR_LEN(scan);
6017
6018             if (utf8_target
6019                 || is_utf8_pat
6020                 || state_num == EXACTFU_SS
6021                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6022             {
6023               /* Either target or the pattern are utf8, or has the issue where
6024                * the fold lengths may differ. */
6025                 const char * const l = locinput;
6026                 char *e = reginfo->strend;
6027
6028                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6029                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6030                 {
6031                     sayNO;
6032                 }
6033                 locinput = e;
6034                 break;
6035             }
6036
6037             /* Neither the target nor the pattern are utf8 */
6038             if (UCHARAT(s) != nextchr
6039                 && !NEXTCHR_IS_EOS
6040                 && UCHARAT(s) != fold_array[nextchr])
6041             {
6042                 sayNO;
6043             }
6044             if (reginfo->strend - locinput < ln)
6045                 sayNO;
6046             if (ln > 1 && ! folder(s, locinput, ln))
6047                 sayNO;
6048             locinput += ln;
6049             break;
6050         }
6051
6052         case NBOUNDL: /*  /\B/l  */
6053             to_complement = 1;
6054             /* FALLTHROUGH */
6055
6056         case BOUNDL:  /*  /\b/l  */
6057         {
6058             bool b1, b2;
6059             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6060
6061             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6062                 if (! IN_UTF8_CTYPE_LOCALE) {
6063                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6064                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6065                 }
6066                 goto boundu;
6067             }
6068
6069             if (utf8_target) {
6070                 if (locinput == reginfo->strbeg)
6071                     b1 = isWORDCHAR_LC('\n');
6072                 else {
6073                     b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
6074                                                         (U8*)(reginfo->strbeg)));
6075                 }
6076                 b2 = (NEXTCHR_IS_EOS)
6077                     ? isWORDCHAR_LC('\n')
6078                     : isWORDCHAR_LC_utf8((U8*)locinput);
6079             }
6080             else { /* Here the string isn't utf8 */
6081                 b1 = (locinput == reginfo->strbeg)
6082                      ? isWORDCHAR_LC('\n')
6083                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6084                 b2 = (NEXTCHR_IS_EOS)
6085                     ? isWORDCHAR_LC('\n')
6086                     : isWORDCHAR_LC(nextchr);
6087             }
6088             if (to_complement ^ (b1 == b2)) {
6089                 sayNO;
6090             }
6091             break;
6092         }
6093
6094         case NBOUND:  /*  /\B/   */
6095             to_complement = 1;
6096             /* FALLTHROUGH */
6097
6098         case BOUND:   /*  /\b/   */
6099             if (utf8_target) {
6100                 goto bound_utf8;
6101             }
6102             goto bound_ascii_match_only;
6103
6104         case NBOUNDA: /*  /\B/a  */
6105             to_complement = 1;
6106             /* FALLTHROUGH */
6107
6108         case BOUNDA:  /*  /\b/a  */
6109         {
6110             bool b1, b2;
6111
6112           bound_ascii_match_only:
6113             /* Here the string isn't utf8, or is utf8 and only ascii characters
6114              * are to match \w.  In the latter case looking at the byte just
6115              * prior to the current one may be just the final byte of a
6116              * multi-byte character.  This is ok.  There are two cases:
6117              * 1) it is a single byte character, and then the test is doing
6118              *    just what it's supposed to.
6119              * 2) it is a multi-byte character, in which case the final byte is
6120              *    never mistakable for ASCII, and so the test will say it is
6121              *    not a word character, which is the correct answer. */
6122             b1 = (locinput == reginfo->strbeg)
6123                  ? isWORDCHAR_A('\n')
6124                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6125             b2 = (NEXTCHR_IS_EOS)
6126                 ? isWORDCHAR_A('\n')
6127                 : isWORDCHAR_A(nextchr);
6128             if (to_complement ^ (b1 == b2)) {
6129                 sayNO;
6130             }
6131             break;
6132         }
6133
6134         case NBOUNDU: /*  /\B/u  */
6135             to_complement = 1;
6136             /* FALLTHROUGH */
6137
6138         case BOUNDU:  /*  /\b/u  */
6139
6140           boundu:
6141             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6142                 match = FALSE;
6143             }
6144             else if (utf8_target) {
6145               bound_utf8:
6146                 switch((bound_type) FLAGS(scan)) {
6147                     case TRADITIONAL_BOUND:
6148                     {
6149                         bool b1, b2;
6150                         b1 = (locinput == reginfo->strbeg)
6151                              ? 0 /* isWORDCHAR_L1('\n') */
6152                              : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
6153                                                        (U8*)(reginfo->strbeg)));
6154                         b2 = (NEXTCHR_IS_EOS)
6155                             ? 0 /* isWORDCHAR_L1('\n') */
6156                             : isWORDCHAR_utf8((U8*)locinput);
6157                         match = cBOOL(b1 != b2);
6158                         break;
6159                     }
6160                     case GCB_BOUND:
6161                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6162                             match = TRUE; /* GCB always matches at begin and
6163                                              end */
6164                         }
6165                         else {
6166                             /* Find the gcb values of previous and current
6167                              * chars, then see if is a break point */
6168                             match = isGCB(getGCB_VAL_UTF8(
6169                                                 reghop3((U8*)locinput,
6170                                                         -1,
6171                                                         (U8*)(reginfo->strbeg)),
6172                                                 (U8*) reginfo->strend),
6173                                           getGCB_VAL_UTF8((U8*) locinput,
6174                                                         (U8*) reginfo->strend),
6175                                           (U8*) reginfo->strbeg,
6176                                           (U8*) locinput,
6177                                           utf8_target);
6178                         }
6179                         break;
6180
6181                     case LB_BOUND:
6182                         if (locinput == reginfo->strbeg) {
6183                             match = FALSE;
6184                         }
6185                         else if (NEXTCHR_IS_EOS) {
6186                             match = TRUE;
6187                         }
6188                         else {
6189                             match = isLB(getLB_VAL_UTF8(
6190                                                 reghop3((U8*)locinput,
6191                                                         -1,
6192                                                         (U8*)(reginfo->strbeg)),
6193                                                 (U8*) reginfo->strend),
6194                                           getLB_VAL_UTF8((U8*) locinput,
6195                                                         (U8*) reginfo->strend),
6196                                           (U8*) reginfo->strbeg,
6197                                           (U8*) locinput,
6198                                           (U8*) reginfo->strend,
6199                                           utf8_target);
6200                         }
6201                         break;
6202
6203                     case SB_BOUND: /* Always matches at begin and end */
6204                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6205                             match = TRUE;
6206                         }
6207                         else {
6208                             match = isSB(getSB_VAL_UTF8(
6209                                                 reghop3((U8*)locinput,
6210                                                         -1,
6211                                                         (U8*)(reginfo->strbeg)),
6212                                                 (U8*) reginfo->strend),
6213                                           getSB_VAL_UTF8((U8*) locinput,
6214                                                         (U8*) reginfo->strend),
6215                                           (U8*) reginfo->strbeg,
6216                                           (U8*) locinput,
6217                                           (U8*) reginfo->strend,
6218                                           utf8_target);
6219                         }
6220                         break;
6221
6222                     case WB_BOUND:
6223                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6224                             match = TRUE;
6225                         }
6226                         else {
6227                             match = isWB(WB_UNKNOWN,
6228                                          getWB_VAL_UTF8(
6229                                                 reghop3((U8*)locinput,
6230                                                         -1,
6231                                                         (U8*)(reginfo->strbeg)),
6232                                                 (U8*) reginfo->strend),
6233                                           getWB_VAL_UTF8((U8*) locinput,
6234                                                         (U8*) reginfo->strend),
6235                                           (U8*) reginfo->strbeg,
6236                                           (U8*) locinput,
6237                                           (U8*) reginfo->strend,
6238                                           utf8_target);
6239                         }
6240                         break;
6241                 }
6242             }
6243             else {  /* Not utf8 target */
6244                 switch((bound_type) FLAGS(scan)) {
6245                     case TRADITIONAL_BOUND:
6246                     {
6247                         bool b1, b2;
6248                         b1 = (locinput == reginfo->strbeg)
6249                             ? 0 /* isWORDCHAR_L1('\n') */
6250                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6251                         b2 = (NEXTCHR_IS_EOS)
6252                             ? 0 /* isWORDCHAR_L1('\n') */
6253                             : isWORDCHAR_L1(nextchr);
6254                         match = cBOOL(b1 != b2);
6255                         break;
6256                     }
6257
6258                     case GCB_BOUND:
6259                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6260                             match = TRUE; /* GCB always matches at begin and
6261                                              end */
6262                         }
6263                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6264                                    range */
6265                             match =    UCHARAT(locinput - 1) != '\r'
6266                                     || UCHARAT(locinput) != '\n';
6267                         }
6268                         break;
6269
6270                     case LB_BOUND:
6271                         if (locinput == reginfo->strbeg) {
6272                             match = FALSE;
6273                         }
6274                         else if (NEXTCHR_IS_EOS) {
6275                             match = TRUE;
6276                         }
6277                         else {
6278                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6279                                          getLB_VAL_CP(UCHARAT(locinput)),
6280                                          (U8*) reginfo->strbeg,
6281                                          (U8*) locinput,
6282                                          (U8*) reginfo->strend,
6283                                          utf8_target);
6284                         }
6285                         break;
6286
6287                     case SB_BOUND: /* Always matches at begin and end */
6288                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6289                             match = TRUE;
6290                         }
6291                         else {
6292                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6293                                          getSB_VAL_CP(UCHARAT(locinput)),
6294                                          (U8*) reginfo->strbeg,
6295                                          (U8*) locinput,
6296                                          (U8*) reginfo->strend,
6297                                          utf8_target);
6298                         }
6299                         break;
6300
6301                     case WB_BOUND:
6302                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6303                             match = TRUE;
6304                         }
6305                         else {
6306                             match = isWB(WB_UNKNOWN,
6307                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6308                                          getWB_VAL_CP(UCHARAT(locinput)),
6309                                          (U8*) reginfo->strbeg,
6310                                          (U8*) locinput,
6311                                          (U8*) reginfo->strend,
6312                                          utf8_target);
6313                         }
6314                         break;
6315                 }
6316             }
6317
6318             if (to_complement ^ ! match) {
6319                 sayNO;
6320             }
6321             break;
6322
6323         case ANYOFL:  /*  /[abc]/l      */
6324             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6325
6326             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6327             {
6328               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6329             }
6330             /* FALLTHROUGH */
6331         case ANYOFD:  /*   /[abc]/d       */
6332         case ANYOF:  /*   /[abc]/       */
6333             if (NEXTCHR_IS_EOS)
6334                 sayNO;
6335             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6336                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6337                                                                    utf8_target))
6338                     sayNO;
6339                 locinput += UTF8SKIP(locinput);
6340             }
6341             else {
6342                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6343                     sayNO;
6344                 locinput++;
6345             }
6346             break;
6347
6348         /* The argument (FLAGS) to all the POSIX node types is the class number
6349          * */
6350
6351         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6352             to_complement = 1;
6353             /* FALLTHROUGH */
6354
6355         case POSIXL:    /* \w or [:punct:] etc. under /l */
6356             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6357             if (NEXTCHR_IS_EOS)
6358                 sayNO;
6359
6360             /* Use isFOO_lc() for characters within Latin1.  (Note that
6361              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6362              * wouldn't be invariant) */
6363             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6364                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6365                     sayNO;
6366                 }
6367
6368                 locinput++;
6369                 break;
6370             }
6371
6372             if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
6373                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6374                 goto utf8_posix_above_latin1;
6375             }
6376
6377             /* Here is a UTF-8 variant code point below 256 and the target is
6378              * UTF-8 */
6379             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6380                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6381                                             *(locinput + 1))))))
6382             {
6383                 sayNO;
6384             }
6385
6386             goto increment_locinput;
6387
6388         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6389             to_complement = 1;
6390             /* FALLTHROUGH */
6391
6392         case POSIXD:    /* \w or [:punct:] etc. under /d */
6393             if (utf8_target) {
6394                 goto utf8_posix;
6395             }
6396             goto posixa;
6397
6398         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6399
6400             if (NEXTCHR_IS_EOS) {
6401                 sayNO;
6402             }
6403
6404             /* All UTF-8 variants match */
6405             if (! UTF8_IS_INVARIANT(nextchr)) {
6406                 goto increment_locinput;
6407             }
6408
6409             to_complement = 1;
6410             goto join_nposixa;
6411
6412         case POSIXA:    /* \w or [:punct:] etc. under /a */
6413
6414           posixa:
6415             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6416              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6417              * character is a single byte */
6418
6419             if (NEXTCHR_IS_EOS) {
6420                 sayNO;
6421             }
6422
6423           join_nposixa:
6424
6425             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6426                                                                 FLAGS(scan)))))
6427             {
6428                 sayNO;
6429             }
6430
6431             /* Here we are either not in utf8, or we matched a utf8-invariant,
6432              * so the next char is the next byte */
6433             locinput++;
6434             break;
6435
6436         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6437             to_complement = 1;
6438             /* FALLTHROUGH */
6439
6440         case POSIXU:    /* \w or [:punct:] etc. under /u */
6441           utf8_posix:
6442             if (NEXTCHR_IS_EOS) {
6443                 sayNO;
6444             }
6445
6446             /* Use _generic_isCC() for characters within Latin1.  (Note that
6447              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6448              * wouldn't be invariant) */
6449             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6450                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6451                                                            FLAGS(scan)))))
6452                 {
6453                     sayNO;
6454                 }
6455                 locinput++;
6456             }
6457             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
6458                 if (! (to_complement
6459                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6460                                                                *(locinput + 1)),
6461                                              FLAGS(scan)))))
6462                 {
6463                     sayNO;
6464                 }
6465                 locinput += 2;
6466             }
6467             else {  /* Handle above Latin-1 code points */
6468               utf8_posix_above_latin1:
6469                 classnum = (_char_class_number) FLAGS(scan);
6470                 if (classnum < _FIRST_NON_SWASH_CC) {
6471
6472                     /* Here, uses a swash to find such code points.  Load if if
6473                      * not done already */
6474                     if (! PL_utf8_swash_ptrs[classnum]) {
6475                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6476                         PL_utf8_swash_ptrs[classnum]
6477                                 = _core_swash_init("utf8",
6478                                         "",
6479                                         &PL_sv_undef, 1, 0,
6480                                         PL_XPosix_ptrs[classnum], &flags);
6481                     }
6482                     if (! (to_complement
6483                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6484                                                (U8 *) locinput, TRUE))))
6485                     {
6486                         sayNO;
6487                     }
6488                 }
6489                 else {  /* Here, uses macros to find above Latin-1 code points */
6490                     switch (classnum) {
6491                         case _CC_ENUM_SPACE:
6492                             if (! (to_complement
6493                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
6494                             {
6495                                 sayNO;
6496                             }
6497                             break;
6498                         case _CC_ENUM_BLANK:
6499                             if (! (to_complement
6500                                             ^ cBOOL(is_HORIZWS_high(locinput))))
6501                             {
6502                                 sayNO;
6503                             }
6504                             break;
6505                         case _CC_ENUM_XDIGIT:
6506                             if (! (to_complement
6507                                             ^ cBOOL(is_XDIGIT_high(locinput))))
6508                             {
6509                                 sayNO;
6510                             }
6511                             break;
6512                         case _CC_ENUM_VERTSPACE:
6513                             if (! (to_complement
6514                                             ^ cBOOL(is_VERTWS_high(locinput))))
6515                             {
6516                                 sayNO;
6517                             }
6518                             break;
6519                         default:    /* The rest, e.g. [:cntrl:], can't match
6520                                        above Latin1 */
6521                             if (! to_complement) {
6522                                 sayNO;
6523                             }
6524                             break;
6525                     }
6526                 }
6527                 locinput += UTF8SKIP(locinput);
6528             }
6529             break;
6530
6531         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6532                        a Unicode extended Grapheme Cluster */
6533             if (NEXTCHR_IS_EOS)
6534                 sayNO;
6535             if  (! utf8_target) {
6536
6537                 /* Match either CR LF  or '.', as all the other possibilities
6538                  * require utf8 */
6539                 locinput++;         /* Match the . or CR */
6540                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6541                                        match the LF */
6542                     && locinput < reginfo->strend
6543                     && UCHARAT(locinput) == '\n')
6544                 {
6545                     locinput++;
6546                 }
6547             }
6548             else {
6549
6550                 /* Get the gcb type for the current character */
6551                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6552                                                        (U8*) reginfo->strend);
6553
6554                 /* Then scan through the input until we get to the first
6555                  * character whose type is supposed to be a gcb with the
6556                  * current character.  (There is always a break at the
6557                  * end-of-input) */
6558                 locinput += UTF8SKIP(locinput);
6559                 while (locinput < reginfo->strend) {
6560                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6561                                                          (U8*) reginfo->strend);
6562                     if (isGCB(prev_gcb, cur_gcb,
6563                               (U8*) reginfo->strbeg, (U8*) locinput,
6564                               utf8_target))
6565                     {
6566                         break;
6567                     }
6568
6569                     prev_gcb = cur_gcb;
6570                     locinput += UTF8SKIP(locinput);
6571                 }
6572
6573
6574             }
6575             break;
6576
6577         case NREFFL:  /*  /\g{name}/il  */
6578         {   /* The capture buffer cases.  The ones beginning with N for the
6579                named buffers just convert to the equivalent numbered and
6580                pretend they were called as the corresponding numbered buffer
6581                op.  */
6582             /* don't initialize these in the declaration, it makes C++
6583                unhappy */
6584             const char *s;
6585             char type;
6586             re_fold_t folder;
6587             const U8 *fold_array;
6588             UV utf8_fold_flags;
6589
6590             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6591             folder = foldEQ_locale;
6592             fold_array = PL_fold_locale;
6593             type = REFFL;
6594             utf8_fold_flags = FOLDEQ_LOCALE;
6595             goto do_nref;
6596
6597         case NREFFA:  /*  /\g{name}/iaa  */
6598             folder = foldEQ_latin1;
6599             fold_array = PL_fold_latin1;
6600             type = REFFA;
6601             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6602             goto do_nref;
6603
6604         case NREFFU:  /*  /\g{name}/iu  */
6605             folder = foldEQ_latin1;
6606             fold_array = PL_fold_latin1;
6607             type = REFFU;
6608             utf8_fold_flags = 0;
6609             goto do_nref;
6610
6611         case NREFF:  /*  /\g{name}/i  */
6612             folder = foldEQ;
6613             fold_array = PL_fold;
6614             type = REFF;
6615             utf8_fold_flags = 0;
6616             goto do_nref;
6617
6618         case NREF:  /*  /\g{name}/   */
6619             type = REF;
6620             folder = NULL;
6621             fold_array = NULL;
6622             utf8_fold_flags = 0;
6623           do_nref:
6624
6625             /* For the named back references, find the corresponding buffer
6626              * number */
6627             n = reg_check_named_buff_matched(rex,scan);
6628
6629             if ( ! n ) {
6630                 sayNO;
6631             }
6632             goto do_nref_ref_common;
6633
6634         case REFFL:  /*  /\1/il  */
6635             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6636             folder = foldEQ_locale;
6637             fold_array = PL_fold_locale;
6638             utf8_fold_flags = FOLDEQ_LOCALE;
6639             goto do_ref;
6640
6641         case REFFA:  /*  /\1/iaa  */
6642             folder = foldEQ_latin1;
6643             fold_array = PL_fold_latin1;
6644             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6645             goto do_ref;
6646
6647         case REFFU:  /*  /\1/iu  */
6648             folder = foldEQ_latin1;
6649             fold_array = PL_fold_latin1;
6650             utf8_fold_flags = 0;
6651             goto do_ref;
6652
6653         case REFF:  /*  /\1/i  */
6654             folder = foldEQ;
6655             fold_array = PL_fold;
6656             utf8_fold_flags = 0;
6657             goto do_ref;
6658
6659         case REF:  /*  /\1/    */
6660             folder = NULL;
6661             fold_array = NULL;
6662             utf8_fold_flags = 0;
6663
6664           do_ref:
6665             type = OP(scan);
6666             n = ARG(scan);  /* which paren pair */
6667
6668           do_nref_ref_common:
6669             ln = rex->offs[n].start;
6670             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6671             if (rex->lastparen < n || ln == -1)
6672                 sayNO;                  /* Do not match unless seen CLOSEn. */
6673             if (ln == rex->offs[n].end)
6674                 break;
6675
6676             s = reginfo->strbeg + ln;
6677             if (type != REF     /* REF can do byte comparison */
6678                 && (utf8_target || type == REFFU || type == REFFL))
6679             {
6680                 char * limit = reginfo->strend;
6681
6682                 /* This call case insensitively compares the entire buffer
6683                     * at s, with the current input starting at locinput, but
6684                     * not going off the end given by reginfo->strend, and
6685                     * returns in <limit> upon success, how much of the
6686                     * current input was matched */
6687                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6688                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6689                 {
6690                     sayNO;
6691                 }
6692                 locinput = limit;
6693                 break;
6694             }
6695
6696             /* Not utf8:  Inline the first character, for speed. */
6697             if (!NEXTCHR_IS_EOS &&
6698                 UCHARAT(s) != nextchr &&
6699                 (type == REF ||
6700                  UCHARAT(s) != fold_array[nextchr]))
6701                 sayNO;
6702             ln = rex->offs[n].end - ln;
6703             if (locinput + ln > reginfo->strend)
6704                 sayNO;
6705             if (ln > 1 && (type == REF
6706                            ? memNE(s, locinput, ln)
6707                            : ! folder(s, locinput, ln)))
6708                 sayNO;
6709             locinput += ln;
6710             break;
6711         }
6712
6713         case NOTHING: /* null op; e.g. the 'nothing' following
6714                        * the '*' in m{(a+|b)*}' */
6715             break;
6716         case TAIL: /* placeholder while compiling (A|B|C) */
6717             break;
6718
6719 #undef  ST
6720 #define ST st->u.eval
6721 #define CUR_EVAL cur_eval->u.eval
6722
6723         {
6724             SV *ret;
6725             REGEXP *re_sv;
6726             regexp *re;
6727             regexp_internal *rei;
6728             regnode *startpoint;
6729             U32 arg;
6730
6731         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6732             arg= (U32)ARG(scan);
6733             if (cur_eval && cur_eval->locinput == locinput) {
6734                 if ( ++nochange_depth > max_nochange_depth )
6735                     Perl_croak(aTHX_
6736                         "Pattern subroutine nesting without pos change"
6737                         " exceeded limit in regex");
6738             } else {
6739                 nochange_depth = 0;
6740             }
6741             re_sv = rex_sv;
6742             re = rex;
6743             rei = rexi;
6744             startpoint = scan + ARG2L(scan);
6745             EVAL_CLOSE_PAREN_SET( st, arg );
6746             /* Detect infinite recursion
6747              *
6748              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6749              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6750              * So we track the position in the string we are at each time
6751              * we recurse and if we try to enter the same routine twice from
6752              * the same position we throw an error.
6753              */
6754             if ( rex->recurse_locinput[arg] == locinput ) {
6755                 /* FIXME: we should show the regop that is failing as part
6756                  * of the error message. */
6757                 Perl_croak(aTHX_ "Infinite recursion in regex");
6758             } else {
6759                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6760                 rex->recurse_locinput[arg]= locinput;
6761
6762                 DEBUG_r({
6763                     GET_RE_DEBUG_FLAGS_DECL;
6764                     DEBUG_STACK_r({
6765                         Perl_re_exec_indentf( aTHX_
6766                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6767                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6768                         );
6769                     });
6770                 });
6771             }
6772
6773             /* Save all the positions seen so far. */
6774             ST.cp = regcppush(rex, 0, maxopenparen);
6775             REGCP_SET(ST.lastcp);
6776
6777             /* and then jump to the code we share with EVAL */
6778             goto eval_recurse_doit;
6779             /* NOTREACHED */
6780
6781         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
6782             if (cur_eval && cur_eval->locinput==locinput) {
6783                 if ( ++nochange_depth > max_nochange_depth )
6784                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6785             } else {
6786                 nochange_depth = 0;
6787             }
6788             {
6789                 /* execute the code in the {...} */
6790
6791                 dSP;
6792                 IV before;
6793                 OP * const oop = PL_op;
6794                 COP * const ocurcop = PL_curcop;
6795                 OP *nop;
6796                 CV *newcv;
6797
6798                 /* save *all* paren positions */
6799                 regcppush(rex, 0, maxopenparen);
6800                 REGCP_SET(runops_cp);
6801
6802                 if (!caller_cv)
6803                     caller_cv = find_runcv(NULL);
6804
6805                 n = ARG(scan);
6806
6807                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6808                     newcv = (ReANY(
6809                                     (REGEXP*)(rexi->data->data[n])
6810                             ))->qr_anoncv;
6811                     nop = (OP*)rexi->data->data[n+1];
6812                 }
6813                 else if (rexi->data->what[n] == 'l') { /* literal code */
6814                     newcv = caller_cv;
6815                     nop = (OP*)rexi->data->data[n];
6816                     assert(CvDEPTH(newcv));
6817                 }
6818                 else {
6819                     /* literal with own CV */
6820                     assert(rexi->data->what[n] == 'L');
6821                     newcv = rex->qr_anoncv;
6822                     nop = (OP*)rexi->data->data[n];
6823                 }
6824
6825                 /* normally if we're about to execute code from the same
6826                  * CV that we used previously, we just use the existing
6827                  * CX stack entry. However, its possible that in the
6828                  * meantime we may have backtracked, popped from the save
6829                  * stack, and undone the SAVECOMPPAD(s) associated with
6830                  * PUSH_MULTICALL; in which case PL_comppad no longer
6831                  * points to newcv's pad. */
6832                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6833                 {
6834                     U8 flags = (CXp_SUB_RE |
6835                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6836                     if (last_pushed_cv) {
6837                         /* PUSH/POP_MULTICALL save and restore the
6838                          * caller's PL_comppad; if we call multiple subs
6839                          * using the same CX block, we have to save and
6840                          * unwind the varying PL_comppad's ourselves,
6841                          * especially restoring the right PL_comppad on
6842                          * backtrack - so save it on the save stack */
6843                         SAVECOMPPAD();
6844                         CHANGE_MULTICALL_FLAGS(newcv, flags);
6845                     }
6846                     else {
6847                         PUSH_MULTICALL_FLAGS(newcv, flags);
6848                     }
6849                     last_pushed_cv = newcv;
6850                 }
6851                 else {
6852                     /* these assignments are just to silence compiler
6853                      * warnings */
6854                     multicall_cop = NULL;
6855                 }
6856                 last_pad = PL_comppad;
6857
6858                 /* the initial nextstate you would normally execute
6859                  * at the start of an eval (which would cause error
6860                  * messages to come from the eval), may be optimised
6861                  * away from the execution path in the regex code blocks;
6862                  * so manually set PL_curcop to it initially */
6863                 {
6864                     OP *o = cUNOPx(nop)->op_first;
6865                     assert(o->op_type == OP_NULL);
6866                     if (o->op_targ == OP_SCOPE) {
6867                         o = cUNOPo->op_first;
6868                     }
6869                     else {
6870                         assert(o->op_targ == OP_LEAVE);
6871                         o = cUNOPo->op_first;
6872                         assert(o->op_type == OP_ENTER);
6873                         o = OpSIBLING(o);
6874                     }
6875
6876                     if (o->op_type != OP_STUB) {
6877                         assert(    o->op_type == OP_NEXTSTATE
6878                                 || o->op_type == OP_DBSTATE
6879                                 || (o->op_type == OP_NULL
6880                                     &&  (  o->op_targ == OP_NEXTSTATE
6881                                         || o->op_targ == OP_DBSTATE
6882                                         )
6883                                     )
6884                         );
6885                         PL_curcop = (COP*)o;
6886                     }
6887                 }
6888                 nop = nop->op_next;
6889
6890                 DEBUG_STATE_r( Perl_re_printf( aTHX_
6891                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6892
6893                 rex->offs[0].end = locinput - reginfo->strbeg;
6894                 if (reginfo->info_aux_eval->pos_magic)
6895                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6896                                   reginfo->sv, reginfo->strbeg,
6897                                   locinput - reginfo->strbeg);
6898
6899                 if (sv_yes_mark) {
6900                     SV *sv_mrk = get_sv("REGMARK", 1);
6901                     sv_setsv(sv_mrk, sv_yes_mark);
6902                 }
6903
6904                 /* we don't use MULTICALL here as we want to call the
6905                  * first op of the block of interest, rather than the
6906                  * first op of the sub. Also, we don't want to free
6907                  * the savestack frame */
6908                 before = (IV)(SP-PL_stack_base);
6909                 PL_op = nop;
6910                 CALLRUNOPS(aTHX);                       /* Scalar context. */
6911                 SPAGAIN;
6912                 if ((IV)(SP-PL_stack_base) == before)
6913                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6914                 else {
6915                     ret = POPs;
6916                     PUTBACK;
6917                 }
6918
6919                 /* before restoring everything, evaluate the returned
6920                  * value, so that 'uninit' warnings don't use the wrong
6921                  * PL_op or pad. Also need to process any magic vars
6922                  * (e.g. $1) *before* parentheses are restored */
6923
6924                 PL_op = NULL;
6925
6926                 re_sv = NULL;
6927                 if (logical == 0)        /*   (?{})/   */
6928                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6929                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6930                     sw = cBOOL(SvTRUE(ret));
6931                     logical = 0;
6932                 }
6933                 else {                   /*  /(??{})  */
6934                     /*  if its overloaded, let the regex compiler handle
6935                      *  it; otherwise extract regex, or stringify  */
6936                     if (SvGMAGICAL(ret))
6937                         ret = sv_mortalcopy(ret);
6938                     if (!SvAMAGIC(ret)) {
6939                         SV *sv = ret;
6940                         if (SvROK(sv))
6941                             sv = SvRV(sv);
6942                         if (SvTYPE(sv) == SVt_REGEXP)
6943                             re_sv = (REGEXP*) sv;
6944                         else if (SvSMAGICAL(ret)) {
6945                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6946                             if (mg)
6947                                 re_sv = (REGEXP *) mg->mg_obj;
6948                         }
6949
6950                         /* force any undef warnings here */
6951                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6952                             ret = sv_mortalcopy(ret);
6953                             (void) SvPV_force_nolen(ret);
6954                         }
6955                     }
6956
6957                 }
6958
6959                 /* *** Note that at this point we don't restore
6960                  * PL_comppad, (or pop the CxSUB) on the assumption it may
6961                  * be used again soon. This is safe as long as nothing
6962                  * in the regexp code uses the pad ! */
6963                 PL_op = oop;
6964                 PL_curcop = ocurcop;
6965                 regcp_restore(rex, runops_cp, &maxopenparen);
6966                 PL_curpm_under = PL_curpm;
6967                 PL_curpm = PL_reg_curpm;
6968
6969                 if (logical != 2)
6970                     break;
6971             }
6972
6973                 /* only /(??{})/  from now on */
6974                 logical = 0;
6975                 {
6976                     /* extract RE object from returned value; compiling if
6977                      * necessary */
6978
6979                     if (re_sv) {
6980                         re_sv = reg_temp_copy(NULL, re_sv);
6981                     }
6982                     else {
6983                         U32 pm_flags = 0;
6984
6985                         if (SvUTF8(ret) && IN_BYTES) {
6986                             /* In use 'bytes': make a copy of the octet
6987                              * sequence, but without the flag on */
6988                             STRLEN len;
6989                             const char *const p = SvPV(ret, len);
6990                             ret = newSVpvn_flags(p, len, SVs_TEMP);
6991                         }
6992                         if (rex->intflags & PREGf_USE_RE_EVAL)
6993                             pm_flags |= PMf_USE_RE_EVAL;
6994
6995                         /* if we got here, it should be an engine which
6996                          * supports compiling code blocks and stuff */
6997                         assert(rex->engine && rex->engine->op_comp);
6998                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6999                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7000                                     rex->engine, NULL, NULL,
7001                                     /* copy /msixn etc to inner pattern */
7002                                     ARG2L(scan),
7003                                     pm_flags);
7004
7005                         if (!(SvFLAGS(ret)
7006                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7007                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7008                             /* This isn't a first class regexp. Instead, it's
7009                                caching a regexp onto an existing, Perl visible
7010                                scalar.  */
7011                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7012                         }
7013                     }
7014                     SAVEFREESV(re_sv);
7015                     re = ReANY(re_sv);
7016                 }
7017                 RXp_MATCH_COPIED_off(re);
7018                 re->subbeg = rex->subbeg;
7019                 re->sublen = rex->sublen;
7020                 re->suboffset = rex->suboffset;
7021                 re->subcoffset = rex->subcoffset;
7022                 re->lastparen = 0;
7023                 re->lastcloseparen = 0;
7024                 rei = RXi_GET(re);
7025                 DEBUG_EXECUTE_r(
7026                     debug_start_match(re_sv, utf8_target, locinput,
7027                                     reginfo->strend, "Matching embedded");
7028                 );
7029                 startpoint = rei->program + 1;
7030                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7031                                              * close_paren only for GOSUB */
7032                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7033                 /* Save all the seen positions so far. */
7034                 ST.cp = regcppush(rex, 0, maxopenparen);
7035                 REGCP_SET(ST.lastcp);
7036                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7037                 maxopenparen = 0;
7038                 /* run the pattern returned from (??{...}) */
7039
7040               eval_recurse_doit: /* Share code with GOSUB below this line
7041                             * At this point we expect the stack context to be
7042                             * set up correctly */
7043
7044                 /* invalidate the S-L poscache. We're now executing a
7045                  * different set of WHILEM ops (and their associated
7046                  * indexes) against the same string, so the bits in the
7047                  * cache are meaningless. Setting maxiter to zero forces
7048                  * the cache to be invalidated and zeroed before reuse.
7049                  * XXX This is too dramatic a measure. Ideally we should
7050                  * save the old cache and restore when running the outer
7051                  * pattern again */
7052                 reginfo->poscache_maxiter = 0;
7053
7054                 /* the new regexp might have a different is_utf8_pat than we do */
7055                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7056
7057                 ST.prev_rex = rex_sv;
7058                 ST.prev_curlyx = cur_curlyx;
7059                 rex_sv = re_sv;
7060                 SET_reg_curpm(rex_sv);
7061                 rex = re;
7062                 rexi = rei;
7063                 cur_curlyx = NULL;
7064                 ST.B = next;
7065                 ST.prev_eval = cur_eval;
7066                 cur_eval = st;
7067                 /* now continue from first node in postoned RE */
7068                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
7069                 NOT_REACHED; /* NOTREACHED */
7070         }
7071
7072         case EVAL_AB: /* cleanup after a successful (??{A})B */
7073             /* note: this is called twice; first after popping B, then A */
7074             DEBUG_STACK_r({
7075                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7076                     depth, cur_eval, ST.prev_eval);
7077             });
7078
7079 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7080             if ( cur_eval && CUR_EVAL.close_paren ) {\
7081                 DEBUG_STACK_r({ \
7082                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7083                         depth,    \
7084                         CUR_EVAL.close_paren - 1,\
7085                         cur_eval, \
7086                         VAL);     \
7087                 });               \
7088                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7089             }
7090
7091             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7092
7093             rex_sv = ST.prev_rex;
7094             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7095             SET_reg_curpm(rex_sv);
7096             rex = ReANY(rex_sv);
7097             rexi = RXi_GET(rex);
7098             {
7099                 /* preserve $^R across LEAVE's. See Bug 121070. */
7100                 SV *save_sv= GvSV(PL_replgv);
7101                 SvREFCNT_inc(save_sv);
7102                 regcpblow(ST.cp); /* LEAVE in disguise */
7103                 sv_setsv(GvSV(PL_replgv), save_sv);
7104                 SvREFCNT_dec(save_sv);
7105             }
7106             cur_eval = ST.prev_eval;
7107             cur_curlyx = ST.prev_curlyx;
7108
7109             /* Invalidate cache. See "invalidate" comment above. */
7110             reginfo->poscache_maxiter = 0;
7111             if ( nochange_depth )
7112                 nochange_depth--;
7113
7114             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7115             sayYES;
7116
7117
7118         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7119             /* note: this is called twice; first after popping B, then A */
7120             DEBUG_STACK_r({
7121                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7122                     depth, cur_eval, ST.prev_eval);
7123             });
7124
7125             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7126
7127             rex_sv = ST.prev_rex;
7128             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7129             SET_reg_curpm(rex_sv);
7130             rex = ReANY(rex_sv);
7131             rexi = RXi_GET(rex);
7132
7133             REGCP_UNWIND(ST.lastcp);
7134             regcppop(rex, &maxopenparen);
7135             cur_eval = ST.prev_eval;
7136             cur_curlyx = ST.prev_curlyx;
7137
7138             /* Invalidate cache. See "invalidate" comment above. */
7139             reginfo->poscache_maxiter = 0;
7140             if ( nochange_depth )
7141                 nochange_depth--;
7142
7143             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7144             sayNO_SILENT;
7145 #undef ST
7146
7147         case OPEN: /*  (  */
7148             n = ARG(scan);  /* which paren pair */
7149             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7150             if (n > maxopenparen)
7151                 maxopenparen = n;
7152             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7153                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
7154                 depth,
7155                 PTR2UV(rex),
7156                 PTR2UV(rex->offs),
7157                 (UV)n,
7158                 (IV)rex->offs[n].start_tmp,
7159                 (UV)maxopenparen
7160             ));
7161             lastopen = n;
7162             break;
7163
7164 /* XXX really need to log other places start/end are set too */
7165 #define CLOSE_CAPTURE                                                      \
7166     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7167     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7168     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
7169         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
7170         depth,                                                             \
7171         PTR2UV(rex),                                                       \
7172         PTR2UV(rex->offs),                                                 \
7173         (UV)n,                                                             \
7174         (IV)rex->offs[n].start,                                            \
7175         (IV)rex->offs[n].end                                               \
7176     ))
7177
7178         case CLOSE:  /*  )  */
7179             n = ARG(scan);  /* which paren pair */
7180             CLOSE_CAPTURE;
7181             if (n > rex->lastparen)
7182                 rex->lastparen = n;
7183             rex->lastcloseparen = n;
7184             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7185                 goto fake_end;
7186
7187             break;
7188
7189         case ACCEPT:  /*  (*ACCEPT)  */
7190             if (scan->flags)
7191                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7192             if (ARG2L(scan)){
7193                 regnode *cursor;
7194                 for (cursor=scan;
7195                      cursor && OP(cursor)!=END;
7196                      cursor=regnext(cursor))
7197                 {
7198                     if ( OP(cursor)==CLOSE ){
7199                         n = ARG(cursor);
7200                         if ( n <= lastopen ) {
7201                             CLOSE_CAPTURE;
7202                             if (n > rex->lastparen)
7203                                 rex->lastparen = n;
7204                             rex->lastcloseparen = n;
7205                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7206                                 break;
7207                         }
7208                     }
7209                 }
7210             }
7211             goto fake_end;
7212             /* NOTREACHED */
7213
7214         case GROUPP:  /*  (?(1))  */
7215             n = ARG(scan);  /* which paren pair */
7216             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7217             break;
7218
7219         case NGROUPP:  /*  (?(<name>))  */
7220             /* reg_check_named_buff_matched returns 0 for no match */
7221             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7222             break;
7223
7224         case INSUBP:   /*  (?(R))  */
7225             n = ARG(scan);
7226             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7227              * of SCAN is already set up as matches a eval.close_paren */
7228             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7229             break;
7230
7231         case DEFINEP:  /*  (?(DEFINE))  */
7232             sw = 0;
7233             break;
7234
7235         case IFTHEN:   /*  (?(cond)A|B)  */
7236             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7237             if (sw)
7238                 next = NEXTOPER(NEXTOPER(scan));
7239             else {
7240                 next = scan + ARG(scan);
7241                 if (OP(next) == IFTHEN) /* Fake one. */
7242                     next = NEXTOPER(NEXTOPER(next));
7243             }
7244             break;
7245
7246         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7247             logical = scan->flags;
7248             break;
7249
7250 /*******************************************************************
7251
7252 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7253 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7254 STAR/PLUS/CURLY/CURLYN are used instead.)
7255
7256 A*B is compiled as <CURLYX><A><WHILEM><B>
7257
7258 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7259 state, which contains the current count, initialised to -1. It also sets
7260 cur_curlyx to point to this state, with any previous value saved in the
7261 state block.
7262
7263 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7264 since the pattern may possibly match zero times (i.e. it's a while {} loop
7265 rather than a do {} while loop).
7266
7267 Each entry to WHILEM represents a successful match of A. The count in the
7268 CURLYX block is incremented, another WHILEM state is pushed, and execution
7269 passes to A or B depending on greediness and the current count.
7270
7271 For example, if matching against the string a1a2a3b (where the aN are
7272 substrings that match /A/), then the match progresses as follows: (the
7273 pushed states are interspersed with the bits of strings matched so far):
7274
7275     <CURLYX cnt=-1>
7276     <CURLYX cnt=0><WHILEM>
7277     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7278     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7279     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7280     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7281
7282 (Contrast this with something like CURLYM, which maintains only a single
7283 backtrack state:
7284
7285     <CURLYM cnt=0> a1
7286     a1 <CURLYM cnt=1> a2
7287     a1 a2 <CURLYM cnt=2> a3
7288     a1 a2 a3 <CURLYM cnt=3> b
7289 )
7290
7291 Each WHILEM state block marks a point to backtrack to upon partial failure
7292 of A or B, and also contains some minor state data related to that
7293 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7294 overall state, such as the count, and pointers to the A and B ops.
7295
7296 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7297 must always point to the *current* CURLYX block, the rules are:
7298
7299 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7300 and set cur_curlyx to point the new block.
7301
7302 When popping the CURLYX block after a successful or unsuccessful match,
7303 restore the previous cur_curlyx.
7304
7305 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7306 to the outer one saved in the CURLYX block.
7307
7308 When popping the WHILEM block after a successful or unsuccessful B match,
7309 restore the previous cur_curlyx.
7310
7311 Here's an example for the pattern (AI* BI)*BO
7312 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7313
7314 cur_
7315 curlyx backtrack stack
7316 ------ ---------------
7317 NULL
7318 CO     <CO prev=NULL> <WO>
7319 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7320 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7321 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7322
7323 At this point the pattern succeeds, and we work back down the stack to
7324 clean up, restoring as we go:
7325
7326 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7327 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7328 CO     <CO prev=NULL> <WO>
7329 NULL
7330
7331 *******************************************************************/
7332
7333 #define ST st->u.curlyx
7334
7335         case CURLYX:    /* start of /A*B/  (for complex A) */
7336         {
7337             /* No need to save/restore up to this paren */
7338             I32 parenfloor = scan->flags;
7339
7340             assert(next); /* keep Coverity happy */
7341             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7342                 next += ARG(next);
7343
7344             /* XXXX Probably it is better to teach regpush to support
7345                parenfloor > maxopenparen ... */
7346             if (parenfloor > (I32)rex->lastparen)
7347                 parenfloor = rex->lastparen; /* Pessimization... */
7348
7349             ST.prev_curlyx= cur_curlyx;
7350             cur_curlyx = st;
7351             ST.cp = PL_savestack_ix;
7352
7353             /* these fields contain the state of the current curly.
7354              * they are accessed by subsequent WHILEMs */
7355             ST.parenfloor = parenfloor;
7356             ST.me = scan;
7357             ST.B = next;
7358             ST.minmod = minmod;
7359             minmod = 0;
7360             ST.count = -1;      /* this will be updated by WHILEM */
7361             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7362
7363             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7364             NOT_REACHED; /* NOTREACHED */
7365         }
7366
7367         case CURLYX_end: /* just finished matching all of A*B */
7368             cur_curlyx = ST.prev_curlyx;
7369             sayYES;
7370             NOT_REACHED; /* NOTREACHED */
7371
7372         case CURLYX_end_fail: /* just failed to match all of A*B */
7373             regcpblow(ST.cp);
7374             cur_curlyx = ST.prev_curlyx;
7375             sayNO;
7376             NOT_REACHED; /* NOTREACHED */
7377
7378
7379 #undef ST
7380 #define ST st->u.whilem
7381
7382         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7383         {
7384             /* see the discussion above about CURLYX/WHILEM */
7385             I32 n;
7386             int min, max;
7387             regnode *A;
7388
7389             assert(cur_curlyx); /* keep Coverity happy */
7390
7391             min = ARG1(cur_curlyx->u.curlyx.me);
7392             max = ARG2(cur_curlyx->u.curlyx.me);
7393             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7394             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7395             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7396             ST.cache_offset = 0;
7397             ST.cache_mask = 0;
7398
7399
7400             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
7401                   depth, (long)n, min, max)
7402             );
7403
7404             /* First just match a string of min A's. */
7405
7406             if (n < min) {
7407                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7408                 cur_curlyx->u.curlyx.lastloc = locinput;
7409                 REGCP_SET(ST.lastcp);
7410
7411                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7412                 NOT_REACHED; /* NOTREACHED */
7413             }
7414
7415             /* If degenerate A matches "", assume A done. */
7416
7417             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7418                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
7419                    depth)
7420                 );
7421                 goto do_whilem_B_max;
7422             }
7423
7424             /* super-linear cache processing.
7425              *
7426              * The idea here is that for certain types of CURLYX/WHILEM -
7427              * principally those whose upper bound is infinity (and
7428              * excluding regexes that have things like \1 and other very
7429              * non-regular expresssiony things), then if a pattern like
7430              * /....A*.../ fails and we backtrack to the WHILEM, then we
7431              * make a note that this particular WHILEM op was at string
7432              * position 47 (say) when the rest of pattern failed. Then, if
7433              * we ever find ourselves back at that WHILEM, and at string
7434              * position 47 again, we can just fail immediately rather than
7435              * running the rest of the pattern again.
7436              *
7437              * This is very handy when patterns start to go
7438              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7439              * with a combinatorial explosion of backtracking.
7440              *
7441              * The cache is implemented as a bit array, with one bit per
7442              * string byte position per WHILEM op (up to 16) - so its
7443              * between 0.25 and 2x the string size.
7444              *
7445              * To avoid allocating a poscache buffer every time, we do an
7446              * initially countdown; only after we have  executed a WHILEM
7447              * op (string-length x #WHILEMs) times do we allocate the
7448              * cache.
7449              *
7450              * The top 4 bits of scan->flags byte say how many different
7451              * relevant CURLLYX/WHILEM op pairs there are, while the
7452              * bottom 4-bits is the identifying index number of this
7453              * WHILEM.
7454              */
7455
7456             if (scan->flags) {
7457
7458                 if (!reginfo->poscache_maxiter) {
7459                     /* start the countdown: Postpone detection until we
7460                      * know the match is not *that* much linear. */
7461                     reginfo->poscache_maxiter
7462                         =    (reginfo->strend - reginfo->strbeg + 1)
7463                            * (scan->flags>>4);
7464                     /* possible overflow for long strings and many CURLYX's */
7465                     if (reginfo->poscache_maxiter < 0)
7466                         reginfo->poscache_maxiter = I32_MAX;
7467                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7468                 }
7469
7470                 if (reginfo->poscache_iter-- == 0) {
7471                     /* initialise cache */
7472                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7473                     regmatch_info_aux *const aux = reginfo->info_aux;
7474                     if (aux->poscache) {
7475                         if ((SSize_t)reginfo->poscache_size < size) {
7476                             Renew(aux->poscache, size, char);
7477                             reginfo->poscache_size = size;
7478                         }
7479                         Zero(aux->poscache, size, char);
7480                     }
7481                     else {
7482                         reginfo->poscache_size = size;
7483                         Newxz(aux->poscache, size, char);
7484                     }
7485                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7486       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7487                               PL_colors[4], PL_colors[5])
7488                     );
7489                 }
7490
7491                 if (reginfo->poscache_iter < 0) {
7492                     /* have we already failed at this position? */
7493                     SSize_t offset, mask;
7494
7495                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7496                     offset  = (scan->flags & 0xf) - 1
7497                                 +   (locinput - reginfo->strbeg)
7498                                   * (scan->flags>>4);
7499                     mask    = 1 << (offset % 8);
7500                     offset /= 8;
7501                     if (reginfo->info_aux->poscache[offset] & mask) {
7502                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
7503                             depth)
7504                         );
7505                         sayNO; /* cache records failure */
7506                     }
7507                     ST.cache_offset = offset;
7508                     ST.cache_mask   = mask;
7509                 }
7510             }
7511
7512             /* Prefer B over A for minimal matching. */
7513
7514             if (cur_curlyx->u.curlyx.minmod) {
7515                 ST.save_curlyx = cur_curlyx;
7516                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7517                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
7518                             maxopenparen);
7519                 REGCP_SET(ST.lastcp);
7520                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7521                                     locinput);
7522                 NOT_REACHED; /* NOTREACHED */
7523             }
7524
7525             /* Prefer A over B for maximal matching. */
7526
7527             if (n < max) { /* More greed allowed? */
7528                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7529                             maxopenparen);
7530                 cur_curlyx->u.curlyx.lastloc = locinput;
7531                 REGCP_SET(ST.lastcp);
7532                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7533                 NOT_REACHED; /* NOTREACHED */
7534             }
7535             goto do_whilem_B_max;
7536         }
7537         NOT_REACHED; /* NOTREACHED */
7538
7539         case WHILEM_B_min: /* just matched B in a minimal match */
7540         case WHILEM_B_max: /* just matched B in a maximal match */
7541             cur_curlyx = ST.save_curlyx;
7542             sayYES;
7543             NOT_REACHED; /* NOTREACHED */
7544
7545         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7546             cur_curlyx = ST.save_curlyx;
7547             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7548             cur_curlyx->u.curlyx.count--;
7549             CACHEsayNO;
7550             NOT_REACHED; /* NOTREACHED */
7551
7552         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7553             /* FALLTHROUGH */
7554         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7555             REGCP_UNWIND(ST.lastcp);
7556             regcppop(rex, &maxopenparen);
7557             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7558             cur_curlyx->u.curlyx.count--;
7559             CACHEsayNO;
7560             NOT_REACHED; /* NOTREACHED */
7561
7562         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7563             REGCP_UNWIND(ST.lastcp);
7564             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7565             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
7566                 depth)
7567             );
7568           do_whilem_B_max:
7569             if (cur_curlyx->u.curlyx.count >= REG_INFTY
7570                 && ckWARN(WARN_REGEXP)
7571                 && !reginfo->warned)
7572             {
7573                 reginfo->warned = TRUE;
7574                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7575                      "Complex regular subexpression recursion limit (%d) "
7576                      "exceeded",
7577                      REG_INFTY - 1);
7578             }
7579
7580             /* now try B */
7581             ST.save_curlyx = cur_curlyx;
7582             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7583             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7584                                 locinput);
7585             NOT_REACHED; /* NOTREACHED */
7586
7587         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7588             cur_curlyx = ST.save_curlyx;
7589             REGCP_UNWIND(ST.lastcp);
7590             regcppop(rex, &maxopenparen);
7591
7592             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7593                 /* Maximum greed exceeded */
7594                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7595                     && ckWARN(WARN_REGEXP)
7596                     && !reginfo->warned)
7597                 {
7598                     reginfo->warned     = TRUE;
7599                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7600                         "Complex regular subexpression recursion "
7601                         "limit (%d) exceeded",
7602                         REG_INFTY - 1);
7603                 }
7604                 cur_curlyx->u.curlyx.count--;
7605                 CACHEsayNO;
7606             }
7607
7608             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
7609             );
7610             /* Try grabbing another A and see if it helps. */
7611             cur_curlyx->u.curlyx.lastloc = locinput;
7612             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7613                             maxopenparen);
7614             REGCP_SET(ST.lastcp);
7615             PUSH_STATE_GOTO(WHILEM_A_min,
7616                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7617                 locinput);
7618             NOT_REACHED; /* NOTREACHED */
7619
7620 #undef  ST
7621 #define ST st->u.branch
7622
7623         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
7624             next = scan + ARG(scan);
7625             if (next == scan)
7626                 next = NULL;
7627             scan = NEXTOPER(scan);
7628             /* FALLTHROUGH */
7629
7630         case BRANCH:        /*  /(...|A|...)/ */
7631             scan = NEXTOPER(scan); /* scan now points to inner node */
7632             ST.lastparen = rex->lastparen;
7633             ST.lastcloseparen = rex->lastcloseparen;
7634             ST.next_branch = next;
7635             REGCP_SET(ST.cp);
7636
7637             /* Now go into the branch */
7638             if (has_cutgroup) {
7639                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7640             } else {
7641                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7642             }
7643             NOT_REACHED; /* NOTREACHED */
7644
7645         case CUTGROUP:  /*  /(*THEN)/  */
7646             sv_yes_mark = st->u.mark.mark_name = scan->flags
7647                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7648                 : NULL;
7649             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7650             NOT_REACHED; /* NOTREACHED */
7651
7652         case CUTGROUP_next_fail:
7653             do_cutgroup = 1;
7654             no_final = 1;
7655             if (st->u.mark.mark_name)
7656                 sv_commit = st->u.mark.mark_name;
7657             sayNO;
7658             NOT_REACHED; /* NOTREACHED */
7659
7660         case BRANCH_next:
7661             sayYES;
7662             NOT_REACHED; /* NOTREACHED */
7663
7664         case BRANCH_next_fail: /* that branch failed; try the next, if any */
7665             if (do_cutgroup) {
7666                 do_cutgroup = 0;
7667                 no_final = 0;
7668             }
7669             REGCP_UNWIND(ST.cp);
7670             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7671             scan = ST.next_branch;
7672             /* no more branches? */
7673             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7674                 DEBUG_EXECUTE_r({
7675                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
7676                         depth,
7677                         PL_colors[4],
7678                         PL_colors[5] );
7679                 });
7680                 sayNO_SILENT;
7681             }
7682             continue; /* execute next BRANCH[J] op */
7683             /* NOTREACHED */
7684
7685         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7686             minmod = 1;
7687             break;
7688
7689 #undef  ST
7690 #define ST st->u.curlym
7691
7692         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7693
7694             /* This is an optimisation of CURLYX that enables us to push
7695              * only a single backtracking state, no matter how many matches
7696              * there are in {m,n}. It relies on the pattern being constant
7697              * length, with no parens to influence future backrefs
7698              */
7699
7700             ST.me = scan;
7701             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7702
7703             ST.lastparen      = rex->lastparen;
7704             ST.lastcloseparen = rex->lastcloseparen;
7705
7706             /* if paren positive, emulate an OPEN/CLOSE around A */
7707             if (ST.me->flags) {
7708                 U32 paren = ST.me->flags;
7709                 if (paren > maxopenparen)
7710                     maxopenparen = paren;
7711                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7712             }
7713             ST.A = scan;
7714             ST.B = next;
7715             ST.alen = 0;
7716             ST.count = 0;
7717             ST.minmod = minmod;
7718             minmod = 0;
7719             ST.c1 = CHRTEST_UNINIT;
7720             REGCP_SET(ST.cp);
7721
7722             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7723                 goto curlym_do_B;
7724
7725           curlym_do_A: /* execute the A in /A{m,n}B/  */
7726             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7727             NOT_REACHED; /* NOTREACHED */
7728
7729         case CURLYM_A: /* we've just matched an A */
7730             ST.count++;
7731             /* after first match, determine A's length: u.curlym.alen */
7732             if (ST.count == 1) {
7733                 if (reginfo->is_utf8_target) {
7734                     char *s = st->locinput;
7735                     while (s < locinput) {
7736                         ST.alen++;
7737                         s += UTF8SKIP(s);
7738                     }
7739                 }
7740                 else {
7741                     ST.alen = locinput - st->locinput;
7742                 }
7743                 if (ST.alen == 0)
7744                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7745             }
7746             DEBUG_EXECUTE_r(
7747                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7748                           depth, (IV) ST.count, (IV)ST.alen)
7749             );
7750
7751             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7752                 goto fake_end;
7753
7754             {
7755                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7756                 if ( max == REG_INFTY || ST.count < max )
7757                     goto curlym_do_A; /* try to match another A */
7758             }
7759             goto curlym_do_B; /* try to match B */
7760
7761         case CURLYM_A_fail: /* just failed to match an A */
7762             REGCP_UNWIND(ST.cp);
7763
7764
7765             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7766                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7767                 sayNO;
7768
7769           curlym_do_B: /* execute the B in /A{m,n}B/  */
7770             if (ST.c1 == CHRTEST_UNINIT) {
7771                 /* calculate c1 and c2 for possible match of 1st char
7772                  * following curly */
7773                 ST.c1 = ST.c2 = CHRTEST_VOID;
7774                 assert(ST.B);
7775                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7776                     regnode *text_node = ST.B;
7777                     if (! HAS_TEXT(text_node))
7778                         FIND_NEXT_IMPT(text_node);
7779                     /* this used to be
7780
7781                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7782
7783                         But the former is redundant in light of the latter.
7784
7785                         if this changes back then the macro for
7786                         IS_TEXT and friends need to change.
7787                      */
7788                     if (PL_regkind[OP(text_node)] == EXACT) {
7789                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7790                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7791                            reginfo))
7792                         {
7793                             sayNO;
7794                         }
7795                     }
7796                 }
7797             }
7798
7799             DEBUG_EXECUTE_r(
7800                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%"IVdf"...\n",
7801                     depth, (IV)ST.count)
7802                 );
7803             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7804                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7805                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7806                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7807                     {
7808                         /* simulate B failing */
7809                         DEBUG_OPTIMISE_r(
7810                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7811                                 depth,
7812                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7813                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7814                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7815                         );
7816                         state_num = CURLYM_B_fail;
7817                         goto reenter_switch;
7818                     }
7819                 }
7820                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7821                     /* simulate B failing */
7822                     DEBUG_OPTIMISE_r(
7823                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7824                             depth,
7825                             (int) nextchr, ST.c1, ST.c2)
7826                     );
7827                     state_num = CURLYM_B_fail;
7828                     goto reenter_switch;
7829                 }
7830             }
7831
7832             if (ST.me->flags) {
7833                 /* emulate CLOSE: mark current A as captured */
7834                 I32 paren = ST.me->flags;
7835                 if (ST.count) {
7836                     rex->offs[paren].start
7837                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7838                     rex->offs[paren].end = locinput - reginfo->strbeg;
7839                     if ((U32)paren > rex->lastparen)
7840                         rex->lastparen = paren;
7841                     rex->lastcloseparen = paren;
7842                 }
7843                 else
7844                     rex->offs[paren].end = -1;
7845
7846                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7847                 {
7848                     if (ST.count)
7849                         goto fake_end;
7850                     else
7851                         sayNO;
7852                 }
7853             }
7854
7855             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7856             NOT_REACHED; /* NOTREACHED */
7857
7858         case CURLYM_B_fail: /* just failed to match a B */
7859             REGCP_UNWIND(ST.cp);
7860             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7861             if (ST.minmod) {
7862                 I32 max = ARG2(ST.me);
7863                 if (max != REG_INFTY && ST.count == max)
7864                     sayNO;
7865                 goto curlym_do_A; /* try to match a further A */
7866             }
7867             /* backtrack one A */
7868             if (ST.count == ARG1(ST.me) /* min */)
7869                 sayNO;
7870             ST.count--;
7871             SET_locinput(HOPc(locinput, -ST.alen));
7872             goto curlym_do_B; /* try to match B */
7873
7874 #undef ST
7875 #define ST st->u.curly
7876
7877 #define CURLY_SETPAREN(paren, success) \
7878     if (paren) { \
7879         if (success) { \
7880             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7881             rex->offs[paren].end = locinput - reginfo->strbeg; \
7882             if (paren > rex->lastparen) \
7883                 rex->lastparen = paren; \
7884             rex->lastcloseparen = paren; \
7885         } \
7886         else { \
7887             rex->offs[paren].end = -1; \
7888             rex->lastparen      = ST.lastparen; \
7889             rex->lastcloseparen = ST.lastcloseparen; \
7890         } \
7891     }
7892
7893         case STAR:              /*  /A*B/ where A is width 1 char */
7894             ST.paren = 0;
7895             ST.min = 0;
7896             ST.max = REG_INFTY;
7897             scan = NEXTOPER(scan);
7898             goto repeat;
7899
7900         case PLUS:              /*  /A+B/ where A is width 1 char */
7901             ST.paren = 0;
7902             ST.min = 1;
7903             ST.max = REG_INFTY;
7904             scan = NEXTOPER(scan);
7905             goto repeat;
7906
7907         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
7908             ST.paren = scan->flags;     /* Which paren to set */
7909             ST.lastparen      = rex->lastparen;
7910             ST.lastcloseparen = rex->lastcloseparen;
7911             if (ST.paren > maxopenparen)
7912                 maxopenparen = ST.paren;
7913             ST.min = ARG1(scan);  /* min to match */
7914             ST.max = ARG2(scan);  /* max to match */
7915             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
7916             {
7917                 ST.min=1;
7918                 ST.max=1;
7919             }
7920             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7921             goto repeat;
7922
7923         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
7924             ST.paren = 0;
7925             ST.min = ARG1(scan);  /* min to match */
7926             ST.max = ARG2(scan);  /* max to match */
7927             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7928           repeat:
7929             /*
7930             * Lookahead to avoid useless match attempts
7931             * when we know what character comes next.
7932             *
7933             * Used to only do .*x and .*?x, but now it allows
7934             * for )'s, ('s and (?{ ... })'s to be in the way
7935             * of the quantifier and the EXACT-like node.  -- japhy
7936             */
7937
7938             assert(ST.min <= ST.max);
7939             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7940                 ST.c1 = ST.c2 = CHRTEST_VOID;
7941             }
7942             else {
7943                 regnode *text_node = next;
7944
7945                 if (! HAS_TEXT(text_node))
7946                     FIND_NEXT_IMPT(text_node);
7947
7948                 if (! HAS_TEXT(text_node))
7949                     ST.c1 = ST.c2 = CHRTEST_VOID;
7950                 else {
7951                     if ( PL_regkind[OP(text_node)] != EXACT ) {
7952                         ST.c1 = ST.c2 = CHRTEST_VOID;
7953                     }
7954                     else {
7955
7956                     /*  Currently we only get here when
7957
7958                         PL_rekind[OP(text_node)] == EXACT
7959
7960                         if this changes back then the macro for IS_TEXT and
7961                         friends need to change. */
7962                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7963                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7964                            reginfo))
7965                         {
7966                             sayNO;
7967                         }
7968                     }
7969                 }
7970             }
7971
7972             ST.A = scan;
7973             ST.B = next;
7974             if (minmod) {
7975                 char *li = locinput;
7976                 minmod = 0;
7977                 if (ST.min &&
7978                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
7979                             < ST.min)
7980                     sayNO;
7981                 SET_locinput(li);
7982                 ST.count = ST.min;
7983                 REGCP_SET(ST.cp);
7984                 if (ST.c1 == CHRTEST_VOID)
7985                     goto curly_try_B_min;
7986
7987                 ST.oldloc = locinput;
7988
7989                 /* set ST.maxpos to the furthest point along the
7990                  * string that could possibly match */
7991                 if  (ST.max == REG_INFTY) {
7992                     ST.maxpos = reginfo->strend - 1;
7993                     if (utf8_target)
7994                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7995                             ST.maxpos--;
7996                 }
7997                 else if (utf8_target) {
7998                     int m = ST.max - ST.min;
7999                     for (ST.maxpos = locinput;
8000                          m >0 && ST.maxpos < reginfo->strend; m--)
8001                         ST.maxpos += UTF8SKIP(ST.maxpos);
8002                 }
8003                 else {
8004                     ST.maxpos = locinput + ST.max - ST.min;
8005                     if (ST.maxpos >= reginfo->strend)
8006                         ST.maxpos = reginfo->strend - 1;
8007                 }
8008                 goto curly_try_B_min_known;
8009
8010             }
8011             else {
8012                 /* avoid taking address of locinput, so it can remain
8013                  * a register var */
8014                 char *li = locinput;
8015                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8016                 if (ST.count < ST.min)
8017                     sayNO;
8018                 SET_locinput(li);
8019                 if ((ST.count > ST.min)
8020                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8021                 {
8022                     /* A{m,n} must come at the end of the string, there's
8023                      * no point in backing off ... */
8024                     ST.min = ST.count;
8025                     /* ...except that $ and \Z can match before *and* after
8026                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8027                        We may back off by one in this case. */
8028                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8029                         ST.min--;
8030                 }
8031                 REGCP_SET(ST.cp);
8032                 goto curly_try_B_max;
8033             }
8034             NOT_REACHED; /* NOTREACHED */
8035
8036         case CURLY_B_min_known_fail:
8037             /* failed to find B in a non-greedy match where c1,c2 valid */
8038
8039             REGCP_UNWIND(ST.cp);
8040             if (ST.paren) {
8041                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8042             }
8043             /* Couldn't or didn't -- move forward. */
8044             ST.oldloc = locinput;
8045             if (utf8_target)
8046                 locinput += UTF8SKIP(locinput);
8047             else
8048                 locinput++;
8049             ST.count++;
8050           curly_try_B_min_known:
8051              /* find the next place where 'B' could work, then call B */
8052             {
8053                 int n;
8054                 if (utf8_target) {
8055                     n = (ST.oldloc == locinput) ? 0 : 1;
8056                     if (ST.c1 == ST.c2) {
8057                         /* set n to utf8_distance(oldloc, locinput) */
8058                         while (locinput <= ST.maxpos
8059                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8060                         {
8061                             locinput += UTF8SKIP(locinput);
8062                             n++;
8063                         }
8064                     }
8065                     else {
8066                         /* set n to utf8_distance(oldloc, locinput) */
8067                         while (locinput <= ST.maxpos
8068                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8069                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8070                         {
8071                             locinput += UTF8SKIP(locinput);
8072                             n++;
8073                         }
8074                     }
8075                 }
8076                 else {  /* Not utf8_target */
8077                     if (ST.c1 == ST.c2) {
8078                         while (locinput <= ST.maxpos &&
8079                                UCHARAT(locinput) != ST.c1)
8080                             locinput++;
8081                     }
8082                     else {
8083                         while (locinput <= ST.maxpos
8084                                && UCHARAT(locinput) != ST.c1
8085                                && UCHARAT(locinput) != ST.c2)
8086                             locinput++;
8087                     }
8088                     n = locinput - ST.oldloc;
8089                 }
8090                 if (locinput > ST.maxpos)
8091                     sayNO;
8092                 if (n) {
8093                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8094                      * at b; check that everything between oldloc and
8095                      * locinput matches */
8096                     char *li = ST.oldloc;
8097                     ST.count += n;
8098                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8099                         sayNO;
8100                     assert(n == REG_INFTY || locinput == li);
8101                 }
8102                 CURLY_SETPAREN(ST.paren, ST.count);
8103                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8104                     goto fake_end;
8105                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8106             }
8107             NOT_REACHED; /* NOTREACHED */
8108
8109         case CURLY_B_min_fail:
8110             /* failed to find B in a non-greedy match where c1,c2 invalid */
8111
8112             REGCP_UNWIND(ST.cp);
8113             if (ST.paren) {
8114                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8115             }
8116             /* failed -- move forward one */
8117             {
8118                 char *li = locinput;
8119                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8120                     sayNO;
8121                 }
8122                 locinput = li;
8123             }
8124             {
8125                 ST.count++;
8126                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8127                         ST.count > 0)) /* count overflow ? */
8128                 {
8129                   curly_try_B_min:
8130                     CURLY_SETPAREN(ST.paren, ST.count);
8131                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8132                         goto fake_end;
8133                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8134                 }
8135             }
8136             sayNO;
8137             NOT_REACHED; /* NOTREACHED */
8138
8139           curly_try_B_max:
8140             /* a successful greedy match: now try to match B */
8141             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8142                 goto fake_end;
8143             {
8144                 bool could_match = locinput < reginfo->strend;
8145
8146                 /* If it could work, try it. */
8147                 if (ST.c1 != CHRTEST_VOID && could_match) {
8148                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8149                     {
8150                         could_match = memEQ(locinput,
8151                                             ST.c1_utf8,
8152                                             UTF8SKIP(locinput))
8153                                     || memEQ(locinput,
8154                                              ST.c2_utf8,
8155                                              UTF8SKIP(locinput));
8156                     }
8157                     else {
8158                         could_match = UCHARAT(locinput) == ST.c1
8159                                       || UCHARAT(locinput) == ST.c2;
8160                     }
8161                 }
8162                 if (ST.c1 == CHRTEST_VOID || could_match) {
8163                     CURLY_SETPAREN(ST.paren, ST.count);
8164                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8165                     NOT_REACHED; /* NOTREACHED */
8166                 }
8167             }
8168             /* FALLTHROUGH */
8169
8170         case CURLY_B_max_fail:
8171             /* failed to find B in a greedy match */
8172
8173             REGCP_UNWIND(ST.cp);
8174             if (ST.paren) {
8175                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8176             }
8177             /*  back up. */
8178             if (--ST.count < ST.min)
8179                 sayNO;
8180             locinput = HOPc(locinput, -1);
8181             goto curly_try_B_max;
8182
8183 #undef ST
8184
8185         case END: /*  last op of main pattern  */
8186           fake_end:
8187             if (cur_eval) {
8188                 /* we've just finished A in /(??{A})B/; now continue with B */
8189                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8190                 st->u.eval.prev_rex = rex_sv;           /* inner */
8191
8192                 /* Save *all* the positions. */
8193                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8194                 rex_sv = CUR_EVAL.prev_rex;
8195                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8196                 SET_reg_curpm(rex_sv);
8197                 rex = ReANY(rex_sv);
8198                 rexi = RXi_GET(rex);
8199
8200                 st->u.eval.prev_curlyx = cur_curlyx;
8201                 cur_curlyx = CUR_EVAL.prev_curlyx;
8202
8203                 REGCP_SET(st->u.eval.lastcp);
8204
8205                 /* Restore parens of the outer rex without popping the
8206                  * savestack */
8207                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8208
8209                 st->u.eval.prev_eval = cur_eval;
8210                 cur_eval = CUR_EVAL.prev_eval;
8211                 DEBUG_EXECUTE_r(
8212                     Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
8213                                       depth, cur_eval););
8214                 if ( nochange_depth )
8215                     nochange_depth--;
8216
8217                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8218
8219                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
8220                                     locinput); /* match B */
8221             }
8222
8223             if (locinput < reginfo->till) {
8224                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8225                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8226                                       PL_colors[4],
8227                                       (long)(locinput - startpos),
8228                                       (long)(reginfo->till - startpos),
8229                                       PL_colors[5]));
8230
8231                 sayNO_SILENT;           /* Cannot match: too short. */
8232             }
8233             sayYES;                     /* Success! */
8234
8235         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8236             DEBUG_EXECUTE_r(
8237             Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
8238                 depth, PL_colors[4], PL_colors[5]));
8239             sayYES;                     /* Success! */
8240
8241 #undef  ST
8242 #define ST st->u.ifmatch
8243
8244         {
8245             char *newstart;
8246
8247         case SUSPEND:   /* (?>A) */
8248             ST.wanted = 1;
8249             newstart = locinput;
8250             goto do_ifmatch;
8251
8252         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8253             ST.wanted = 0;
8254             goto ifmatch_trivial_fail_test;
8255
8256         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8257             ST.wanted = 1;
8258           ifmatch_trivial_fail_test:
8259             if (scan->flags) {
8260                 char * const s = HOPBACKc(locinput, scan->flags);
8261                 if (!s) {
8262                     /* trivial fail */
8263                     if (logical) {
8264                         logical = 0;
8265                         sw = 1 - cBOOL(ST.wanted);
8266                     }
8267                     else if (ST.wanted)
8268                         sayNO;
8269                     next = scan + ARG(scan);
8270                     if (next == scan)
8271                         next = NULL;
8272                     break;
8273                 }
8274                 newstart = s;
8275             }
8276             else
8277                 newstart = locinput;
8278
8279           do_ifmatch:
8280             ST.me = scan;
8281             ST.logical = logical;
8282             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8283
8284             /* execute body of (?...A) */
8285             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8286             NOT_REACHED; /* NOTREACHED */
8287         }
8288
8289         case IFMATCH_A_fail: /* body of (?...A) failed */
8290             ST.wanted = !ST.wanted;
8291             /* FALLTHROUGH */
8292
8293         case IFMATCH_A: /* body of (?...A) succeeded */
8294             if (ST.logical) {
8295                 sw = cBOOL(ST.wanted);
8296             }
8297             else if (!ST.wanted)
8298                 sayNO;
8299
8300             if (OP(ST.me) != SUSPEND) {
8301                 /* restore old position except for (?>...) */
8302                 locinput = st->locinput;
8303             }
8304             scan = ST.me + ARG(ST.me);
8305             if (scan == ST.me)
8306                 scan = NULL;
8307             continue; /* execute B */
8308
8309 #undef ST
8310
8311         case LONGJMP: /*  alternative with many branches compiles to
8312                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8313             next = scan + ARG(scan);
8314             if (next == scan)
8315                 next = NULL;
8316             break;
8317
8318         case COMMIT:  /*  (*COMMIT)  */
8319             reginfo->cutpoint = reginfo->strend;
8320             /* FALLTHROUGH */
8321
8322         case PRUNE:   /*  (*PRUNE)   */
8323             if (scan->flags)
8324                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8325             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8326             NOT_REACHED; /* NOTREACHED */
8327
8328         case COMMIT_next_fail:
8329             no_final = 1;
8330             /* FALLTHROUGH */
8331             sayNO;
8332             NOT_REACHED; /* NOTREACHED */
8333
8334         case OPFAIL:   /* (*FAIL)  */
8335             if (scan->flags)
8336                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8337             if (logical) {
8338                 /* deal with (?(?!)X|Y) properly,
8339                  * make sure we trigger the no branch
8340                  * of the trailing IFTHEN structure*/
8341                 sw= 0;
8342                 break;
8343             } else {
8344                 sayNO;
8345             }
8346             NOT_REACHED; /* NOTREACHED */
8347
8348 #define ST st->u.mark
8349         case MARKPOINT: /*  (*MARK:foo)  */
8350             ST.prev_mark = mark_state;
8351             ST.mark_name = sv_commit = sv_yes_mark
8352                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8353             mark_state = st;
8354             ST.mark_loc = locinput;
8355             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8356             NOT_REACHED; /* NOTREACHED */
8357
8358         case MARKPOINT_next:
8359             mark_state = ST.prev_mark;
8360             sayYES;
8361             NOT_REACHED; /* NOTREACHED */
8362
8363         case MARKPOINT_next_fail:
8364             if (popmark && sv_eq(ST.mark_name,popmark))
8365             {
8366                 if (ST.mark_loc > startpoint)
8367                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8368                 popmark = NULL; /* we found our mark */
8369                 sv_commit = ST.mark_name;
8370
8371                 DEBUG_EXECUTE_r({
8372                         Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%"SVf"...%s\n",
8373                             depth,
8374                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8375                 });
8376             }
8377             mark_state = ST.prev_mark;
8378             sv_yes_mark = mark_state ?
8379                 mark_state->u.mark.mark_name : NULL;
8380             sayNO;
8381             NOT_REACHED; /* NOTREACHED */
8382
8383         case SKIP:  /*  (*SKIP)  */
8384             if (!scan->flags) {
8385                 /* (*SKIP) : if we fail we cut here*/
8386                 ST.mark_name = NULL;
8387                 ST.mark_loc = locinput;
8388                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8389             } else {
8390                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8391                    otherwise do nothing.  Meaning we need to scan
8392                  */
8393                 regmatch_state *cur = mark_state;
8394                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8395
8396                 while (cur) {
8397                     if ( sv_eq( cur->u.mark.mark_name,
8398                                 find ) )
8399                     {
8400                         ST.mark_name = find;
8401                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8402                     }
8403                     cur = cur->u.mark.prev_mark;
8404                 }
8405             }
8406             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8407             break;
8408
8409         case SKIP_next_fail:
8410             if (ST.mark_name) {
8411                 /* (*CUT:NAME) - Set up to search for the name as we
8412                    collapse the stack*/
8413                 popmark = ST.mark_name;
8414             } else {
8415                 /* (*CUT) - No name, we cut here.*/
8416                 if (ST.mark_loc > startpoint)
8417                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8418                 /* but we set sv_commit to latest mark_name if there
8419                    is one so they can test to see how things lead to this
8420                    cut */
8421                 if (mark_state)
8422                     sv_commit=mark_state->u.mark.mark_name;
8423             }
8424             no_final = 1;
8425             sayNO;
8426             NOT_REACHED; /* NOTREACHED */
8427 #undef ST
8428
8429         case LNBREAK: /* \R */
8430             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8431                 locinput += n;
8432             } else
8433                 sayNO;
8434             break;
8435
8436         default:
8437             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
8438                           PTR2UV(scan), OP(scan));
8439             Perl_croak(aTHX_ "regexp memory corruption");
8440
8441         /* this is a point to jump to in order to increment
8442          * locinput by one character */
8443           increment_locinput:
8444             assert(!NEXTCHR_IS_EOS);
8445             if (utf8_target) {
8446                 locinput += PL_utf8skip[nextchr];
8447                 /* locinput is allowed to go 1 char off the end, but not 2+ */
8448                 if (locinput > reginfo->strend)
8449                     sayNO;
8450             }
8451             else
8452                 locinput++;
8453             break;
8454
8455         } /* end switch */
8456
8457         /* switch break jumps here */
8458         scan = next; /* prepare to execute the next op and ... */
8459         continue;    /* ... jump back to the top, reusing st */
8460         /* NOTREACHED */
8461
8462       push_yes_state:
8463         /* push a state that backtracks on success */
8464         st->u.yes.prev_yes_state = yes_state;
8465         yes_state = st;
8466         /* FALLTHROUGH */
8467       push_state:
8468         /* push a new regex state, then continue at scan  */
8469         {
8470             regmatch_state *newst;
8471
8472             DEBUG_STACK_r({
8473                 regmatch_state *cur = st;
8474                 regmatch_state *curyes = yes_state;
8475                 int curd = depth;
8476                 regmatch_slab *slab = PL_regmatch_slab;
8477                 for (;curd > -1 && (depth-curd < 3);cur--,curd--) {
8478                     if (cur < SLAB_FIRST(slab)) {
8479                         slab = slab->prev;
8480                         cur = SLAB_LAST(slab);
8481                     }
8482                     Perl_re_exec_indentf( aTHX_ "#%-3d %-10s %s\n",
8483                         depth,
8484                         curd, PL_reg_name[cur->resume_state],
8485                         (curyes == cur) ? "yes" : ""
8486                     );
8487                     if (curyes == cur)
8488                         curyes = cur->u.yes.prev_yes_state;
8489                 }
8490             } else
8491                 DEBUG_STATE_pp("push")
8492             );
8493             depth++;
8494             st->locinput = locinput;
8495             newst = st+1;
8496             if (newst >  SLAB_LAST(PL_regmatch_slab))
8497                 newst = S_push_slab(aTHX);
8498             PL_regmatch_state = newst;
8499
8500             locinput = pushinput;
8501             st = newst;
8502             continue;
8503             /* NOTREACHED */
8504         }
8505     }
8506 #ifdef SOLARIS_BAD_OPTIMIZER
8507 #  undef PL_charclass
8508 #endif
8509
8510     /*
8511     * We get here only if there's trouble -- normally "case END" is
8512     * the terminating point.
8513     */
8514     Perl_croak(aTHX_ "corrupted regexp pointers");
8515     NOT_REACHED; /* NOTREACHED */
8516
8517   yes:
8518     if (yes_state) {
8519         /* we have successfully completed a subexpression, but we must now
8520          * pop to the state marked by yes_state and continue from there */
8521         assert(st != yes_state);
8522 #ifdef DEBUGGING
8523         while (st != yes_state) {
8524             st--;
8525             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8526                 PL_regmatch_slab = PL_regmatch_slab->prev;
8527                 st = SLAB_LAST(PL_regmatch_slab);
8528             }
8529             DEBUG_STATE_r({
8530                 if (no_final) {
8531                     DEBUG_STATE_pp("pop (no final)");
8532                 } else {
8533                     DEBUG_STATE_pp("pop (yes)");
8534                 }
8535             });
8536             depth--;
8537         }
8538 #else
8539         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8540             || yes_state > SLAB_LAST(PL_regmatch_slab))
8541         {
8542             /* not in this slab, pop slab */
8543             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8544             PL_regmatch_slab = PL_regmatch_slab->prev;
8545             st = SLAB_LAST(PL_regmatch_slab);
8546         }
8547         depth -= (st - yes_state);
8548 #endif
8549         st = yes_state;
8550         yes_state = st->u.yes.prev_yes_state;
8551         PL_regmatch_state = st;
8552
8553         if (no_final)
8554             locinput= st->locinput;
8555         state_num = st->resume_state + no_final;
8556         goto reenter_switch;
8557     }
8558
8559     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
8560                           PL_colors[4], PL_colors[5]));
8561
8562     if (reginfo->info_aux_eval) {
8563         /* each successfully executed (?{...}) block does the equivalent of
8564          *   local $^R = do {...}
8565          * When popping the save stack, all these locals would be undone;
8566          * bypass this by setting the outermost saved $^R to the latest
8567          * value */
8568         /* I dont know if this is needed or works properly now.
8569          * see code related to PL_replgv elsewhere in this file.
8570          * Yves
8571          */
8572         if (oreplsv != GvSV(PL_replgv))
8573             sv_setsv(oreplsv, GvSV(PL_replgv));
8574     }
8575     result = 1;
8576     goto final_exit;
8577
8578   no:
8579     DEBUG_EXECUTE_r(
8580         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
8581             depth,
8582             PL_colors[4], PL_colors[5])
8583         );
8584
8585   no_silent:
8586     if (no_final) {
8587         if (yes_state) {
8588             goto yes;
8589         } else {
8590             goto final_exit;
8591         }
8592     }
8593     if (depth) {
8594         /* there's a previous state to backtrack to */
8595         st--;
8596         if (st < SLAB_FIRST(PL_regmatch_slab)) {
8597             PL_regmatch_slab = PL_regmatch_slab->prev;
8598             st = SLAB_LAST(PL_regmatch_slab);
8599         }
8600         PL_regmatch_state = st;
8601         locinput= st->locinput;
8602
8603         DEBUG_STATE_pp("pop");
8604         depth--;
8605         if (yes_state == st)
8606             yes_state = st->u.yes.prev_yes_state;
8607
8608         state_num = st->resume_state + 1; /* failure = success + 1 */
8609         PERL_ASYNC_CHECK();
8610         goto reenter_switch;
8611     }
8612     result = 0;
8613
8614   final_exit:
8615     if (rex->intflags & PREGf_VERBARG_SEEN) {
8616         SV *sv_err = get_sv("REGERROR", 1);
8617         SV *sv_mrk = get_sv("REGMARK", 1);
8618         if (result) {
8619             sv_commit = &PL_sv_no;
8620             if (!sv_yes_mark)
8621                 sv_yes_mark = &PL_sv_yes;
8622         } else {
8623             if (!sv_commit)
8624                 sv_commit = &PL_sv_yes;
8625             sv_yes_mark = &PL_sv_no;
8626         }
8627         assert(sv_err);
8628         assert(sv_mrk);
8629         sv_setsv(sv_err, sv_commit);
8630         sv_setsv(sv_mrk, sv_yes_mark);
8631     }
8632
8633
8634     if (last_pushed_cv) {
8635         dSP;
8636         POP_MULTICALL;
8637         PERL_UNUSED_VAR(SP);
8638     }
8639
8640     assert(!result ||  locinput - reginfo->strbeg >= 0);
8641     return result ?  locinput - reginfo->strbeg : -1;
8642 }
8643
8644 /*
8645  - regrepeat - repeatedly match something simple, report how many
8646  *
8647  * What 'simple' means is a node which can be the operand of a quantifier like
8648  * '+', or {1,3}
8649  *
8650  * startposp - pointer a pointer to the start position.  This is updated
8651  *             to point to the byte following the highest successful
8652  *             match.
8653  * p         - the regnode to be repeatedly matched against.
8654  * reginfo   - struct holding match state, such as strend
8655  * max       - maximum number of things to match.
8656  * depth     - (for debugging) backtracking depth.
8657  */
8658 STATIC I32
8659 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8660             regmatch_info *const reginfo, I32 max _pDEPTH)
8661 {
8662     char *scan;     /* Pointer to current position in target string */
8663     I32 c;
8664     char *loceol = reginfo->strend;   /* local version */
8665     I32 hardcount = 0;  /* How many matches so far */
8666     bool utf8_target = reginfo->is_utf8_target;
8667     unsigned int to_complement = 0;  /* Invert the result? */
8668     UV utf8_flags;
8669     _char_class_number classnum;
8670
8671     PERL_ARGS_ASSERT_REGREPEAT;
8672
8673     scan = *startposp;
8674     if (max == REG_INFTY)
8675         max = I32_MAX;
8676     else if (! utf8_target && loceol - scan > max)
8677         loceol = scan + max;
8678
8679     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8680      * to the maximum of how far we should go in it (leaving it set to the real
8681      * end, if the maximum permissible would take us beyond that).  This allows
8682      * us to make the loop exit condition that we haven't gone past <loceol> to
8683      * also mean that we haven't exceeded the max permissible count, saving a
8684      * test each time through the loop.  But it assumes that the OP matches a
8685      * single byte, which is true for most of the OPs below when applied to a
8686      * non-UTF-8 target.  Those relatively few OPs that don't have this
8687      * characteristic will have to compensate.
8688      *
8689      * There is no adjustment for UTF-8 targets, as the number of bytes per
8690      * character varies.  OPs will have to test both that the count is less
8691      * than the max permissible (using <hardcount> to keep track), and that we
8692      * are still within the bounds of the string (using <loceol>.  A few OPs
8693      * match a single byte no matter what the encoding.  They can omit the max
8694      * test if, for the UTF-8 case, they do the adjustment that was skipped
8695      * above.
8696      *
8697      * Thus, the code above sets things up for the common case; and exceptional
8698      * cases need extra work; the common case is to make sure <scan> doesn't
8699      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8700      * count doesn't exceed the maximum permissible */
8701
8702     switch (OP(p)) {
8703     case REG_ANY:
8704         if (utf8_target) {
8705             while (scan < loceol && hardcount < max && *scan != '\n') {
8706                 scan += UTF8SKIP(scan);
8707                 hardcount++;
8708             }
8709         } else {
8710             while (scan < loceol && *scan != '\n')
8711                 scan++;
8712         }
8713         break;
8714     case SANY:
8715         if (utf8_target) {
8716             while (scan < loceol && hardcount < max) {
8717                 scan += UTF8SKIP(scan);
8718                 hardcount++;
8719             }
8720         }
8721         else
8722             scan = loceol;
8723         break;
8724     case EXACTL:
8725         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8726         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8727             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8728         }
8729         /* FALLTHROUGH */
8730     case EXACT:
8731         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8732
8733         c = (U8)*STRING(p);
8734
8735         /* Can use a simple loop if the pattern char to match on is invariant
8736          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8737          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8738          * true iff it doesn't matter if the argument is in UTF-8 or not */
8739         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8740             if (utf8_target && loceol - scan > max) {
8741                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8742                  * since here, to match at all, 1 char == 1 byte */
8743                 loceol = scan + max;
8744             }
8745             while (scan < loceol && UCHARAT(scan) == c) {
8746                 scan++;
8747             }
8748         }
8749         else if (reginfo->is_utf8_pat) {
8750             if (utf8_target) {
8751                 STRLEN scan_char_len;
8752
8753                 /* When both target and pattern are UTF-8, we have to do
8754                  * string EQ */
8755                 while (hardcount < max
8756                        && scan < loceol
8757                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8758                        && memEQ(scan, STRING(p), scan_char_len))
8759                 {
8760                     scan += scan_char_len;
8761                     hardcount++;
8762                 }
8763             }
8764             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8765
8766                 /* Target isn't utf8; convert the character in the UTF-8
8767                  * pattern to non-UTF8, and do a simple loop */
8768                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8769                 while (scan < loceol && UCHARAT(scan) == c) {
8770                     scan++;
8771                 }
8772             } /* else pattern char is above Latin1, can't possibly match the
8773                  non-UTF-8 target */
8774         }
8775         else {
8776
8777             /* Here, the string must be utf8; pattern isn't, and <c> is
8778              * different in utf8 than not, so can't compare them directly.
8779              * Outside the loop, find the two utf8 bytes that represent c, and
8780              * then look for those in sequence in the utf8 string */
8781             U8 high = UTF8_TWO_BYTE_HI(c);
8782             U8 low = UTF8_TWO_BYTE_LO(c);
8783
8784             while (hardcount < max
8785                     && scan + 1 < loceol
8786                     && UCHARAT(scan) == high
8787                     && UCHARAT(scan + 1) == low)
8788             {
8789                 scan += 2;
8790                 hardcount++;
8791             }
8792         }
8793         break;
8794
8795     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8796         assert(! reginfo->is_utf8_pat);
8797         /* FALLTHROUGH */
8798     case EXACTFA:
8799         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8800         goto do_exactf;
8801
8802     case EXACTFL:
8803         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8804         utf8_flags = FOLDEQ_LOCALE;
8805         goto do_exactf;
8806
8807     case EXACTF:   /* This node only generated for non-utf8 patterns */
8808         assert(! reginfo->is_utf8_pat);
8809         utf8_flags = 0;
8810         goto do_exactf;
8811
8812     case EXACTFLU8:
8813         if (! utf8_target) {
8814             break;
8815         }
8816         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8817                                     | FOLDEQ_S2_FOLDS_SANE;
8818         goto do_exactf;
8819
8820     case EXACTFU_SS:
8821     case EXACTFU:
8822         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8823
8824       do_exactf: {
8825         int c1, c2;
8826         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8827
8828         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8829
8830         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8831                                         reginfo))
8832         {
8833             if (c1 == CHRTEST_VOID) {
8834                 /* Use full Unicode fold matching */
8835                 char *tmpeol = reginfo->strend;
8836                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8837                 while (hardcount < max
8838                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8839                                              STRING(p), NULL, pat_len,
8840                                              reginfo->is_utf8_pat, utf8_flags))
8841                 {
8842                     scan = tmpeol;
8843                     tmpeol = reginfo->strend;
8844                     hardcount++;
8845                 }
8846             }
8847             else if (utf8_target) {
8848                 if (c1 == c2) {
8849                     while (scan < loceol
8850                            && hardcount < max
8851                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8852                     {
8853                         scan += UTF8SKIP(scan);
8854                         hardcount++;
8855                     }
8856                 }
8857                 else {
8858                     while (scan < loceol
8859                            && hardcount < max
8860                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8861                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8862                     {
8863                         scan += UTF8SKIP(scan);
8864                         hardcount++;
8865                     }
8866                 }
8867             }
8868             else if (c1 == c2) {
8869                 while (scan < loceol && UCHARAT(scan) == c1) {
8870                     scan++;
8871                 }
8872             }
8873             else {
8874                 while (scan < loceol &&
8875                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8876                 {
8877                     scan++;
8878                 }
8879             }
8880         }
8881         break;
8882     }
8883     case ANYOFL:
8884         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8885
8886         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
8887             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8888         }
8889         /* FALLTHROUGH */
8890     case ANYOFD:
8891     case ANYOF:
8892         if (utf8_target) {
8893             while (hardcount < max
8894                    && scan < loceol
8895                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8896             {
8897                 scan += UTF8SKIP(scan);
8898                 hardcount++;
8899             }
8900         } else {
8901             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan, 0))
8902                 scan++;
8903         }
8904         break;
8905
8906     /* The argument (FLAGS) to all the POSIX node types is the class number */
8907
8908     case NPOSIXL:
8909         to_complement = 1;
8910         /* FALLTHROUGH */
8911
8912     case POSIXL:
8913         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8914         if (! utf8_target) {
8915             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8916                                                                    *scan)))
8917             {
8918                 scan++;
8919             }
8920         } else {
8921             while (hardcount < max && scan < loceol
8922                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8923                                                                   (U8 *) scan)))
8924             {
8925                 scan += UTF8SKIP(scan);
8926                 hardcount++;
8927             }
8928         }
8929         break;
8930
8931     case POSIXD:
8932         if (utf8_target) {
8933             goto utf8_posix;
8934         }
8935         /* FALLTHROUGH */
8936
8937     case POSIXA:
8938         if (utf8_target && loceol - scan > max) {
8939
8940             /* We didn't adjust <loceol> at the beginning of this routine
8941              * because is UTF-8, but it is actually ok to do so, since here, to
8942              * match, 1 char == 1 byte. */
8943             loceol = scan + max;
8944         }
8945         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8946             scan++;
8947         }
8948         break;
8949
8950     case NPOSIXD:
8951         if (utf8_target) {
8952             to_complement = 1;
8953             goto utf8_posix;
8954         }
8955         /* FALLTHROUGH */
8956
8957     case NPOSIXA:
8958         if (! utf8_target) {
8959             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8960                 scan++;
8961             }
8962         }
8963         else {
8964
8965             /* The complement of something that matches only ASCII matches all
8966              * non-ASCII, plus everything in ASCII that isn't in the class. */
8967             while (hardcount < max && scan < loceol
8968                    && (! isASCII_utf8(scan)
8969                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8970             {
8971                 scan += UTF8SKIP(scan);
8972                 hardcount++;
8973             }
8974         }
8975         break;
8976
8977     case NPOSIXU:
8978         to_complement = 1;
8979         /* FALLTHROUGH */
8980
8981     case POSIXU:
8982         if (! utf8_target) {
8983             while (scan < loceol && to_complement
8984                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8985             {
8986                 scan++;
8987             }
8988         }
8989         else {
8990           utf8_posix:
8991             classnum = (_char_class_number) FLAGS(p);
8992             if (classnum < _FIRST_NON_SWASH_CC) {
8993
8994                 /* Here, a swash is needed for above-Latin1 code points.
8995                  * Process as many Latin1 code points using the built-in rules.
8996                  * Go to another loop to finish processing upon encountering
8997                  * the first Latin1 code point.  We could do that in this loop
8998                  * as well, but the other way saves having to test if the swash
8999                  * has been loaded every time through the loop: extra space to
9000                  * save a test. */
9001                 while (hardcount < max && scan < loceol) {
9002                     if (UTF8_IS_INVARIANT(*scan)) {
9003                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9004                                                                    classnum))))
9005                         {
9006                             break;
9007                         }
9008                         scan++;
9009                     }
9010                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9011                         if (! (to_complement
9012                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9013                                                                      *(scan + 1)),
9014                                                     classnum))))
9015                         {
9016                             break;
9017                         }
9018                         scan += 2;
9019                     }
9020                     else {
9021                         goto found_above_latin1;
9022                     }
9023
9024                     hardcount++;
9025                 }
9026             }
9027             else {
9028                 /* For these character classes, the knowledge of how to handle
9029                  * every code point is compiled in to Perl via a macro.  This
9030                  * code is written for making the loops as tight as possible.
9031                  * It could be refactored to save space instead */
9032                 switch (classnum) {
9033                     case _CC_ENUM_SPACE:
9034                         while (hardcount < max
9035                                && scan < loceol
9036                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
9037                         {
9038                             scan += UTF8SKIP(scan);
9039                             hardcount++;
9040                         }
9041                         break;
9042                     case _CC_ENUM_BLANK:
9043                         while (hardcount < max
9044                                && scan < loceol
9045                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
9046                         {
9047                             scan += UTF8SKIP(scan);
9048                             hardcount++;
9049                         }
9050                         break;
9051                     case _CC_ENUM_XDIGIT:
9052                         while (hardcount < max
9053                                && scan < loceol
9054                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
9055                         {
9056                             scan += UTF8SKIP(scan);
9057                             hardcount++;
9058                         }
9059                         break;
9060                     case _CC_ENUM_VERTSPACE:
9061                         while (hardcount < max
9062                                && scan < loceol
9063                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
9064                         {
9065                             scan += UTF8SKIP(scan);
9066                             hardcount++;
9067                         }
9068                         break;
9069                     case _CC_ENUM_CNTRL:
9070                         while (hardcount < max
9071                                && scan < loceol
9072                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
9073                         {
9074                             scan += UTF8SKIP(scan);
9075                             hardcount++;
9076                         }
9077                         break;
9078                     default:
9079                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9080                 }
9081             }
9082         }
9083         break;
9084
9085       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
9086
9087         /* Load the swash if not already present */
9088         if (! PL_utf8_swash_ptrs[classnum]) {
9089             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9090             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9091                                         "utf8",
9092                                         "",
9093                                         &PL_sv_undef, 1, 0,
9094                                         PL_XPosix_ptrs[classnum], &flags);
9095         }
9096
9097         while (hardcount < max && scan < loceol
9098                && to_complement ^ cBOOL(_generic_utf8(
9099                                        classnum,
9100                                        scan,
9101                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
9102                                                    (U8 *) scan,
9103                                                    TRUE))))
9104         {
9105             scan += UTF8SKIP(scan);
9106             hardcount++;
9107         }
9108         break;
9109
9110     case LNBREAK:
9111         if (utf8_target) {
9112             while (hardcount < max && scan < loceol &&
9113                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9114                 scan += c;
9115                 hardcount++;
9116             }
9117         } else {
9118             /* LNBREAK can match one or two latin chars, which is ok, but we
9119              * have to use hardcount in this situation, and throw away the
9120              * adjustment to <loceol> done before the switch statement */
9121             loceol = reginfo->strend;
9122             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9123                 scan+=c;
9124                 hardcount++;
9125             }
9126         }
9127         break;
9128
9129     case BOUNDL:
9130     case NBOUNDL:
9131         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9132         /* FALLTHROUGH */
9133     case BOUND:
9134     case BOUNDA:
9135     case BOUNDU:
9136     case EOS:
9137     case GPOS:
9138     case KEEPS:
9139     case NBOUND:
9140     case NBOUNDA:
9141     case NBOUNDU:
9142     case OPFAIL:
9143     case SBOL:
9144     case SEOL:
9145         /* These are all 0 width, so match right here or not at all. */
9146         break;
9147
9148     default:
9149         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9150         NOT_REACHED; /* NOTREACHED */
9151
9152     }
9153
9154     if (hardcount)
9155         c = hardcount;
9156     else
9157         c = scan - *startposp;
9158     *startposp = scan;
9159
9160     DEBUG_r({
9161         GET_RE_DEBUG_FLAGS_DECL;
9162         DEBUG_EXECUTE_r({
9163             SV * const prop = sv_newmortal();
9164             regprop(prog, prop, p, reginfo, NULL);
9165             Perl_re_exec_indentf( aTHX_  "%s can match %"IVdf" times out of %"IVdf"...\n",
9166                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9167         });
9168     });
9169
9170     return(c);
9171 }
9172
9173
9174 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9175 /*
9176 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9177 create a copy so that changes the caller makes won't change the shared one.
9178 If <altsvp> is non-null, will return NULL in it, for back-compat.
9179  */
9180 SV *
9181 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9182 {
9183     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9184
9185     if (altsvp) {
9186         *altsvp = NULL;
9187     }
9188
9189     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9190 }
9191
9192 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9193
9194 /*
9195  - reginclass - determine if a character falls into a character class
9196
9197   n is the ANYOF-type regnode
9198   p is the target string
9199   p_end points to one byte beyond the end of the target string
9200   utf8_target tells whether p is in UTF-8.
9201
9202   Returns true if matched; false otherwise.
9203
9204   Note that this can be a synthetic start class, a combination of various
9205   nodes, so things you think might be mutually exclusive, such as locale,
9206   aren't.  It can match both locale and non-locale
9207
9208  */
9209
9210 STATIC bool
9211 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9212 {
9213     dVAR;
9214     const char flags = ANYOF_FLAGS(n);
9215     bool match = FALSE;
9216     UV c = *p;
9217
9218     PERL_ARGS_ASSERT_REGINCLASS;
9219
9220     /* If c is not already the code point, get it.  Note that
9221      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9222     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9223         STRLEN c_len = 0;
9224         c = utf8n_to_uvchr(p, p_end - p, &c_len,
9225                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
9226                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
9227                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
9228                  * UTF8_ALLOW_FFFF */
9229         if (c_len == (STRLEN)-1)
9230             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
9231         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9232             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9233         }
9234     }
9235
9236     /* If this character is potentially in the bitmap, check it */
9237     if (c < NUM_ANYOF_CODE_POINTS) {
9238         if (ANYOF_BITMAP_TEST(n, c))
9239             match = TRUE;
9240         else if ((flags
9241                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9242                   && OP(n) == ANYOFD
9243                   && ! utf8_target
9244                   && ! isASCII(c))
9245         {
9246             match = TRUE;
9247         }
9248         else if (flags & ANYOF_LOCALE_FLAGS) {
9249             if ((flags & ANYOFL_FOLD)
9250                 && c < 256
9251                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9252             {
9253                 match = TRUE;
9254             }
9255             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9256                      && c < 256
9257             ) {
9258
9259                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9260                  * if the class includes the Posix character class given by
9261                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9262                  * complemented Posix class given by int(bit/2).  So we loop
9263                  * through the bits, each time changing whether we complement
9264                  * the result or not.  Suppose for the sake of illustration
9265                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9266                  * is set, it means there is a match for this ANYOF node if the
9267                  * character is in the class given by the expression (0 / 2 = 0
9268                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9269                  * and since 'to_complement' is 0, the result will stay TRUE,
9270                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9271                  * bit 1 is 1.  That means there is a match if the character
9272                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9273                  * but will on bit 1.  On the second iteration 'to_complement'
9274                  * will be 1, so the exclusive or will reverse things, so we
9275                  * are testing for \W.  On the third iteration, 'to_complement'
9276                  * will be 0, and we would be testing for \s; the fourth
9277                  * iteration would test for \S, etc.
9278                  *
9279                  * Note that this code assumes that all the classes are closed
9280                  * under folding.  For example, if a character matches \w, then
9281                  * its fold does too; and vice versa.  This should be true for
9282                  * any well-behaved locale for all the currently defined Posix
9283                  * classes, except for :lower: and :upper:, which are handled
9284                  * by the pseudo-class :cased: which matches if either of the
9285                  * other two does.  To get rid of this assumption, an outer
9286                  * loop could be used below to iterate over both the source
9287                  * character, and its fold (if different) */
9288
9289                 int count = 0;
9290                 int to_complement = 0;
9291
9292                 while (count < ANYOF_MAX) {
9293                     if (ANYOF_POSIXL_TEST(n, count)
9294                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9295                     {
9296                         match = TRUE;
9297                         break;
9298                     }
9299                     count++;
9300                     to_complement ^= 1;
9301                 }
9302             }
9303         }
9304     }
9305
9306
9307     /* If the bitmap didn't (or couldn't) match, and something outside the
9308      * bitmap could match, try that. */
9309     if (!match) {
9310         if (c >= NUM_ANYOF_CODE_POINTS
9311             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9312         {
9313             match = TRUE;       /* Everything above the bitmap matches */
9314         }
9315             /* Here doesn't match everything above the bitmap.  If there is
9316              * some information available beyond the bitmap, we may find a
9317              * match in it.  If so, this is most likely because the code point
9318              * is outside the bitmap range.  But rarely, it could be because of
9319              * some other reason.  If so, various flags are set to indicate
9320              * this possibility.  On ANYOFD nodes, there may be matches that
9321              * happen only when the target string is UTF-8; or for other node
9322              * types, because runtime lookup is needed, regardless of the
9323              * UTF-8ness of the target string.  Finally, under /il, there may
9324              * be some matches only possible if the locale is a UTF-8 one. */
9325         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9326                  && (   c >= NUM_ANYOF_CODE_POINTS
9327                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9328                          && (   UNLIKELY(OP(n) != ANYOFD)
9329                              || (utf8_target && ! isASCII_uni(c)
9330 #                               if NUM_ANYOF_CODE_POINTS > 256
9331                                                                  && c < 256
9332 #                               endif
9333                                 )))
9334                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9335                          && IN_UTF8_CTYPE_LOCALE)))
9336         {
9337             SV* only_utf8_locale = NULL;
9338             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9339                                                        &only_utf8_locale, NULL);
9340             if (sw) {
9341                 U8 utf8_buffer[2];
9342                 U8 * utf8_p;
9343                 if (utf8_target) {
9344                     utf8_p = (U8 *) p;
9345                 } else { /* Convert to utf8 */
9346                     utf8_p = utf8_buffer;
9347                     append_utf8_from_native_byte(*p, &utf8_p);
9348                     utf8_p = utf8_buffer;
9349                 }
9350
9351                 if (swash_fetch(sw, utf8_p, TRUE)) {
9352                     match = TRUE;
9353                 }
9354             }
9355             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9356                 match = _invlist_contains_cp(only_utf8_locale, c);
9357             }
9358         }
9359
9360         if (UNICODE_IS_SUPER(c)
9361             && (flags
9362                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9363             && OP(n) != ANYOFD
9364             && ckWARN_d(WARN_NON_UNICODE))
9365         {
9366             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9367                 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
9368         }
9369     }
9370
9371 #if ANYOF_INVERT != 1
9372     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9373      * use it, don't */
9374 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9375 #endif
9376
9377     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9378     return (flags & ANYOF_INVERT) ^ match;
9379 }
9380
9381 STATIC U8 *
9382 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9383 {
9384     /* return the position 'off' UTF-8 characters away from 's', forward if
9385      * 'off' >= 0, backwards if negative.  But don't go outside of position
9386      * 'lim', which better be < s  if off < 0 */
9387
9388     PERL_ARGS_ASSERT_REGHOP3;
9389
9390     if (off >= 0) {
9391         while (off-- && s < lim) {
9392             /* XXX could check well-formedness here */
9393             s += UTF8SKIP(s);
9394         }
9395     }
9396     else {
9397         while (off++ && s > lim) {
9398             s--;
9399             if (UTF8_IS_CONTINUED(*s)) {
9400                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9401                     s--;
9402                 if (! UTF8_IS_START(*s)) {
9403                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9404                 }
9405             }
9406             /* XXX could check well-formedness here */
9407         }
9408     }
9409     return s;
9410 }
9411
9412 STATIC U8 *
9413 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9414 {
9415     PERL_ARGS_ASSERT_REGHOP4;
9416
9417     if (off >= 0) {
9418         while (off-- && s < rlim) {
9419             /* XXX could check well-formedness here */
9420             s += UTF8SKIP(s);
9421         }
9422     }
9423     else {
9424         while (off++ && s > llim) {
9425             s--;
9426             if (UTF8_IS_CONTINUED(*s)) {
9427                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9428                     s--;
9429                 if (! UTF8_IS_START(*s)) {
9430                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9431                 }
9432             }
9433             /* XXX could check well-formedness here */
9434         }
9435     }
9436     return s;
9437 }
9438
9439 /* like reghop3, but returns NULL on overrun, rather than returning last
9440  * char pos */
9441
9442 STATIC U8 *
9443 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9444 {
9445     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9446
9447     if (off >= 0) {
9448         while (off-- && s < lim) {
9449             /* XXX could check well-formedness here */
9450             s += UTF8SKIP(s);
9451         }
9452         if (off >= 0)
9453             return NULL;
9454     }
9455     else {
9456         while (off++ && s > lim) {
9457             s--;
9458             if (UTF8_IS_CONTINUED(*s)) {
9459                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9460                     s--;
9461                 if (! UTF8_IS_START(*s)) {
9462                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9463                 }
9464             }
9465             /* XXX could check well-formedness here */
9466         }
9467         if (off <= 0)
9468             return NULL;
9469     }
9470     return s;
9471 }
9472
9473
9474 /* when executing a regex that may have (?{}), extra stuff needs setting
9475    up that will be visible to the called code, even before the current
9476    match has finished. In particular:
9477
9478    * $_ is localised to the SV currently being matched;
9479    * pos($_) is created if necessary, ready to be updated on each call-out
9480      to code;
9481    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9482      isn't set until the current pattern is successfully finished), so that
9483      $1 etc of the match-so-far can be seen;
9484    * save the old values of subbeg etc of the current regex, and  set then
9485      to the current string (again, this is normally only done at the end
9486      of execution)
9487 */
9488
9489 static void
9490 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9491 {
9492     MAGIC *mg;
9493     regexp *const rex = ReANY(reginfo->prog);
9494     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9495
9496     eval_state->rex = rex;
9497
9498     if (reginfo->sv) {
9499         /* Make $_ available to executed code. */
9500         if (reginfo->sv != DEFSV) {
9501             SAVE_DEFSV;
9502             DEFSV_set(reginfo->sv);
9503         }
9504
9505         if (!(mg = mg_find_mglob(reginfo->sv))) {
9506             /* prepare for quick setting of pos */
9507             mg = sv_magicext_mglob(reginfo->sv);
9508             mg->mg_len = -1;
9509         }
9510         eval_state->pos_magic = mg;
9511         eval_state->pos       = mg->mg_len;
9512         eval_state->pos_flags = mg->mg_flags;
9513     }
9514     else
9515         eval_state->pos_magic = NULL;
9516
9517     if (!PL_reg_curpm) {
9518         /* PL_reg_curpm is a fake PMOP that we can attach the current
9519          * regex to and point PL_curpm at, so that $1 et al are visible
9520          * within a /(?{})/. It's just allocated once per interpreter the
9521          * first time its needed */
9522         Newxz(PL_reg_curpm, 1, PMOP);
9523 #ifdef USE_ITHREADS
9524         {
9525             SV* const repointer = &PL_sv_undef;
9526             /* this regexp is also owned by the new PL_reg_curpm, which
9527                will try to free it.  */
9528             av_push(PL_regex_padav, repointer);
9529             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9530             PL_regex_pad = AvARRAY(PL_regex_padav);
9531         }
9532 #endif
9533     }
9534     SET_reg_curpm(reginfo->prog);
9535     eval_state->curpm = PL_curpm;
9536     PL_curpm_under = PL_curpm;
9537     PL_curpm = PL_reg_curpm;
9538     if (RXp_MATCH_COPIED(rex)) {
9539         /*  Here is a serious problem: we cannot rewrite subbeg,
9540             since it may be needed if this match fails.  Thus
9541             $` inside (?{}) could fail... */
9542         eval_state->subbeg     = rex->subbeg;
9543         eval_state->sublen     = rex->sublen;
9544         eval_state->suboffset  = rex->suboffset;
9545         eval_state->subcoffset = rex->subcoffset;
9546 #ifdef PERL_ANY_COW
9547         eval_state->saved_copy = rex->saved_copy;
9548 #endif
9549         RXp_MATCH_COPIED_off(rex);
9550     }
9551     else
9552         eval_state->subbeg = NULL;
9553     rex->subbeg = (char *)reginfo->strbeg;
9554     rex->suboffset = 0;
9555     rex->subcoffset = 0;
9556     rex->sublen = reginfo->strend - reginfo->strbeg;
9557 }
9558
9559
9560 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
9561
9562 static void
9563 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
9564 {
9565     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9566     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
9567     regmatch_slab *s;
9568
9569     Safefree(aux->poscache);
9570
9571     if (eval_state) {
9572
9573         /* undo the effects of S_setup_eval_state() */
9574
9575         if (eval_state->subbeg) {
9576             regexp * const rex = eval_state->rex;
9577             rex->subbeg     = eval_state->subbeg;
9578             rex->sublen     = eval_state->sublen;
9579             rex->suboffset  = eval_state->suboffset;
9580             rex->subcoffset = eval_state->subcoffset;
9581 #ifdef PERL_ANY_COW
9582             rex->saved_copy = eval_state->saved_copy;
9583 #endif
9584             RXp_MATCH_COPIED_on(rex);
9585         }
9586         if (eval_state->pos_magic)
9587         {
9588             eval_state->pos_magic->mg_len = eval_state->pos;
9589             eval_state->pos_magic->mg_flags =
9590                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9591                | (eval_state->pos_flags & MGf_BYTES);
9592         }
9593
9594         PL_curpm = eval_state->curpm;
9595     }
9596
9597     PL_regmatch_state = aux->old_regmatch_state;
9598     PL_regmatch_slab  = aux->old_regmatch_slab;
9599
9600     /* free all slabs above current one - this must be the last action
9601      * of this function, as aux and eval_state are allocated within
9602      * slabs and may be freed here */
9603
9604     s = PL_regmatch_slab->next;
9605     if (s) {
9606         PL_regmatch_slab->next = NULL;
9607         while (s) {
9608             regmatch_slab * const osl = s;
9609             s = s->next;
9610             Safefree(osl);
9611         }
9612     }
9613 }
9614
9615
9616 STATIC void
9617 S_to_utf8_substr(pTHX_ regexp *prog)
9618 {
9619     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9620      * on the converted value */
9621
9622     int i = 1;
9623
9624     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9625
9626     do {
9627         if (prog->substrs->data[i].substr
9628             && !prog->substrs->data[i].utf8_substr) {
9629             SV* const sv = newSVsv(prog->substrs->data[i].substr);
9630             prog->substrs->data[i].utf8_substr = sv;
9631             sv_utf8_upgrade(sv);
9632             if (SvVALID(prog->substrs->data[i].substr)) {
9633                 if (SvTAIL(prog->substrs->data[i].substr)) {
9634                     /* Trim the trailing \n that fbm_compile added last
9635                        time.  */
9636                     SvCUR_set(sv, SvCUR(sv) - 1);
9637                     /* Whilst this makes the SV technically "invalid" (as its
9638                        buffer is no longer followed by "\0") when fbm_compile()
9639                        adds the "\n" back, a "\0" is restored.  */
9640                     fbm_compile(sv, FBMcf_TAIL);
9641                 } else
9642                     fbm_compile(sv, 0);
9643             }
9644             if (prog->substrs->data[i].substr == prog->check_substr)
9645                 prog->check_utf8 = sv;
9646         }
9647     } while (i--);
9648 }
9649
9650 STATIC bool
9651 S_to_byte_substr(pTHX_ regexp *prog)
9652 {
9653     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9654      * on the converted value; returns FALSE if can't be converted. */
9655
9656     int i = 1;
9657
9658     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9659
9660     do {
9661         if (prog->substrs->data[i].utf8_substr
9662             && !prog->substrs->data[i].substr) {
9663             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9664             if (! sv_utf8_downgrade(sv, TRUE)) {
9665                 return FALSE;
9666             }
9667             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9668                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9669                     /* Trim the trailing \n that fbm_compile added last
9670                         time.  */
9671                     SvCUR_set(sv, SvCUR(sv) - 1);
9672                     fbm_compile(sv, FBMcf_TAIL);
9673                 } else
9674                     fbm_compile(sv, 0);
9675             }
9676             prog->substrs->data[i].substr = sv;
9677             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9678                 prog->check_substr = sv;
9679         }
9680     } while (i--);
9681
9682     return TRUE;
9683 }
9684
9685 /*
9686  * ex: set ts=8 sts=4 sw=4 et:
9687  */