regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid only if 'c', the character being looke-up, is an invariant under
 111  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 112  * everything matchable is straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 114                                 ? reginclass(prog,p,c,c+1,u)                \
 115                                 : ANYOF_BITMAP_TEST(p,*(c)))
 116
 117 /*
 118  * Forwards.
 119  */
 120
 121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 122
 123 #define HOPc(pos,off) \
 124         (char *)(reginfo->is_utf8_target \
 125             ? reghop3((U8*)pos, off, \
 126                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 127             : (U8*)(pos + off))
 128
 129 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 130 #define HOPBACK3(pos, off, lim) \
 131         (reginfo->is_utf8_target                          \
 132             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 133             : (pos - off >= lim)                                 \
 134                 ? (U8*)pos - off                                 \
 135                 : NULL)
 136
 137 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 138
 139 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 140 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 141
 142 /* lim must be +ve. Returns NULL on overshoot */
 143 #define HOPMAYBE3(pos,off,lim) \
 144         (reginfo->is_utf8_target                        \
 145             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 146             : ((U8*)pos + off <= lim)                   \
 147                 ? (U8*)pos + off                        \
 148                 : NULL)
 149
 150 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 151  * off must be >=0; args should be vars rather than expressions */
 152 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 153     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 154     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 155 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 156
 157 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 158     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 159     : (U8*)(pos + off))
 160 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 161
 162 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 163 #define NEXTCHR_IS_EOS (nextchr < 0)
 164
 165 #define SET_nextchr \
 166     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 167
 168 #define SET_locinput(p) \
 169     locinput = (p);  \
 170     SET_nextchr
 171
 172
 173 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 174         if (!swash_ptr) {                                                     \
 175             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 176             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 177                                          1, 0, invlist, &flags);              \
 178             assert(swash_ptr);                                                \
 179         }                                                                     \
 180     } STMT_END
 181
 182 /* If in debug mode, we test that a known character properly matches */
 183 #ifdef DEBUGGING
 184 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 185                                           property_name,                      \
 186                                           invlist,                            \
 187                                           utf8_char_in_property)              \
 188         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 189         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 190 #else
 191 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 192                                           property_name,                      \
 193                                           invlist,                            \
 194                                           utf8_char_in_property)              \
 195         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 196 #endif
 197
 198 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 199                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 200                                         "",                                   \
 201                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 202                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 203
 204 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 205 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 206
 207 /* for use after a quantifier and before an EXACT-like node -- japhy */
 208 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 209  *
 210  * NOTE that *nothing* that affects backtracking should be in here, specifically
 211  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 212  * node that is in between two EXACT like nodes when ascertaining what the required
 213  * "follow" character is. This should probably be moved to regex compile time
 214  * although it may be done at run time beause of the REF possibility - more
 215  * investigation required. -- demerphq
 216 */
 217 #define JUMPABLE(rn) (                                                             \
 218     OP(rn) == OPEN ||                                                              \
 219     (OP(rn) == CLOSE &&                                                            \
 220      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 221     OP(rn) == EVAL ||                                                              \
 222     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 223     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 224     OP(rn) == KEEPS ||                                                             \
 225     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 226 )
 227 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 228
 229 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 230
 231 #if 0
 232 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 233    we don't need this definition.  XXX These are now out-of-sync*/
 234 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 235 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 236 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 237
 238 #else
 239 /* ... so we use this as its faster. */
 240 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 241 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 242 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 243 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 244
 245 #endif
 246
 247 /*
 248   Search for mandatory following text node; for lookahead, the text must
 249   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 250 */
 251 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 252     while (JUMPABLE(rn)) { \
 253         const OPCODE type = OP(rn); \
 254         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 255             rn = NEXTOPER(NEXTOPER(rn)); \
 256         else if (type == PLUS) \
 257             rn = NEXTOPER(rn); \
 258         else if (type == IFMATCH) \
 259             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 260         else rn += NEXT_OFF(rn); \
 261     } \
 262 } STMT_END
 263
 264 #define SLAB_FIRST(s) (&(s)->states[0])
 265 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 266
 267 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 268 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 269 static regmatch_state * S_push_slab(pTHX);
 270
 271 #define REGCP_PAREN_ELEMS 3
 272 #define REGCP_OTHER_ELEMS 3
 273 #define REGCP_FRAME_ELEMS 1
 274 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 275  * are needed for the regexp context stack bookkeeping. */
 276
 277 STATIC CHECKPOINT
 278 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 279 {
 280     const int retval = PL_savestack_ix;
 281     const int paren_elems_to_push =
 282                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 283     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 284     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 285     I32 p;
 286     GET_RE_DEBUG_FLAGS_DECL;
 287
 288     PERL_ARGS_ASSERT_REGCPPUSH;
 289
 290     if (paren_elems_to_push < 0)
 291         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 292                    (int)paren_elems_to_push, (int)maxopenparen,
 293                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 294
 295     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 296         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 297                    " out of range (%lu-%ld)",
 298                    total_elems,
 299                    (unsigned long)maxopenparen,
 300                    (long)parenfloor);
 301
 302     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 303
 304     DEBUG_BUFFERS_r(
 305         if ((int)maxopenparen > (int)parenfloor)
 306             Perl_re_exec_indentf( aTHX_
 307                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 308                 depth,
 309                 PTR2UV(rex),
 310                 PTR2UV(rex->offs)
 311             );
 312     );
 313     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 314 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 315         SSPUSHIV(rex->offs[p].end);
 316         SSPUSHIV(rex->offs[p].start);
 317         SSPUSHINT(rex->offs[p].start_tmp);
 318         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 319             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 320             depth,
 321             (UV)p,
 322             (IV)rex->offs[p].start,
 323             (IV)rex->offs[p].start_tmp,
 324             (IV)rex->offs[p].end
 325         ));
 326     }
 327 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 328     SSPUSHINT(maxopenparen);
 329     SSPUSHINT(rex->lastparen);
 330     SSPUSHINT(rex->lastcloseparen);
 331     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 332
 333     return retval;
 334 }
 335
 336 /* These are needed since we do not localize EVAL nodes: */
 337 #define REGCP_SET(cp)                                           \
 338     DEBUG_STATE_r(                                              \
 339         Perl_re_exec_indentf( aTHX_                             \
 340             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 341             depth, (IV)PL_savestack_ix                          \
 342         )                                                       \
 343     );                                                          \
 344     cp = PL_savestack_ix
 345
 346 #define REGCP_UNWIND(cp)                                        \
 347     DEBUG_STATE_r(                                              \
 348         if (cp != PL_savestack_ix)                              \
 349             Perl_re_exec_indentf( aTHX_                         \
 350                 "Clearing an EVAL scope, savestack=%"           \
 351                 IVdf "..%" IVdf "\n",                           \
 352                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 353             )                                                   \
 354     );                                                          \
 355     regcpblow(cp)
 356
 357 #define UNWIND_PAREN(lp, lcp)               \
 358     for (n = rex->lastparen; n > lp; n--)   \
 359         rex->offs[n].end = -1;              \
 360     rex->lastparen = n;                     \
 361     rex->lastcloseparen = lcp;
 362
 363
 364 STATIC void
 365 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 366 {
 367     UV i;
 368     U32 paren;
 369     GET_RE_DEBUG_FLAGS_DECL;
 370
 371     PERL_ARGS_ASSERT_REGCPPOP;
 372
 373     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 374     i = SSPOPUV;
 375     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 376     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 377     rex->lastcloseparen = SSPOPINT;
 378     rex->lastparen = SSPOPINT;
 379     *maxopenparen_p = SSPOPINT;
 380
 381     i -= REGCP_OTHER_ELEMS;
 382     /* Now restore the parentheses context. */
 383     DEBUG_BUFFERS_r(
 384         if (i || rex->lastparen + 1 <= rex->nparens)
 385             Perl_re_exec_indentf( aTHX_
 386                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 387                 depth,
 388                 PTR2UV(rex),
 389                 PTR2UV(rex->offs)
 390             );
 391     );
 392     paren = *maxopenparen_p;
 393     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 394         SSize_t tmps;
 395         rex->offs[paren].start_tmp = SSPOPINT;
 396         rex->offs[paren].start = SSPOPIV;
 397         tmps = SSPOPIV;
 398         if (paren <= rex->lastparen)
 399             rex->offs[paren].end = tmps;
 400         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 401             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 402             depth,
 403             (UV)paren,
 404             (IV)rex->offs[paren].start,
 405             (IV)rex->offs[paren].start_tmp,
 406             (IV)rex->offs[paren].end,
 407             (paren > rex->lastparen ? "(skipped)" : ""));
 408         );
 409         paren--;
 410     }
 411 #if 1
 412     /* It would seem that the similar code in regtry()
 413      * already takes care of this, and in fact it is in
 414      * a better location to since this code can #if 0-ed out
 415      * but the code in regtry() is needed or otherwise tests
 416      * requiring null fields (pat.t#187 and split.t#{13,14}
 417      * (as of patchlevel 7877)  will fail.  Then again,
 418      * this code seems to be necessary or otherwise
 419      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 420      * --jhi updated by dapm */
 421     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 422         if (i > *maxopenparen_p)
 423             rex->offs[i].start = -1;
 424         rex->offs[i].end = -1;
 425         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 426             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 427             depth,
 428             (UV)i,
 429             (i > *maxopenparen_p) ? "-1" : "  "
 430         ));
 431     }
 432 #endif
 433 }
 434
 435 /* restore the parens and associated vars at savestack position ix,
 436  * but without popping the stack */
 437
 438 STATIC void
 439 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 440 {
 441     I32 tmpix = PL_savestack_ix;
 442     PERL_ARGS_ASSERT_REGCP_RESTORE;
 443
 444     PL_savestack_ix = ix;
 445     regcppop(rex, maxopenparen_p);
 446     PL_savestack_ix = tmpix;
 447 }
 448
 449 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 450
 451 #ifndef PERL_IN_XSUB_RE
 452
 453 bool
 454 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 455 {
 456     /* Returns a boolean as to whether or not 'character' is a member of the
 457      * Posix character class given by 'classnum' that should be equivalent to a
 458      * value in the typedef '_char_class_number'.
 459      *
 460      * Ideally this could be replaced by a just an array of function pointers
 461      * to the C library functions that implement the macros this calls.
 462      * However, to compile, the precise function signatures are required, and
 463      * these may vary from platform to to platform.  To avoid having to figure
 464      * out what those all are on each platform, I (khw) am using this method,
 465      * which adds an extra layer of function call overhead (unless the C
 466      * optimizer strips it away).  But we don't particularly care about
 467      * performance with locales anyway. */
 468
 469     switch ((_char_class_number) classnum) {
 470         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 471         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 472         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 473         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 474         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 475                                         || isUPPER_LC(character);
 476         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 477         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 478         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 479         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 480         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 481         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 482         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 483         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 484         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 485         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 486         default:    /* VERTSPACE should never occur in locales */
 487             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 488     }
 489
 490     NOT_REACHED; /* NOTREACHED */
 491     return FALSE;
 492 }
 493
 494 #endif
 495
 496 STATIC bool
 497 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 498 {
 499     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 500      * 'character' is a member of the Posix character class given by 'classnum'
 501      * that should be equivalent to a value in the typedef
 502      * '_char_class_number'.
 503      *
 504      * This just calls isFOO_lc on the code point for the character if it is in
 505      * the range 0-255.  Outside that range, all characters use Unicode
 506      * rules, ignoring any locale.  So use the Unicode function if this class
 507      * requires a swash, and use the Unicode macro otherwise. */
 508
 509     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 510
 511     if (UTF8_IS_INVARIANT(*character)) {
 512         return isFOO_lc(classnum, *character);
 513     }
 514     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 515         return isFOO_lc(classnum,
 516                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 517     }
 518
 519     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 520
 521     if (classnum < _FIRST_NON_SWASH_CC) {
 522
 523         /* Initialize the swash unless done already */
 524         if (! PL_utf8_swash_ptrs[classnum]) {
 525             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 526             PL_utf8_swash_ptrs[classnum] =
 527                     _core_swash_init("utf8",
 528                                      "",
 529                                      &PL_sv_undef, 1, 0,
 530                                      PL_XPosix_ptrs[classnum], &flags);
 531         }
 532
 533         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 534                                  character,
 535                                  TRUE /* is UTF */ ));
 536     }
 537
 538     switch ((_char_class_number) classnum) {
 539         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 540         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 541         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 542         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 543         default:                 break;
 544     }
 545
 546     return FALSE; /* Things like CNTRL are always below 256 */
 547 }
 548
 549 /*
 550  * pregexec and friends
 551  */
 552
 553 #ifndef PERL_IN_XSUB_RE
 554 /*
 555  - pregexec - match a regexp against a string
 556  */
 557 I32
 558 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 559          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 560 /* stringarg: the point in the string at which to begin matching */
 561 /* strend:    pointer to null at end of string */
 562 /* strbeg:    real beginning of string */
 563 /* minend:    end of match must be >= minend bytes after stringarg. */
 564 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 565  *            itself is accessed via the pointers above */
 566 /* nosave:    For optimizations. */
 567 {
 568     PERL_ARGS_ASSERT_PREGEXEC;
 569
 570     return
 571         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 572                       nosave ? 0 : REXEC_COPY_STR);
 573 }
 574 #endif
 575
 576
 577
 578 /* re_intuit_start():
 579  *
 580  * Based on some optimiser hints, try to find the earliest position in the
 581  * string where the regex could match.
 582  *
 583  *   rx:     the regex to match against
 584  *   sv:     the SV being matched: only used for utf8 flag; the string
 585  *           itself is accessed via the pointers below. Note that on
 586  *           something like an overloaded SV, SvPOK(sv) may be false
 587  *           and the string pointers may point to something unrelated to
 588  *           the SV itself.
 589  *   strbeg: real beginning of string
 590  *   strpos: the point in the string at which to begin matching
 591  *   strend: pointer to the byte following the last char of the string
 592  *   flags   currently unused; set to 0
 593  *   data:   currently unused; set to NULL
 594  *
 595  * The basic idea of re_intuit_start() is to use some known information
 596  * about the pattern, namely:
 597  *
 598  *   a) the longest known anchored substring (i.e. one that's at a
 599  *      constant offset from the beginning of the pattern; but not
 600  *      necessarily at a fixed offset from the beginning of the
 601  *      string);
 602  *   b) the longest floating substring (i.e. one that's not at a constant
 603  *      offset from the beginning of the pattern);
 604  *   c) Whether the pattern is anchored to the string; either
 605  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 606  *      or anchored to pos(): /\G/;
 607  *   d) A start class: a real or synthetic character class which
 608  *      represents which characters are legal at the start of the pattern;
 609  *
 610  * to either quickly reject the match, or to find the earliest position
 611  * within the string at which the pattern might match, thus avoiding
 612  * running the full NFA engine at those earlier locations, only to
 613  * eventually fail and retry further along.
 614  *
 615  * Returns NULL if the pattern can't match, or returns the address within
 616  * the string which is the earliest place the match could occur.
 617  *
 618  * The longest of the anchored and floating substrings is called 'check'
 619  * and is checked first. The other is called 'other' and is checked
 620  * second. The 'other' substring may not be present.  For example,
 621  *
 622  *    /(abc|xyz)ABC\d{0,3}DEFG/
 623  *
 624  * will have
 625  *
 626  *   check substr (float)    = "DEFG", offset 6..9 chars
 627  *   other substr (anchored) = "ABC",  offset 3..3 chars
 628  *   stclass = [ax]
 629  *
 630  * Be aware that during the course of this function, sometimes 'anchored'
 631  * refers to a substring being anchored relative to the start of the
 632  * pattern, and sometimes to the pattern itself being anchored relative to
 633  * the string. For example:
 634  *
 635  *   /\dabc/:   "abc" is anchored to the pattern;
 636  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 637  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 638  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 639  *                    but the pattern is anchored to the string.
 640  */
 641
 642 char *
 643 Perl_re_intuit_start(pTHX_
 644                     REGEXP * const rx,
 645                     SV *sv,
 646                     const char * const strbeg,
 647                     char *strpos,
 648                     char *strend,
 649                     const U32 flags,
 650                     re_scream_pos_data *data)
 651 {
 652     struct regexp *const prog = ReANY(rx);
 653     SSize_t start_shift = prog->check_offset_min;
 654     /* Should be nonnegative! */
 655     SSize_t end_shift   = 0;
 656     /* current lowest pos in string where the regex can start matching */
 657     char *rx_origin = strpos;
 658     SV *check;
 659     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 660     U8   other_ix = 1 - prog->substrs->check_ix;
 661     bool ml_anch = 0;
 662     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 663     char *check_at = NULL;              /* check substr found at this pos */
 664     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 665     RXi_GET_DECL(prog,progi);
 666     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 667     regmatch_info *const reginfo = &reginfo_buf;
 668     GET_RE_DEBUG_FLAGS_DECL;
 669
 670     PERL_ARGS_ASSERT_RE_INTUIT_START;
 671     PERL_UNUSED_ARG(flags);
 672     PERL_UNUSED_ARG(data);
 673
 674     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 675                 "Intuit: trying to determine minimum start position...\n"));
 676
 677     /* for now, assume that all substr offsets are positive. If at some point
 678      * in the future someone wants to do clever things with lookbehind and
 679      * -ve offsets, they'll need to fix up any code in this function
 680      * which uses these offsets. See the thread beginning
 681      * <20140113145929.GF27210@iabyn.com>
 682      */
 683     assert(prog->substrs->data[0].min_offset >= 0);
 684     assert(prog->substrs->data[0].max_offset >= 0);
 685     assert(prog->substrs->data[1].min_offset >= 0);
 686     assert(prog->substrs->data[1].max_offset >= 0);
 687     assert(prog->substrs->data[2].min_offset >= 0);
 688     assert(prog->substrs->data[2].max_offset >= 0);
 689
 690     /* for now, assume that if both present, that the floating substring
 691      * doesn't start before the anchored substring.
 692      * If you break this assumption (e.g. doing better optimisations
 693      * with lookahead/behind), then you'll need to audit the code in this
 694      * function carefully first
 695      */
 696     assert(
 697             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 698               && (prog->float_utf8    || prog->float_substr))
 699            || (prog->float_min_offset >= prog->anchored_offset));
 700
 701     /* byte rather than char calculation for efficiency. It fails
 702      * to quickly reject some cases that can't match, but will reject
 703      * them later after doing full char arithmetic */
 704     if (prog->minlen > strend - strpos) {
 705         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 706                               "  String too short...\n"));
 707         goto fail;
 708     }
 709
 710     RXp_MATCH_UTF8_set(prog, utf8_target);
 711     reginfo->is_utf8_target = cBOOL(utf8_target);
 712     reginfo->info_aux = NULL;
 713     reginfo->strbeg = strbeg;
 714     reginfo->strend = strend;
 715     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 716     reginfo->intuit = 1;
 717     /* not actually used within intuit, but zero for safety anyway */
 718     reginfo->poscache_maxiter = 0;
 719
 720     if (utf8_target) {
 721         if ((!prog->anchored_utf8 && prog->anchored_substr)
 722                 || (!prog->float_utf8 && prog->float_substr))
 723             to_utf8_substr(prog);
 724         check = prog->check_utf8;
 725     } else {
 726         if (!prog->check_substr && prog->check_utf8) {
 727             if (! to_byte_substr(prog)) {
 728                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 729             }
 730         }
 731         check = prog->check_substr;
 732     }
 733
 734     /* dump the various substring data */
 735     DEBUG_OPTIMISE_MORE_r({
 736         int i;
 737         for (i=0; i<=2; i++) {
 738             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 739                                   : prog->substrs->data[i].substr);
 740             if (!sv)
 741                 continue;
 742
 743             Perl_re_printf( aTHX_
 744                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 745                 " useful=%" IVdf " utf8=%d [%s]\n",
 746                 i,
 747                 (IV)prog->substrs->data[i].min_offset,
 748                 (IV)prog->substrs->data[i].max_offset,
 749                 (IV)prog->substrs->data[i].end_shift,
 750                 BmUSEFUL(sv),
 751                 utf8_target ? 1 : 0,
 752                 SvPEEK(sv));
 753         }
 754     });
 755
 756     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 757
 758         /* ml_anch: check after \n?
 759          *
 760          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 761          * with /.*.../, these flags will have been added by the
 762          * compiler:
 763          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 764          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 765          */
 766         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 767                    && !(prog->intflags & PREGf_IMPLICIT);
 768
 769         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 770             /* we are only allowed to match at BOS or \G */
 771
 772             /* trivially reject if there's a BOS anchor and we're not at BOS.
 773              *
 774              * Note that we don't try to do a similar quick reject for
 775              * \G, since generally the caller will have calculated strpos
 776              * based on pos() and gofs, so the string is already correctly
 777              * anchored by definition; and handling the exceptions would
 778              * be too fiddly (e.g. REXEC_IGNOREPOS).
 779              */
 780             if (   strpos != strbeg
 781                 && (prog->intflags & PREGf_ANCH_SBOL))
 782             {
 783                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 784                                 "  Not at start...\n"));
 785                 goto fail;
 786             }
 787
 788             /* in the presence of an anchor, the anchored (relative to the
 789              * start of the regex) substr must also be anchored relative
 790              * to strpos. So quickly reject if substr isn't found there.
 791              * This works for \G too, because the caller will already have
 792              * subtracted gofs from pos, and gofs is the offset from the
 793              * \G to the start of the regex. For example, in /.abc\Gdef/,
 794              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 795              * caller will have set strpos=pos()-4; we look for the substr
 796              * at position pos()-4+1, which lines up with the "a" */
 797
 798             if (prog->check_offset_min == prog->check_offset_max) {
 799                 /* Substring at constant offset from beg-of-str... */
 800                 SSize_t slen = SvCUR(check);
 801                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 802
 803                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 804                     "  Looking for check substr at fixed offset %" IVdf "...\n",
 805                     (IV)prog->check_offset_min));
 806
 807                 if (SvTAIL(check)) {
 808                     /* In this case, the regex is anchored at the end too.
 809                      * Unless it's a multiline match, the lengths must match
 810                      * exactly, give or take a \n.  NB: slen >= 1 since
 811                      * the last char of check is \n */
 812                     if (!multiline
 813                         && (   strend - s > slen
 814                             || strend - s < slen - 1
 815                             || (strend - s == slen && strend[-1] != '\n')))
 816                     {
 817                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 818                                             "  String too long...\n"));
 819                         goto fail_finish;
 820                     }
 821                     /* Now should match s[0..slen-2] */
 822                     slen--;
 823                 }
 824                 if (slen && (strend - s < slen
 825                     || *SvPVX_const(check) != *s
 826                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
 827                 {
 828                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 829                                     "  String not equal...\n"));
 830                     goto fail_finish;
 831                 }
 832
 833                 check_at = s;
 834                 goto success_at_start;
 835             }
 836         }
 837     }
 838
 839     end_shift = prog->check_end_shift;
 840
 841 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 842     if (end_shift < 0)
 843         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
 844                    (IV)end_shift, RX_PRECOMP(rx));
 845 #endif
 846
 847   restart:
 848
 849     /* This is the (re)entry point of the main loop in this function.
 850      * The goal of this loop is to:
 851      * 1) find the "check" substring in the region rx_origin..strend
 852      *    (adjusted by start_shift / end_shift). If not found, reject
 853      *    immediately.
 854      * 2) If it exists, look for the "other" substr too if defined; for
 855      *    example, if the check substr maps to the anchored substr, then
 856      *    check the floating substr, and vice-versa. If not found, go
 857      *    back to (1) with rx_origin suitably incremented.
 858      * 3) If we find an rx_origin position that doesn't contradict
 859      *    either of the substrings, then check the possible additional
 860      *    constraints on rx_origin of /^.../m or a known start class.
 861      *    If these fail, then depending on which constraints fail, jump
 862      *    back to here, or to various other re-entry points further along
 863      *    that skip some of the first steps.
 864      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 865      *    substring. If the start position was determined to be at the
 866      *    beginning of the string  - so, not rejected, but not optimised,
 867      *    since we have to run regmatch from position 0 - decrement the
 868      *    BmUSEFUL() count. Otherwise increment it.
 869      */
 870
 871
 872     /* first, look for the 'check' substring */
 873
 874     {
 875         U8* start_point;
 876         U8* end_point;
 877
 878         DEBUG_OPTIMISE_MORE_r({
 879             Perl_re_printf( aTHX_
 880                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
 881                 " Start shift: %" IVdf " End shift %" IVdf
 882                 " Real end Shift: %" IVdf "\n",
 883                 (IV)(rx_origin - strbeg),
 884                 (IV)prog->check_offset_min,
 885                 (IV)start_shift,
 886                 (IV)end_shift,
 887                 (IV)prog->check_end_shift);
 888         });
 889
 890         end_point = HOPBACK3(strend, end_shift, rx_origin);
 891         if (!end_point)
 892             goto fail_finish;
 893         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 894         if (!start_point)
 895             goto fail_finish;
 896
 897
 898         /* If the regex is absolutely anchored to either the start of the
 899          * string (SBOL) or to pos() (ANCH_GPOS), then
 900          * check_offset_max represents an upper bound on the string where
 901          * the substr could start. For the ANCH_GPOS case, we assume that
 902          * the caller of intuit will have already set strpos to
 903          * pos()-gofs, so in this case strpos + offset_max will still be
 904          * an upper bound on the substr.
 905          */
 906         if (!ml_anch
 907             && prog->intflags & PREGf_ANCH
 908             && prog->check_offset_max != SSize_t_MAX)
 909         {
 910             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
 911             const char * const anchor =
 912                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 913             SSize_t targ_len = (char*)end_point - anchor;
 914
 915             if (check_len > targ_len) {
 916                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 917                               "Anchored string too short...\n"));
 918                 goto fail_finish;
 919             }
 920
 921             /* do a bytes rather than chars comparison. It's conservative;
 922              * so it skips doing the HOP if the result can't possibly end
 923              * up earlier than the old value of end_point.
 924              */
 925             assert(anchor + check_len <= (char *)end_point);
 926             if (prog->check_offset_max + check_len < targ_len) {
 927                 end_point = HOP3lim((U8*)anchor,
 928                                 prog->check_offset_max,
 929                                 end_point - check_len
 930                             )
 931                             + check_len;
 932             }
 933         }
 934
 935         check_at = fbm_instr( start_point, end_point,
 936                       check, multiline ? FBMrf_MULTILINE : 0);
 937
 938         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 939             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
 940             (IV)((char*)start_point - strbeg),
 941             (IV)((char*)end_point   - strbeg),
 942             (IV)(check_at ? check_at - strbeg : -1)
 943         ));
 944
 945         /* Update the count-of-usability, remove useless subpatterns,
 946             unshift s.  */
 947
 948         DEBUG_EXECUTE_r({
 949             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 950                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 951             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
 952                               (check_at ? "Found" : "Did not find"),
 953                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 954                     ? "anchored" : "floating"),
 955                 quoted,
 956                 RE_SV_TAIL(check),
 957                 (check_at ? " at offset " : "...\n") );
 958         });
 959
 960         if (!check_at)
 961             goto fail_finish;
 962         /* set rx_origin to the minimum position where the regex could start
 963          * matching, given the constraint of the just-matched check substring.
 964          * But don't set it lower than previously.
 965          */
 966
 967         if (check_at - rx_origin > prog->check_offset_max)
 968             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 969         /* Finish the diagnostic message */
 970         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 971             "%ld (rx_origin now %" IVdf ")...\n",
 972             (long)(check_at - strbeg),
 973             (IV)(rx_origin - strbeg)
 974         ));
 975     }
 976
 977
 978     /* now look for the 'other' substring if defined */
 979
 980     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 981                     : prog->substrs->data[other_ix].substr)
 982     {
 983         /* Take into account the "other" substring. */
 984         char *last, *last1;
 985         char *s;
 986         SV* must;
 987         struct reg_substr_datum *other;
 988
 989       do_other_substr:
 990         other = &prog->substrs->data[other_ix];
 991
 992         /* if "other" is anchored:
 993          * we've previously found a floating substr starting at check_at.
 994          * This means that the regex origin must lie somewhere
 995          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 996          * and max:                 HOP3(check_at, -check_offset_min)
 997          * (except that min will be >= strpos)
 998          * So the fixed  substr must lie somewhere between
 999          *  HOP3(min, anchored_offset)
1000          *  HOP3(max, anchored_offset) + SvCUR(substr)
1001          */
1002
1003         /* if "other" is floating
1004          * Calculate last1, the absolute latest point where the
1005          * floating substr could start in the string, ignoring any
1006          * constraints from the earlier fixed match. It is calculated
1007          * as follows:
1008          *
1009          * strend - prog->minlen (in chars) is the absolute latest
1010          * position within the string where the origin of the regex
1011          * could appear. The latest start point for the floating
1012          * substr is float_min_offset(*) on from the start of the
1013          * regex.  last1 simply combines thee two offsets.
1014          *
1015          * (*) You might think the latest start point should be
1016          * float_max_offset from the regex origin, and technically
1017          * you'd be correct. However, consider
1018          *    /a\d{2,4}bcd\w/
1019          * Here, float min, max are 3,5 and minlen is 7.
1020          * This can match either
1021          *    /a\d\dbcd\w/
1022          *    /a\d\d\dbcd\w/
1023          *    /a\d\d\d\dbcd\w/
1024          * In the first case, the regex matches minlen chars; in the
1025          * second, minlen+1, in the third, minlen+2.
1026          * In the first case, the floating offset is 3 (which equals
1027          * float_min), in the second, 4, and in the third, 5 (which
1028          * equals float_max). In all cases, the floating string bcd
1029          * can never start more than 4 chars from the end of the
1030          * string, which equals minlen - float_min. As the substring
1031          * starts to match more than float_min from the start of the
1032          * regex, it makes the regex match more than minlen chars,
1033          * and the two cancel each other out. So we can always use
1034          * float_min - minlen, rather than float_max - minlen for the
1035          * latest position in the string.
1036          *
1037          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1038          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1039          */
1040
1041         assert(prog->minlen >= other->min_offset);
1042         last1 = HOP3c(strend,
1043                         other->min_offset - prog->minlen, strbeg);
1044
1045         if (other_ix) {/* i.e. if (other-is-float) */
1046             /* last is the latest point where the floating substr could
1047              * start, *given* any constraints from the earlier fixed
1048              * match. This constraint is that the floating string starts
1049              * <= float_max_offset chars from the regex origin (rx_origin).
1050              * If this value is less than last1, use it instead.
1051              */
1052             assert(rx_origin <= last1);
1053             last =
1054                 /* this condition handles the offset==infinity case, and
1055                  * is a short-cut otherwise. Although it's comparing a
1056                  * byte offset to a char length, it does so in a safe way,
1057                  * since 1 char always occupies 1 or more bytes,
1058                  * so if a string range is  (last1 - rx_origin) bytes,
1059                  * it will be less than or equal to  (last1 - rx_origin)
1060                  * chars; meaning it errs towards doing the accurate HOP3
1061                  * rather than just using last1 as a short-cut */
1062                 (last1 - rx_origin) < other->max_offset
1063                     ? last1
1064                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1065         }
1066         else {
1067             assert(strpos + start_shift <= check_at);
1068             last = HOP4c(check_at, other->min_offset - start_shift,
1069                         strbeg, strend);
1070         }
1071
1072         s = HOP3c(rx_origin, other->min_offset, strend);
1073         if (s < other_last)     /* These positions already checked */
1074             s = other_last;
1075
1076         must = utf8_target ? other->utf8_substr : other->substr;
1077         assert(SvPOK(must));
1078         {
1079             char *from = s;
1080             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1081
1082             if (to > strend)
1083                 to = strend;
1084             if (from > to) {
1085                 s = NULL;
1086                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1087                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1088                     (IV)(from - strbeg),
1089                     (IV)(to   - strbeg)
1090                 ));
1091             }
1092             else {
1093                 s = fbm_instr(
1094                     (unsigned char*)from,
1095                     (unsigned char*)to,
1096                     must,
1097                     multiline ? FBMrf_MULTILINE : 0
1098                 );
1099                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1100                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1101                     (IV)(from - strbeg),
1102                     (IV)(to   - strbeg),
1103                     (IV)(s ? s - strbeg : -1)
1104                 ));
1105             }
1106         }
1107
1108         DEBUG_EXECUTE_r({
1109             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1110                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1111             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1112                 s ? "Found" : "Contradicts",
1113                 other_ix ? "floating" : "anchored",
1114                 quoted, RE_SV_TAIL(must));
1115         });
1116
1117
1118         if (!s) {
1119             /* last1 is latest possible substr location. If we didn't
1120              * find it before there, we never will */
1121             if (last >= last1) {
1122                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1123                                         "; giving up...\n"));
1124                 goto fail_finish;
1125             }
1126
1127             /* try to find the check substr again at a later
1128              * position. Maybe next time we'll find the "other" substr
1129              * in range too */
1130             other_last = HOP3c(last, 1, strend) /* highest failure */;
1131             rx_origin =
1132                 other_ix /* i.e. if other-is-float */
1133                     ? HOP3c(rx_origin, 1, strend)
1134                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1135             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1136                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1137                 (other_ix ? "floating" : "anchored"),
1138                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1139                 (IV)(rx_origin - strbeg)
1140             ));
1141             goto restart;
1142         }
1143         else {
1144             if (other_ix) { /* if (other-is-float) */
1145                 /* other_last is set to s, not s+1, since its possible for
1146                  * a floating substr to fail first time, then succeed
1147                  * second time at the same floating position; e.g.:
1148                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1149                  * The first time round, anchored and float match at
1150                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1151                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1152                  */
1153                 other_last = s;
1154             }
1155             else {
1156                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1157                 other_last = HOP3c(s, 1, strend);
1158             }
1159             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1160                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1161                   (long)(s - strbeg),
1162                 (IV)(rx_origin - strbeg)
1163               ));
1164
1165         }
1166     }
1167     else {
1168         DEBUG_OPTIMISE_MORE_r(
1169             Perl_re_printf( aTHX_
1170                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1171                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1172                 " strend:%" IVdf "\n",
1173                 (IV)prog->check_offset_min,
1174                 (IV)prog->check_offset_max,
1175                 (IV)(check_at-strbeg),
1176                 (IV)(rx_origin-strbeg),
1177                 (IV)(rx_origin-check_at),
1178                 (IV)(strend-strbeg)
1179             )
1180         );
1181     }
1182
1183   postprocess_substr_matches:
1184
1185     /* handle the extra constraint of /^.../m if present */
1186
1187     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1188         char *s;
1189
1190         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1191                         "  looking for /^/m anchor"));
1192
1193         /* we have failed the constraint of a \n before rx_origin.
1194          * Find the next \n, if any, even if it's beyond the current
1195          * anchored and/or floating substrings. Whether we should be
1196          * scanning ahead for the next \n or the next substr is debatable.
1197          * On the one hand you'd expect rare substrings to appear less
1198          * often than \n's. On the other hand, searching for \n means
1199          * we're effectively flipping between check_substr and "\n" on each
1200          * iteration as the current "rarest" string candidate, which
1201          * means for example that we'll quickly reject the whole string if
1202          * hasn't got a \n, rather than trying every substr position
1203          * first
1204          */
1205
1206         s = HOP3c(strend, - prog->minlen, strpos);
1207         if (s <= rx_origin ||
1208             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1209         {
1210             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1211                             "  Did not find /%s^%s/m...\n",
1212                             PL_colors[0], PL_colors[1]));
1213             goto fail_finish;
1214         }
1215
1216         /* earliest possible origin is 1 char after the \n.
1217          * (since *rx_origin == '\n', it's safe to ++ here rather than
1218          * HOP(rx_origin, 1)) */
1219         rx_origin++;
1220
1221         if (prog->substrs->check_ix == 0  /* check is anchored */
1222             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1223         {
1224             /* Position contradicts check-string; either because
1225              * check was anchored (and thus has no wiggle room),
1226              * or check was float and rx_origin is above the float range */
1227             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1228                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1229                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1230             goto restart;
1231         }
1232
1233         /* if we get here, the check substr must have been float,
1234          * is in range, and we may or may not have had an anchored
1235          * "other" substr which still contradicts */
1236         assert(prog->substrs->check_ix); /* check is float */
1237
1238         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1239             /* whoops, the anchored "other" substr exists, so we still
1240              * contradict. On the other hand, the float "check" substr
1241              * didn't contradict, so just retry the anchored "other"
1242              * substr */
1243             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1244                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1245                 PL_colors[0], PL_colors[1],
1246                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1247                 (IV)(rx_origin - strbeg)
1248             ));
1249             goto do_other_substr;
1250         }
1251
1252         /* success: we don't contradict the found floating substring
1253          * (and there's no anchored substr). */
1254         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1255             "  Found /%s^%s/m with rx_origin %ld...\n",
1256             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1257     }
1258     else {
1259         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1260             "  (multiline anchor test skipped)\n"));
1261     }
1262
1263   success_at_start:
1264
1265
1266     /* if we have a starting character class, then test that extra constraint.
1267      * (trie stclasses are too expensive to use here, we are better off to
1268      * leave it to regmatch itself) */
1269
1270     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1271         const U8* const str = (U8*)STRING(progi->regstclass);
1272
1273         /* XXX this value could be pre-computed */
1274         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1275                     ?  (reginfo->is_utf8_pat
1276                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1277                         : STR_LEN(progi->regstclass))
1278                     : 1);
1279         char * endpos;
1280         char *s;
1281         /* latest pos that a matching float substr constrains rx start to */
1282         char *rx_max_float = NULL;
1283
1284         /* if the current rx_origin is anchored, either by satisfying an
1285          * anchored substring constraint, or a /^.../m constraint, then we
1286          * can reject the current origin if the start class isn't found
1287          * at the current position. If we have a float-only match, then
1288          * rx_origin is constrained to a range; so look for the start class
1289          * in that range. if neither, then look for the start class in the
1290          * whole rest of the string */
1291
1292         /* XXX DAPM it's not clear what the minlen test is for, and why
1293          * it's not used in the floating case. Nothing in the test suite
1294          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1295          * Here are some old comments, which may or may not be correct:
1296          *
1297          *   minlen == 0 is possible if regstclass is \b or \B,
1298          *   and the fixed substr is ''$.
1299          *   Since minlen is already taken into account, rx_origin+1 is
1300          *   before strend; accidentally, minlen >= 1 guaranties no false
1301          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1302          *   0) below assumes that regstclass does not come from lookahead...
1303          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1304          *   This leaves EXACTF-ish only, which are dealt with in
1305          *   find_byclass().
1306          */
1307
1308         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1309             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1310         else if (prog->float_substr || prog->float_utf8) {
1311             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1312             endpos = HOP3clim(rx_max_float, cl_l, strend);
1313         }
1314         else
1315             endpos= strend;
1316
1317         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1318             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1319             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1320               (IV)start_shift, (IV)(check_at - strbeg),
1321               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1322
1323         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1324                             reginfo);
1325         if (!s) {
1326             if (endpos == strend) {
1327                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1328                                 "  Could not match STCLASS...\n") );
1329                 goto fail;
1330             }
1331             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1332                                "  This position contradicts STCLASS...\n") );
1333             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1334                         && !(prog->intflags & PREGf_IMPLICIT))
1335                 goto fail;
1336
1337             /* Contradict one of substrings */
1338             if (prog->anchored_substr || prog->anchored_utf8) {
1339                 if (prog->substrs->check_ix == 1) { /* check is float */
1340                     /* Have both, check_string is floating */
1341                     assert(rx_origin + start_shift <= check_at);
1342                     if (rx_origin + start_shift != check_at) {
1343                         /* not at latest position float substr could match:
1344                          * Recheck anchored substring, but not floating.
1345                          * The condition above is in bytes rather than
1346                          * chars for efficiency. It's conservative, in
1347                          * that it errs on the side of doing 'goto
1348                          * do_other_substr'. In this case, at worst,
1349                          * an extra anchored search may get done, but in
1350                          * practice the extra fbm_instr() is likely to
1351                          * get skipped anyway. */
1352                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1353                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1354                             (long)(other_last - strbeg),
1355                             (IV)(rx_origin - strbeg)
1356                         ));
1357                         goto do_other_substr;
1358                     }
1359                 }
1360             }
1361             else {
1362                 /* float-only */
1363
1364                 if (ml_anch) {
1365                     /* In the presence of ml_anch, we might be able to
1366                      * find another \n without breaking the current float
1367                      * constraint. */
1368
1369                     /* strictly speaking this should be HOP3c(..., 1, ...),
1370                      * but since we goto a block of code that's going to
1371                      * search for the next \n if any, its safe here */
1372                     rx_origin++;
1373                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1374                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1375                               PL_colors[0], PL_colors[1],
1376                               (long)(rx_origin - strbeg)) );
1377                     goto postprocess_substr_matches;
1378                 }
1379
1380                 /* strictly speaking this can never be true; but might
1381                  * be if we ever allow intuit without substrings */
1382                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1383                     goto fail;
1384
1385                 rx_origin = rx_max_float;
1386             }
1387
1388             /* at this point, any matching substrings have been
1389              * contradicted. Start again... */
1390
1391             rx_origin = HOP3c(rx_origin, 1, strend);
1392
1393             /* uses bytes rather than char calculations for efficiency.
1394              * It's conservative: it errs on the side of doing 'goto restart',
1395              * where there is code that does a proper char-based test */
1396             if (rx_origin + start_shift + end_shift > strend) {
1397                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1398                                        "  Could not match STCLASS...\n") );
1399                 goto fail;
1400             }
1401             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1402                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1403                 (prog->substrs->check_ix ? "floating" : "anchored"),
1404                 (long)(rx_origin + start_shift - strbeg),
1405                 (IV)(rx_origin - strbeg)
1406             ));
1407             goto restart;
1408         }
1409
1410         /* Success !!! */
1411
1412         if (rx_origin != s) {
1413             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1414                         "  By STCLASS: moving %ld --> %ld\n",
1415                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1416                    );
1417         }
1418         else {
1419             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1420                                   "  Does not contradict STCLASS...\n");
1421                    );
1422         }
1423     }
1424
1425     /* Decide whether using the substrings helped */
1426
1427     if (rx_origin != strpos) {
1428         /* Fixed substring is found far enough so that the match
1429            cannot start at strpos. */
1430
1431         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1432         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1433     }
1434     else {
1435         /* The found rx_origin position does not prohibit matching at
1436          * strpos, so calling intuit didn't gain us anything. Decrement
1437          * the BmUSEFUL() count on the check substring, and if we reach
1438          * zero, free it.  */
1439         if (!(prog->intflags & PREGf_NAUGHTY)
1440             && (utf8_target ? (
1441                 prog->check_utf8                /* Could be deleted already */
1442                 && --BmUSEFUL(prog->check_utf8) < 0
1443                 && (prog->check_utf8 == prog->float_utf8)
1444             ) : (
1445                 prog->check_substr              /* Could be deleted already */
1446                 && --BmUSEFUL(prog->check_substr) < 0
1447                 && (prog->check_substr == prog->float_substr)
1448             )))
1449         {
1450             /* If flags & SOMETHING - do not do it many times on the same match */
1451             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1452             /* XXX Does the destruction order has to change with utf8_target? */
1453             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1454             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1455             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1456             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1457             check = NULL;                       /* abort */
1458             /* XXXX This is a remnant of the old implementation.  It
1459                     looks wasteful, since now INTUIT can use many
1460                     other heuristics. */
1461             prog->extflags &= ~RXf_USE_INTUIT;
1462         }
1463     }
1464
1465     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1466             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1467              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1468
1469     return rx_origin;
1470
1471   fail_finish:                          /* Substring not found */
1472     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1473         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1474   fail:
1475     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1476                           PL_colors[4], PL_colors[5]));
1477     return NULL;
1478 }
1479
1480
1481 #define DECL_TRIE_TYPE(scan) \
1482     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1483                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1484                  trie_utf8l, trie_flu8 }                                            \
1485                     trie_type = ((scan->flags == EXACT)                             \
1486                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1487                                  : (scan->flags == EXACTL)                          \
1488                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1489                                     : (scan->flags == EXACTFA)                      \
1490                                       ? (utf8_target                                \
1491                                          ? trie_utf8_exactfa_fold                   \
1492                                          : trie_latin_utf8_exactfa_fold)            \
1493                                       : (scan->flags == EXACTFLU8                   \
1494                                          ? trie_flu8                                \
1495                                          : (utf8_target                             \
1496                                            ? trie_utf8_fold                         \
1497                                            :   trie_latin_utf8_fold)))
1498
1499 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1500 STMT_START {                                                                        \
1501     STRLEN skiplen;                                                                 \
1502     U8 flags = FOLD_FLAGS_FULL;                                                     \
1503     switch (trie_type) {                                                            \
1504     case trie_flu8:                                                                 \
1505         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1506         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1507             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1508         }                                                                           \
1509         goto do_trie_utf8_fold;                                                     \
1510     case trie_utf8_exactfa_fold:                                                    \
1511         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1512         /* FALLTHROUGH */                                                           \
1513     case trie_utf8_fold:                                                            \
1514       do_trie_utf8_fold:                                                            \
1515         if ( foldlen>0 ) {                                                          \
1516             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1517             foldlen -= len;                                                         \
1518             uscan += len;                                                           \
1519             len=0;                                                                  \
1520         } else {                                                                    \
1521             len = UTF8SKIP(uc);                                                     \
1522             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen,  \
1523                                                                             flags); \
1524             skiplen = UVCHR_SKIP( uvc );                                            \
1525             foldlen -= skiplen;                                                     \
1526             uscan = foldbuf + skiplen;                                              \
1527         }                                                                           \
1528         break;                                                                      \
1529     case trie_latin_utf8_exactfa_fold:                                              \
1530         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1531         /* FALLTHROUGH */                                                           \
1532     case trie_latin_utf8_fold:                                                      \
1533         if ( foldlen>0 ) {                                                          \
1534             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1535             foldlen -= len;                                                         \
1536             uscan += len;                                                           \
1537             len=0;                                                                  \
1538         } else {                                                                    \
1539             len = 1;                                                                \
1540             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1541             skiplen = UVCHR_SKIP( uvc );                                            \
1542             foldlen -= skiplen;                                                     \
1543             uscan = foldbuf + skiplen;                                              \
1544         }                                                                           \
1545         break;                                                                      \
1546     case trie_utf8l:                                                                \
1547         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1548         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1549             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1550         }                                                                           \
1551         /* FALLTHROUGH */                                                           \
1552     case trie_utf8:                                                                 \
1553         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1554         break;                                                                      \
1555     case trie_plain:                                                                \
1556         uvc = (UV)*uc;                                                              \
1557         len = 1;                                                                    \
1558     }                                                                               \
1559     if (uvc < 256) {                                                                \
1560         charid = trie->charmap[ uvc ];                                              \
1561     }                                                                               \
1562     else {                                                                          \
1563         charid = 0;                                                                 \
1564         if (widecharmap) {                                                          \
1565             SV** const svpp = hv_fetch(widecharmap,                                 \
1566                         (char*)&uvc, sizeof(UV), 0);                                \
1567             if (svpp)                                                               \
1568                 charid = (U16)SvIV(*svpp);                                          \
1569         }                                                                           \
1570     }                                                                               \
1571 } STMT_END
1572
1573 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1574     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1575                 startpos, doutf8, depth)
1576
1577 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1578 STMT_START {                                              \
1579     while (s <= e) {                                      \
1580         if ( (COND)                                       \
1581              && (ln == 1 || folder(s, pat_string, ln))    \
1582              && (reginfo->intuit || regtry(reginfo, &s)) )\
1583             goto got_it;                                  \
1584         s++;                                              \
1585     }                                                     \
1586 } STMT_END
1587
1588 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1589 STMT_START {                                          \
1590     while (s < strend) {                              \
1591         CODE                                          \
1592         s += UTF8SKIP(s);                             \
1593     }                                                 \
1594 } STMT_END
1595
1596 #define REXEC_FBC_SCAN(CODE)                          \
1597 STMT_START {                                          \
1598     while (s < strend) {                              \
1599         CODE                                          \
1600         s++;                                          \
1601     }                                                 \
1602 } STMT_END
1603
1604 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1605 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1606     if (COND) {                                                \
1607         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1608             goto got_it;                                       \
1609         else                                                   \
1610             tmp = doevery;                                     \
1611     }                                                          \
1612     else                                                       \
1613         tmp = 1;                                               \
1614 )
1615
1616 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1617 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1618     if (COND) {                                                \
1619         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1620             goto got_it;                                       \
1621         else                                                   \
1622             tmp = doevery;                                     \
1623     }                                                          \
1624     else                                                       \
1625         tmp = 1;                                               \
1626 )
1627
1628 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1629     if (utf8_target) {                                         \
1630         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1631     }                                                          \
1632     else {                                                     \
1633         REXEC_FBC_CLASS_SCAN(COND);                            \
1634     }
1635
1636 /* The three macros below are slightly different versions of the same logic.
1637  *
1638  * The first is for /a and /aa when the target string is UTF-8.  This can only
1639  * match ascii, but it must advance based on UTF-8.   The other two handle the
1640  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1641  * for the boundary (or non-boundary) between a word and non-word character.
1642  * The utf8 and non-utf8 cases have the same logic, but the details must be
1643  * different.  Find the "wordness" of the character just prior to this one, and
1644  * compare it with the wordness of this one.  If they differ, we have a
1645  * boundary.  At the beginning of the string, pretend that the previous
1646  * character was a new-line.
1647  *
1648  * All these macros uncleanly have side-effects with each other and outside
1649  * variables.  So far it's been too much trouble to clean-up
1650  *
1651  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1652  *               a word character or not.
1653  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1654  *               word/non-word
1655  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1656  *
1657  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1658  * are looking for a boundary or for a non-boundary.  If we are looking for a
1659  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1660  * see if this tentative match actually works, and if so, to quit the loop
1661  * here.  And vice-versa if we are looking for a non-boundary.
1662  *
1663  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1664  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1665  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1666  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1667  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1668  * complement.  But in that branch we complement tmp, meaning that at the
1669  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1670  * which means at the top of the loop in the next iteration, it is
1671  * TEST_NON_UTF8(s-1) */
1672 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1673     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1674     tmp = TEST_NON_UTF8(tmp);                                                  \
1675     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1676         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1677             tmp = !tmp;                                                        \
1678             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1679         }                                                                      \
1680         else {                                                                 \
1681             IF_FAIL;                                                           \
1682         }                                                                      \
1683     );                                                                         \
1684
1685 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1686  * TEST_UTF8 is a macro that for the same input code points returns identically
1687  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1688 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1689     if (s == reginfo->strbeg) {                                                \
1690         tmp = '\n';                                                            \
1691     }                                                                          \
1692     else { /* Back-up to the start of the previous character */                \
1693         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1694         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1695                                                        0, UTF8_ALLOW_DEFAULT); \
1696     }                                                                          \
1697     tmp = TEST_UV(tmp);                                                        \
1698     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1699     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1700         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1701             tmp = !tmp;                                                        \
1702             IF_SUCCESS;                                                        \
1703         }                                                                      \
1704         else {                                                                 \
1705             IF_FAIL;                                                           \
1706         }                                                                      \
1707     );
1708
1709 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1710  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1711  * macros below */
1712 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1713     if (utf8_target) {                                                         \
1714         UTF8_CODE                                                              \
1715     }                                                                          \
1716     else {  /* Not utf8 */                                                     \
1717         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1718         tmp = TEST_NON_UTF8(tmp);                                              \
1719         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1720             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1721                 IF_SUCCESS;                                                    \
1722                 tmp = !tmp;                                                    \
1723             }                                                                  \
1724             else {                                                             \
1725                 IF_FAIL;                                                       \
1726             }                                                                  \
1727         );                                                                     \
1728     }                                                                          \
1729     /* Here, things have been set up by the previous code so that tmp is the   \
1730      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1731      * utf8ness of the target).  We also have to check if this matches against \
1732      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1733      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1734      * string */                                                               \
1735     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1736         IF_SUCCESS;                                                            \
1737     }                                                                          \
1738     else {                                                                     \
1739         IF_FAIL;                                                               \
1740     }
1741
1742 /* This is the macro to use when we want to see if something that looks like it
1743  * could match, actually does, and if so exits the loop */
1744 #define REXEC_FBC_TRYIT                            \
1745     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1746         goto got_it
1747
1748 /* The only difference between the BOUND and NBOUND cases is that
1749  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1750  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1751  * with the other one being empty (PLACEHOLDER is defined as empty).
1752  *
1753  * The TEST_FOO parameters are for operating on different forms of input, but
1754  * all should be ones that return identically for the same underlying code
1755  * points */
1756 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1757     FBC_BOUND_COMMON(                                                          \
1758           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1759           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1760
1761 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1762     FBC_BOUND_COMMON(                                                          \
1763             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1764             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1765
1766 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1767     FBC_BOUND_COMMON(                                                          \
1768           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1769           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1770
1771 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1772     FBC_BOUND_COMMON(                                                          \
1773             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1774             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1775
1776 #ifdef DEBUGGING
1777 static IV
1778 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1779   IV cp_out = Perl__invlist_search(invlist, cp_in);
1780   assert(cp_out >= 0);
1781   return cp_out;
1782 }
1783 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1784         invmap[S_get_break_val_cp_checked(invlist, cp)]
1785 #else
1786 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1787         invmap[_invlist_search(invlist, cp)]
1788 #endif
1789
1790 /* Takes a pointer to an inversion list, a pointer to its corresponding
1791  * inversion map, and a code point, and returns the code point's value
1792  * according to the two arrays.  It assumes that all code points have a value.
1793  * This is used as the base macro for macros for particular properties */
1794 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1795         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1796
1797 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1798  * of a code point, returning the value for the first code point in the string.
1799  * And it takes the particular macro name that finds the desired value given a
1800  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1801 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1802              (__ASSERT_(pos < strend)                                          \
1803                  /* Note assumes is valid UTF-8 */                             \
1804              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1805
1806 /* Returns the GCB value for the input code point */
1807 #define getGCB_VAL_CP(cp)                                                      \
1808           _generic_GET_BREAK_VAL_CP(                                           \
1809                                     PL_GCB_invlist,                            \
1810                                     _Perl_GCB_invmap,                          \
1811                                     (cp))
1812
1813 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1814  * bounded by pos and strend */
1815 #define getGCB_VAL_UTF8(pos, strend)                                           \
1816     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1817
1818 /* Returns the LB value for the input code point */
1819 #define getLB_VAL_CP(cp)                                                       \
1820           _generic_GET_BREAK_VAL_CP(                                           \
1821                                     PL_LB_invlist,                             \
1822                                     _Perl_LB_invmap,                           \
1823                                     (cp))
1824
1825 /* Returns the LB value for the first code point in the UTF-8 encoded string
1826  * bounded by pos and strend */
1827 #define getLB_VAL_UTF8(pos, strend)                                            \
1828     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1829
1830
1831 /* Returns the SB value for the input code point */
1832 #define getSB_VAL_CP(cp)                                                       \
1833           _generic_GET_BREAK_VAL_CP(                                           \
1834                                     PL_SB_invlist,                             \
1835                                     _Perl_SB_invmap,                     \
1836                                     (cp))
1837
1838 /* Returns the SB value for the first code point in the UTF-8 encoded string
1839  * bounded by pos and strend */
1840 #define getSB_VAL_UTF8(pos, strend)                                            \
1841     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1842
1843 /* Returns the WB value for the input code point */
1844 #define getWB_VAL_CP(cp)                                                       \
1845           _generic_GET_BREAK_VAL_CP(                                           \
1846                                     PL_WB_invlist,                             \
1847                                     _Perl_WB_invmap,                         \
1848                                     (cp))
1849
1850 /* Returns the WB value for the first code point in the UTF-8 encoded string
1851  * bounded by pos and strend */
1852 #define getWB_VAL_UTF8(pos, strend)                                            \
1853     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1854
1855 /* We know what class REx starts with.  Try to find this position... */
1856 /* if reginfo->intuit, its a dryrun */
1857 /* annoyingly all the vars in this routine have different names from their counterparts
1858    in regmatch. /grrr */
1859 STATIC char *
1860 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1861     const char *strend, regmatch_info *reginfo)
1862 {
1863     dVAR;
1864     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1865     char *pat_string;   /* The pattern's exactish string */
1866     char *pat_end;          /* ptr to end char of pat_string */
1867     re_fold_t folder;   /* Function for computing non-utf8 folds */
1868     const U8 *fold_array;   /* array for folding ords < 256 */
1869     STRLEN ln;
1870     STRLEN lnc;
1871     U8 c1;
1872     U8 c2;
1873     char *e;
1874     I32 tmp = 1;        /* Scratch variable? */
1875     const bool utf8_target = reginfo->is_utf8_target;
1876     UV utf8_fold_flags = 0;
1877     const bool is_utf8_pat = reginfo->is_utf8_pat;
1878     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1879                                    with a result inverts that result, as 0^1 =
1880                                    1 and 1^1 = 0 */
1881     _char_class_number classnum;
1882
1883     RXi_GET_DECL(prog,progi);
1884
1885     PERL_ARGS_ASSERT_FIND_BYCLASS;
1886
1887     /* We know what class it must start with. */
1888     switch (OP(c)) {
1889     case ANYOFL:
1890         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1891
1892         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
1893             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1894         }
1895
1896         /* FALLTHROUGH */
1897     case ANYOFD:
1898     case ANYOF:
1899         if (utf8_target) {
1900             REXEC_FBC_UTF8_CLASS_SCAN(
1901                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1902         }
1903         else if (ANYOF_FLAGS(c)) {
1904             REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
1905         }
1906         else {
1907             REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
1908         }
1909         break;
1910
1911     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1912         assert(! is_utf8_pat);
1913         /* FALLTHROUGH */
1914     case EXACTFA:
1915         if (is_utf8_pat || utf8_target) {
1916             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1917             goto do_exactf_utf8;
1918         }
1919         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1920         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1921         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1922
1923     case EXACTF:   /* This node only generated for non-utf8 patterns */
1924         assert(! is_utf8_pat);
1925         if (utf8_target) {
1926             utf8_fold_flags = 0;
1927             goto do_exactf_utf8;
1928         }
1929         fold_array = PL_fold;
1930         folder = foldEQ;
1931         goto do_exactf_non_utf8;
1932
1933     case EXACTFL:
1934         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1935         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1936             utf8_fold_flags = FOLDEQ_LOCALE;
1937             goto do_exactf_utf8;
1938         }
1939         fold_array = PL_fold_locale;
1940         folder = foldEQ_locale;
1941         goto do_exactf_non_utf8;
1942
1943     case EXACTFU_SS:
1944         if (is_utf8_pat) {
1945             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1946         }
1947         goto do_exactf_utf8;
1948
1949     case EXACTFLU8:
1950             if (! utf8_target) {    /* All code points in this node require
1951                                        UTF-8 to express.  */
1952                 break;
1953             }
1954             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1955                                              | FOLDEQ_S2_FOLDS_SANE;
1956             goto do_exactf_utf8;
1957
1958     case EXACTFU:
1959         if (is_utf8_pat || utf8_target) {
1960             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1961             goto do_exactf_utf8;
1962         }
1963
1964         /* Any 'ss' in the pattern should have been replaced by regcomp,
1965          * so we don't have to worry here about this single special case
1966          * in the Latin1 range */
1967         fold_array = PL_fold_latin1;
1968         folder = foldEQ_latin1;
1969
1970         /* FALLTHROUGH */
1971
1972       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1973                            are no glitches with fold-length differences
1974                            between the target string and pattern */
1975
1976         /* The idea in the non-utf8 EXACTF* cases is to first find the
1977          * first character of the EXACTF* node and then, if necessary,
1978          * case-insensitively compare the full text of the node.  c1 is the
1979          * first character.  c2 is its fold.  This logic will not work for
1980          * Unicode semantics and the german sharp ss, which hence should
1981          * not be compiled into a node that gets here. */
1982         pat_string = STRING(c);
1983         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1984
1985         /* We know that we have to match at least 'ln' bytes (which is the
1986          * same as characters, since not utf8).  If we have to match 3
1987          * characters, and there are only 2 availabe, we know without
1988          * trying that it will fail; so don't start a match past the
1989          * required minimum number from the far end */
1990         e = HOP3c(strend, -((SSize_t)ln), s);
1991         if (e < s)
1992             break;
1993
1994         c1 = *pat_string;
1995         c2 = fold_array[c1];
1996         if (c1 == c2) { /* If char and fold are the same */
1997             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1998         }
1999         else {
2000             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
2001         }
2002         break;
2003
2004       do_exactf_utf8:
2005       {
2006         unsigned expansion;
2007
2008         /* If one of the operands is in utf8, we can't use the simpler folding
2009          * above, due to the fact that many different characters can have the
2010          * same fold, or portion of a fold, or different- length fold */
2011         pat_string = STRING(c);
2012         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2013         pat_end = pat_string + ln;
2014         lnc = is_utf8_pat       /* length to match in characters */
2015                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2016                 : ln;
2017
2018         /* We have 'lnc' characters to match in the pattern, but because of
2019          * multi-character folding, each character in the target can match
2020          * up to 3 characters (Unicode guarantees it will never exceed
2021          * this) if it is utf8-encoded; and up to 2 if not (based on the
2022          * fact that the Latin 1 folds are already determined, and the
2023          * only multi-char fold in that range is the sharp-s folding to
2024          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2025          * string character.  Adjust lnc accordingly, rounding up, so that
2026          * if we need to match at least 4+1/3 chars, that really is 5. */
2027         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2028         lnc = (lnc + expansion - 1) / expansion;
2029
2030         /* As in the non-UTF8 case, if we have to match 3 characters, and
2031          * only 2 are left, it's guaranteed to fail, so don't start a
2032          * match that would require us to go beyond the end of the string
2033          */
2034         e = HOP3c(strend, -((SSize_t)lnc), s);
2035
2036         /* XXX Note that we could recalculate e to stop the loop earlier,
2037          * as the worst case expansion above will rarely be met, and as we
2038          * go along we would usually find that e moves further to the left.
2039          * This would happen only after we reached the point in the loop
2040          * where if there were no expansion we should fail.  Unclear if
2041          * worth the expense */
2042
2043         while (s <= e) {
2044             char *my_strend= (char *)strend;
2045             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2046                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2047                 && (reginfo->intuit || regtry(reginfo, &s)) )
2048             {
2049                 goto got_it;
2050             }
2051             s += (utf8_target) ? UTF8SKIP(s) : 1;
2052         }
2053         break;
2054     }
2055
2056     case BOUNDL:
2057         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2058         if (FLAGS(c) != TRADITIONAL_BOUND) {
2059             if (! IN_UTF8_CTYPE_LOCALE) {
2060                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2061                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2062             }
2063             goto do_boundu;
2064         }
2065
2066         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2067         break;
2068
2069     case NBOUNDL:
2070         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2071         if (FLAGS(c) != TRADITIONAL_BOUND) {
2072             if (! IN_UTF8_CTYPE_LOCALE) {
2073                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2074                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2075             }
2076             goto do_nboundu;
2077         }
2078
2079         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2080         break;
2081
2082     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2083                    meaning */
2084         assert(FLAGS(c) == TRADITIONAL_BOUND);
2085
2086         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2087         break;
2088
2089     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2090                    meaning */
2091         assert(FLAGS(c) == TRADITIONAL_BOUND);
2092
2093         FBC_BOUND_A(isWORDCHAR_A);
2094         break;
2095
2096     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2097                    meaning */
2098         assert(FLAGS(c) == TRADITIONAL_BOUND);
2099
2100         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2101         break;
2102
2103     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2104                    meaning */
2105         assert(FLAGS(c) == TRADITIONAL_BOUND);
2106
2107         FBC_NBOUND_A(isWORDCHAR_A);
2108         break;
2109
2110     case NBOUNDU:
2111         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2112             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2113             break;
2114         }
2115
2116       do_nboundu:
2117
2118         to_complement = 1;
2119         /* FALLTHROUGH */
2120
2121     case BOUNDU:
2122       do_boundu:
2123         switch((bound_type) FLAGS(c)) {
2124             case TRADITIONAL_BOUND:
2125                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2126                 break;
2127             case GCB_BOUND:
2128                 if (s == reginfo->strbeg) {
2129                     if (reginfo->intuit || regtry(reginfo, &s))
2130                     {
2131                         goto got_it;
2132                     }
2133
2134                     /* Didn't match.  Try at the next position (if there is one) */
2135                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2136                     if (UNLIKELY(s >= reginfo->strend)) {
2137                         break;
2138                     }
2139                 }
2140
2141                 if (utf8_target) {
2142                     GCB_enum before = getGCB_VAL_UTF8(
2143                                                reghop3((U8*)s, -1,
2144                                                        (U8*)(reginfo->strbeg)),
2145                                                (U8*) reginfo->strend);
2146                     while (s < strend) {
2147                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2148                                                         (U8*) reginfo->strend);
2149                         if (   (to_complement ^ isGCB(before,
2150                                                       after,
2151                                                       (U8*) reginfo->strbeg,
2152                                                       (U8*) s,
2153                                                       utf8_target))
2154                             && (reginfo->intuit || regtry(reginfo, &s)))
2155                         {
2156                             goto got_it;
2157                         }
2158                         before = after;
2159                         s += UTF8SKIP(s);
2160                     }
2161                 }
2162                 else {  /* Not utf8.  Everything is a GCB except between CR and
2163                            LF */
2164                     while (s < strend) {
2165                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2166                                               || UCHARAT(s) != '\n'))
2167                             && (reginfo->intuit || regtry(reginfo, &s)))
2168                         {
2169                             goto got_it;
2170                         }
2171                         s++;
2172                     }
2173                 }
2174
2175                 /* And, since this is a bound, it can match after the final
2176                  * character in the string */
2177                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2178                     goto got_it;
2179                 }
2180                 break;
2181
2182             case LB_BOUND:
2183                 if (s == reginfo->strbeg) {
2184                     if (reginfo->intuit || regtry(reginfo, &s)) {
2185                         goto got_it;
2186                     }
2187                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2188                     if (UNLIKELY(s >= reginfo->strend)) {
2189                         break;
2190                     }
2191                 }
2192
2193                 if (utf8_target) {
2194                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2195                                                                -1,
2196                                                                (U8*)(reginfo->strbeg)),
2197                                                        (U8*) reginfo->strend);
2198                     while (s < strend) {
2199                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2200                         if (to_complement ^ isLB(before,
2201                                                  after,
2202                                                  (U8*) reginfo->strbeg,
2203                                                  (U8*) s,
2204                                                  (U8*) reginfo->strend,
2205                                                  utf8_target)
2206                             && (reginfo->intuit || regtry(reginfo, &s)))
2207                         {
2208                             goto got_it;
2209                         }
2210                         before = after;
2211                         s += UTF8SKIP(s);
2212                     }
2213                 }
2214                 else {  /* Not utf8. */
2215                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2216                     while (s < strend) {
2217                         LB_enum after = getLB_VAL_CP((U8) *s);
2218                         if (to_complement ^ isLB(before,
2219                                                  after,
2220                                                  (U8*) reginfo->strbeg,
2221                                                  (U8*) s,
2222                                                  (U8*) reginfo->strend,
2223                                                  utf8_target)
2224                             && (reginfo->intuit || regtry(reginfo, &s)))
2225                         {
2226                             goto got_it;
2227                         }
2228                         before = after;
2229                         s++;
2230                     }
2231                 }
2232
2233                 if (reginfo->intuit || regtry(reginfo, &s)) {
2234                     goto got_it;
2235                 }
2236
2237                 break;
2238
2239             case SB_BOUND:
2240                 if (s == reginfo->strbeg) {
2241                     if (reginfo->intuit || regtry(reginfo, &s)) {
2242                         goto got_it;
2243                     }
2244                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2245                     if (UNLIKELY(s >= reginfo->strend)) {
2246                         break;
2247                     }
2248                 }
2249
2250                 if (utf8_target) {
2251                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2252                                                         -1,
2253                                                         (U8*)(reginfo->strbeg)),
2254                                                       (U8*) reginfo->strend);
2255                     while (s < strend) {
2256                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2257                                                          (U8*) reginfo->strend);
2258                         if ((to_complement ^ isSB(before,
2259                                                   after,
2260                                                   (U8*) reginfo->strbeg,
2261                                                   (U8*) s,
2262                                                   (U8*) reginfo->strend,
2263                                                   utf8_target))
2264                             && (reginfo->intuit || regtry(reginfo, &s)))
2265                         {
2266                             goto got_it;
2267                         }
2268                         before = after;
2269                         s += UTF8SKIP(s);
2270                     }
2271                 }
2272                 else {  /* Not utf8. */
2273                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2274                     while (s < strend) {
2275                         SB_enum after = getSB_VAL_CP((U8) *s);
2276                         if ((to_complement ^ isSB(before,
2277                                                   after,
2278                                                   (U8*) reginfo->strbeg,
2279                                                   (U8*) s,
2280                                                   (U8*) reginfo->strend,
2281                                                   utf8_target))
2282                             && (reginfo->intuit || regtry(reginfo, &s)))
2283                         {
2284                             goto got_it;
2285                         }
2286                         before = after;
2287                         s++;
2288                     }
2289                 }
2290
2291                 /* Here are at the final position in the target string.  The SB
2292                  * value is always true here, so matches, depending on other
2293                  * constraints */
2294                 if (reginfo->intuit || regtry(reginfo, &s)) {
2295                     goto got_it;
2296                 }
2297
2298                 break;
2299
2300             case WB_BOUND:
2301                 if (s == reginfo->strbeg) {
2302                     if (reginfo->intuit || regtry(reginfo, &s)) {
2303                         goto got_it;
2304                     }
2305                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2306                     if (UNLIKELY(s >= reginfo->strend)) {
2307                         break;
2308                     }
2309                 }
2310
2311                 if (utf8_target) {
2312                     /* We are at a boundary between char_sub_0 and char_sub_1.
2313                      * We also keep track of the value for char_sub_-1 as we
2314                      * loop through the line.   Context may be needed to make a
2315                      * determination, and if so, this can save having to
2316                      * recalculate it */
2317                     WB_enum previous = WB_UNKNOWN;
2318                     WB_enum before = getWB_VAL_UTF8(
2319                                               reghop3((U8*)s,
2320                                                       -1,
2321                                                       (U8*)(reginfo->strbeg)),
2322                                               (U8*) reginfo->strend);
2323                     while (s < strend) {
2324                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2325                                                         (U8*) reginfo->strend);
2326                         if ((to_complement ^ isWB(previous,
2327                                                   before,
2328                                                   after,
2329                                                   (U8*) reginfo->strbeg,
2330                                                   (U8*) s,
2331                                                   (U8*) reginfo->strend,
2332                                                   utf8_target))
2333                             && (reginfo->intuit || regtry(reginfo, &s)))
2334                         {
2335                             goto got_it;
2336                         }
2337                         previous = before;
2338                         before = after;
2339                         s += UTF8SKIP(s);
2340                     }
2341                 }
2342                 else {  /* Not utf8. */
2343                     WB_enum previous = WB_UNKNOWN;
2344                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2345                     while (s < strend) {
2346                         WB_enum after = getWB_VAL_CP((U8) *s);
2347                         if ((to_complement ^ isWB(previous,
2348                                                   before,
2349                                                   after,
2350                                                   (U8*) reginfo->strbeg,
2351                                                   (U8*) s,
2352                                                   (U8*) reginfo->strend,
2353                                                   utf8_target))
2354                             && (reginfo->intuit || regtry(reginfo, &s)))
2355                         {
2356                             goto got_it;
2357                         }
2358                         previous = before;
2359                         before = after;
2360                         s++;
2361                     }
2362                 }
2363
2364                 if (reginfo->intuit || regtry(reginfo, &s)) {
2365                     goto got_it;
2366                 }
2367         }
2368         break;
2369
2370     case LNBREAK:
2371         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2372                         is_LNBREAK_latin1_safe(s, strend)
2373         );
2374         break;
2375
2376     /* The argument to all the POSIX node types is the class number to pass to
2377      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2378
2379     case NPOSIXL:
2380         to_complement = 1;
2381         /* FALLTHROUGH */
2382
2383     case POSIXL:
2384         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2385         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2386                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2387         break;
2388
2389     case NPOSIXD:
2390         to_complement = 1;
2391         /* FALLTHROUGH */
2392
2393     case POSIXD:
2394         if (utf8_target) {
2395             goto posix_utf8;
2396         }
2397         goto posixa;
2398
2399     case NPOSIXA:
2400         if (utf8_target) {
2401             /* The complement of something that matches only ASCII matches all
2402              * non-ASCII, plus everything in ASCII that isn't in the class. */
2403             REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
2404                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2405             break;
2406         }
2407
2408         to_complement = 1;
2409         /* FALLTHROUGH */
2410
2411     case POSIXA:
2412       posixa:
2413         /* Don't need to worry about utf8, as it can match only a single
2414          * byte invariant character. */
2415         REXEC_FBC_CLASS_SCAN(
2416                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2417         break;
2418
2419     case NPOSIXU:
2420         to_complement = 1;
2421         /* FALLTHROUGH */
2422
2423     case POSIXU:
2424         if (! utf8_target) {
2425             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2426                                                                     FLAGS(c))));
2427         }
2428         else {
2429
2430           posix_utf8:
2431             classnum = (_char_class_number) FLAGS(c);
2432             if (classnum < _FIRST_NON_SWASH_CC) {
2433                 while (s < strend) {
2434
2435                     /* We avoid loading in the swash as long as possible, but
2436                      * should we have to, we jump to a separate loop.  This
2437                      * extra 'if' statement is what keeps this code from being
2438                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2439                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2440                         goto found_above_latin1;
2441                     }
2442                     if ((UTF8_IS_INVARIANT(*s)
2443                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2444                                                                 classnum)))
2445                         || (   UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
2446                             && to_complement ^ cBOOL(
2447                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2448                                                                       *(s + 1)),
2449                                               classnum))))
2450                     {
2451                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2452                             goto got_it;
2453                         else {
2454                             tmp = doevery;
2455                         }
2456                     }
2457                     else {
2458                         tmp = 1;
2459                     }
2460                     s += UTF8SKIP(s);
2461                 }
2462             }
2463             else switch (classnum) {    /* These classes are implemented as
2464                                            macros */
2465                 case _CC_ENUM_SPACE:
2466                     REXEC_FBC_UTF8_CLASS_SCAN(
2467                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2468                     break;
2469
2470                 case _CC_ENUM_BLANK:
2471                     REXEC_FBC_UTF8_CLASS_SCAN(
2472                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2473                     break;
2474
2475                 case _CC_ENUM_XDIGIT:
2476                     REXEC_FBC_UTF8_CLASS_SCAN(
2477                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2478                     break;
2479
2480                 case _CC_ENUM_VERTSPACE:
2481                     REXEC_FBC_UTF8_CLASS_SCAN(
2482                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2483                     break;
2484
2485                 case _CC_ENUM_CNTRL:
2486                     REXEC_FBC_UTF8_CLASS_SCAN(
2487                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2488                     break;
2489
2490                 default:
2491                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2492                     NOT_REACHED; /* NOTREACHED */
2493             }
2494         }
2495         break;
2496
2497       found_above_latin1:   /* Here we have to load a swash to get the result
2498                                for the current code point */
2499         if (! PL_utf8_swash_ptrs[classnum]) {
2500             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2501             PL_utf8_swash_ptrs[classnum] =
2502                     _core_swash_init("utf8",
2503                                      "",
2504                                      &PL_sv_undef, 1, 0,
2505                                      PL_XPosix_ptrs[classnum], &flags);
2506         }
2507
2508         /* This is a copy of the loop above for swash classes, though using the
2509          * FBC macro instead of being expanded out.  Since we've loaded the
2510          * swash, we don't have to check for that each time through the loop */
2511         REXEC_FBC_UTF8_CLASS_SCAN(
2512                 to_complement ^ cBOOL(_generic_utf8_safe(
2513                                       classnum,
2514                                       s,
2515                                       strend,
2516                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2517                                                   (U8 *) s, TRUE))));
2518         break;
2519
2520     case AHOCORASICKC:
2521     case AHOCORASICK:
2522         {
2523             DECL_TRIE_TYPE(c);
2524             /* what trie are we using right now */
2525             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2526             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2527             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2528
2529             const char *last_start = strend - trie->minlen;
2530 #ifdef DEBUGGING
2531             const char *real_start = s;
2532 #endif
2533             STRLEN maxlen = trie->maxlen;
2534             SV *sv_points;
2535             U8 **points; /* map of where we were in the input string
2536                             when reading a given char. For ASCII this
2537                             is unnecessary overhead as the relationship
2538                             is always 1:1, but for Unicode, especially
2539                             case folded Unicode this is not true. */
2540             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2541             U8 *bitmap=NULL;
2542
2543
2544             GET_RE_DEBUG_FLAGS_DECL;
2545
2546             /* We can't just allocate points here. We need to wrap it in
2547              * an SV so it gets freed properly if there is a croak while
2548              * running the match */
2549             ENTER;
2550             SAVETMPS;
2551             sv_points=newSV(maxlen * sizeof(U8 *));
2552             SvCUR_set(sv_points,
2553                 maxlen * sizeof(U8 *));
2554             SvPOK_on(sv_points);
2555             sv_2mortal(sv_points);
2556             points=(U8**)SvPV_nolen(sv_points );
2557             if ( trie_type != trie_utf8_fold
2558                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2559             {
2560                 if (trie->bitmap)
2561                     bitmap=(U8*)trie->bitmap;
2562                 else
2563                     bitmap=(U8*)ANYOF_BITMAP(c);
2564             }
2565             /* this is the Aho-Corasick algorithm modified a touch
2566                to include special handling for long "unknown char" sequences.
2567                The basic idea being that we use AC as long as we are dealing
2568                with a possible matching char, when we encounter an unknown char
2569                (and we have not encountered an accepting state) we scan forward
2570                until we find a legal starting char.
2571                AC matching is basically that of trie matching, except that when
2572                we encounter a failing transition, we fall back to the current
2573                states "fail state", and try the current char again, a process
2574                we repeat until we reach the root state, state 1, or a legal
2575                transition. If we fail on the root state then we can either
2576                terminate if we have reached an accepting state previously, or
2577                restart the entire process from the beginning if we have not.
2578
2579              */
2580             while (s <= last_start) {
2581                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2582                 U8 *uc = (U8*)s;
2583                 U16 charid = 0;
2584                 U32 base = 1;
2585                 U32 state = 1;
2586                 UV uvc = 0;
2587                 STRLEN len = 0;
2588                 STRLEN foldlen = 0;
2589                 U8 *uscan = (U8*)NULL;
2590                 U8 *leftmost = NULL;
2591 #ifdef DEBUGGING
2592                 U32 accepted_word= 0;
2593 #endif
2594                 U32 pointpos = 0;
2595
2596                 while ( state && uc <= (U8*)strend ) {
2597                     int failed=0;
2598                     U32 word = aho->states[ state ].wordnum;
2599
2600                     if( state==1 ) {
2601                         if ( bitmap ) {
2602                             DEBUG_TRIE_EXECUTE_r(
2603                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2604                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2605                                         (char *)uc, utf8_target, 0 );
2606                                     Perl_re_printf( aTHX_
2607                                         " Scanning for legal start char...\n");
2608                                 }
2609                             );
2610                             if (utf8_target) {
2611                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2612                                     uc += UTF8SKIP(uc);
2613                                 }
2614                             } else {
2615                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2616                                     uc++;
2617                                 }
2618                             }
2619                             s= (char *)uc;
2620                         }
2621                         if (uc >(U8*)last_start) break;
2622                     }
2623
2624                     if ( word ) {
2625                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2626                         if (!leftmost || lpos < leftmost) {
2627                             DEBUG_r(accepted_word=word);
2628                             leftmost= lpos;
2629                         }
2630                         if (base==0) break;
2631
2632                     }
2633                     points[pointpos++ % maxlen]= uc;
2634                     if (foldlen || uc < (U8*)strend) {
2635                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2636                                          widecharmap, uc,
2637                                          uscan, len, uvc, charid, foldlen,
2638                                          foldbuf, uniflags);
2639                         DEBUG_TRIE_EXECUTE_r({
2640                             dump_exec_pos( (char *)uc, c, strend,
2641                                         real_start, s, utf8_target, 0);
2642                             Perl_re_printf( aTHX_
2643                                 " Charid:%3u CP:%4" UVxf " ",
2644                                  charid, uvc);
2645                         });
2646                     }
2647                     else {
2648                         len = 0;
2649                         charid = 0;
2650                     }
2651
2652
2653                     do {
2654 #ifdef DEBUGGING
2655                         word = aho->states[ state ].wordnum;
2656 #endif
2657                         base = aho->states[ state ].trans.base;
2658
2659                         DEBUG_TRIE_EXECUTE_r({
2660                             if (failed)
2661                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2662                                     s,   utf8_target, 0 );
2663                             Perl_re_printf( aTHX_
2664                                 "%sState: %4" UVxf ", word=%" UVxf,
2665                                 failed ? " Fail transition to " : "",
2666                                 (UV)state, (UV)word);
2667                         });
2668                         if ( base ) {
2669                             U32 tmp;
2670                             I32 offset;
2671                             if (charid &&
2672                                  ( ((offset = base + charid
2673                                     - 1 - trie->uniquecharcount)) >= 0)
2674                                  && ((U32)offset < trie->lasttrans)
2675                                  && trie->trans[offset].check == state
2676                                  && (tmp=trie->trans[offset].next))
2677                             {
2678                                 DEBUG_TRIE_EXECUTE_r(
2679                                     Perl_re_printf( aTHX_ " - legal\n"));
2680                                 state = tmp;
2681                                 break;
2682                             }
2683                             else {
2684                                 DEBUG_TRIE_EXECUTE_r(
2685                                     Perl_re_printf( aTHX_ " - fail\n"));
2686                                 failed = 1;
2687                                 state = aho->fail[state];
2688                             }
2689                         }
2690                         else {
2691                             /* we must be accepting here */
2692                             DEBUG_TRIE_EXECUTE_r(
2693                                     Perl_re_printf( aTHX_ " - accepting\n"));
2694                             failed = 1;
2695                             break;
2696                         }
2697                     } while(state);
2698                     uc += len;
2699                     if (failed) {
2700                         if (leftmost)
2701                             break;
2702                         if (!state) state = 1;
2703                     }
2704                 }
2705                 if ( aho->states[ state ].wordnum ) {
2706                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2707                     if (!leftmost || lpos < leftmost) {
2708                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2709                         leftmost = lpos;
2710                     }
2711                 }
2712                 if (leftmost) {
2713                     s = (char*)leftmost;
2714                     DEBUG_TRIE_EXECUTE_r({
2715                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
2716                             (UV)accepted_word, (IV)(s - real_start)
2717                         );
2718                     });
2719                     if (reginfo->intuit || regtry(reginfo, &s)) {
2720                         FREETMPS;
2721                         LEAVE;
2722                         goto got_it;
2723                     }
2724                     s = HOPc(s,1);
2725                     DEBUG_TRIE_EXECUTE_r({
2726                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2727                     });
2728                 } else {
2729                     DEBUG_TRIE_EXECUTE_r(
2730                         Perl_re_printf( aTHX_ "No match.\n"));
2731                     break;
2732                 }
2733             }
2734             FREETMPS;
2735             LEAVE;
2736         }
2737         break;
2738     default:
2739         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2740     }
2741     return 0;
2742   got_it:
2743     return s;
2744 }
2745
2746 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2747  * flags have same meanings as with regexec_flags() */
2748
2749 static void
2750 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2751                             char *strbeg,
2752                             char *strend,
2753                             SV *sv,
2754                             U32 flags,
2755                             bool utf8_target)
2756 {
2757     struct regexp *const prog = ReANY(rx);
2758
2759     if (flags & REXEC_COPY_STR) {
2760 #ifdef PERL_ANY_COW
2761         if (SvCANCOW(sv)) {
2762             DEBUG_C(Perl_re_printf( aTHX_
2763                               "Copy on write: regexp capture, type %d\n",
2764                                     (int) SvTYPE(sv)));
2765             /* Create a new COW SV to share the match string and store
2766              * in saved_copy, unless the current COW SV in saved_copy
2767              * is valid and suitable for our purpose */
2768             if ((   prog->saved_copy
2769                  && SvIsCOW(prog->saved_copy)
2770                  && SvPOKp(prog->saved_copy)
2771                  && SvIsCOW(sv)
2772                  && SvPOKp(sv)
2773                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2774             {
2775                 /* just reuse saved_copy SV */
2776                 if (RXp_MATCH_COPIED(prog)) {
2777                     Safefree(prog->subbeg);
2778                     RXp_MATCH_COPIED_off(prog);
2779                 }
2780             }
2781             else {
2782                 /* create new COW SV to share string */
2783                 RXp_MATCH_COPY_FREE(prog);
2784                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2785             }
2786             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2787             assert (SvPOKp(prog->saved_copy));
2788             prog->sublen  = strend - strbeg;
2789             prog->suboffset = 0;
2790             prog->subcoffset = 0;
2791         } else
2792 #endif
2793         {
2794             SSize_t min = 0;
2795             SSize_t max = strend - strbeg;
2796             SSize_t sublen;
2797
2798             if (    (flags & REXEC_COPY_SKIP_POST)
2799                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2800                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2801             ) { /* don't copy $' part of string */
2802                 U32 n = 0;
2803                 max = -1;
2804                 /* calculate the right-most part of the string covered
2805                  * by a capture. Due to lookahead, this may be to
2806                  * the right of $&, so we have to scan all captures */
2807                 while (n <= prog->lastparen) {
2808                     if (prog->offs[n].end > max)
2809                         max = prog->offs[n].end;
2810                     n++;
2811                 }
2812                 if (max == -1)
2813                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2814                             ? prog->offs[0].start
2815                             : 0;
2816                 assert(max >= 0 && max <= strend - strbeg);
2817             }
2818
2819             if (    (flags & REXEC_COPY_SKIP_PRE)
2820                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2821                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2822             ) { /* don't copy $` part of string */
2823                 U32 n = 0;
2824                 min = max;
2825                 /* calculate the left-most part of the string covered
2826                  * by a capture. Due to lookbehind, this may be to
2827                  * the left of $&, so we have to scan all captures */
2828                 while (min && n <= prog->lastparen) {
2829                     if (   prog->offs[n].start != -1
2830                         && prog->offs[n].start < min)
2831                     {
2832                         min = prog->offs[n].start;
2833                     }
2834                     n++;
2835                 }
2836                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2837                     && min >  prog->offs[0].end
2838                 )
2839                     min = prog->offs[0].end;
2840
2841             }
2842
2843             assert(min >= 0 && min <= max && min <= strend - strbeg);
2844             sublen = max - min;
2845
2846             if (RXp_MATCH_COPIED(prog)) {
2847                 if (sublen > prog->sublen)
2848                     prog->subbeg =
2849                             (char*)saferealloc(prog->subbeg, sublen+1);
2850             }
2851             else
2852                 prog->subbeg = (char*)safemalloc(sublen+1);
2853             Copy(strbeg + min, prog->subbeg, sublen, char);
2854             prog->subbeg[sublen] = '\0';
2855             prog->suboffset = min;
2856             prog->sublen = sublen;
2857             RXp_MATCH_COPIED_on(prog);
2858         }
2859         prog->subcoffset = prog->suboffset;
2860         if (prog->suboffset && utf8_target) {
2861             /* Convert byte offset to chars.
2862              * XXX ideally should only compute this if @-/@+
2863              * has been seen, a la PL_sawampersand ??? */
2864
2865             /* If there's a direct correspondence between the
2866              * string which we're matching and the original SV,
2867              * then we can use the utf8 len cache associated with
2868              * the SV. In particular, it means that under //g,
2869              * sv_pos_b2u() will use the previously cached
2870              * position to speed up working out the new length of
2871              * subcoffset, rather than counting from the start of
2872              * the string each time. This stops
2873              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2874              * from going quadratic */
2875             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2876                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2877                                                 SV_GMAGIC|SV_CONST_RETURN);
2878             else
2879                 prog->subcoffset = utf8_length((U8*)strbeg,
2880                                     (U8*)(strbeg+prog->suboffset));
2881         }
2882     }
2883     else {
2884         RXp_MATCH_COPY_FREE(prog);
2885         prog->subbeg = strbeg;
2886         prog->suboffset = 0;
2887         prog->subcoffset = 0;
2888         prog->sublen = strend - strbeg;
2889     }
2890 }
2891
2892
2893
2894
2895 /*
2896  - regexec_flags - match a regexp against a string
2897  */
2898 I32
2899 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2900               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2901 /* stringarg: the point in the string at which to begin matching */
2902 /* strend:    pointer to null at end of string */
2903 /* strbeg:    real beginning of string */
2904 /* minend:    end of match must be >= minend bytes after stringarg. */
2905 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2906  *            itself is accessed via the pointers above */
2907 /* data:      May be used for some additional optimizations.
2908               Currently unused. */
2909 /* flags:     For optimizations. See REXEC_* in regexp.h */
2910
2911 {
2912     struct regexp *const prog = ReANY(rx);
2913     char *s;
2914     regnode *c;
2915     char *startpos;
2916     SSize_t minlen;             /* must match at least this many chars */
2917     SSize_t dontbother = 0;     /* how many characters not to try at end */
2918     const bool utf8_target = cBOOL(DO_UTF8(sv));
2919     I32 multiline;
2920     RXi_GET_DECL(prog,progi);
2921     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2922     regmatch_info *const reginfo = &reginfo_buf;
2923     regexp_paren_pair *swap = NULL;
2924     I32 oldsave;
2925     GET_RE_DEBUG_FLAGS_DECL;
2926
2927     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2928     PERL_UNUSED_ARG(data);
2929
2930     /* Be paranoid... */
2931     if (prog == NULL) {
2932         Perl_croak(aTHX_ "NULL regexp parameter");
2933     }
2934
2935     DEBUG_EXECUTE_r(
2936         debug_start_match(rx, utf8_target, stringarg, strend,
2937         "Matching");
2938     );
2939
2940     startpos = stringarg;
2941
2942     /* set these early as they may be used by the HOP macros below */
2943     reginfo->strbeg = strbeg;
2944     reginfo->strend = strend;
2945     reginfo->is_utf8_target = cBOOL(utf8_target);
2946
2947     if (prog->intflags & PREGf_GPOS_SEEN) {
2948         MAGIC *mg;
2949
2950         /* set reginfo->ganch, the position where \G can match */
2951
2952         reginfo->ganch =
2953             (flags & REXEC_IGNOREPOS)
2954             ? stringarg /* use start pos rather than pos() */
2955             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2956               /* Defined pos(): */
2957             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2958             : strbeg; /* pos() not defined; use start of string */
2959
2960         DEBUG_GPOS_r(Perl_re_printf( aTHX_
2961             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
2962
2963         /* in the presence of \G, we may need to start looking earlier in
2964          * the string than the suggested start point of stringarg:
2965          * if prog->gofs is set, then that's a known, fixed minimum
2966          * offset, such as
2967          * /..\G/:   gofs = 2
2968          * /ab|c\G/: gofs = 1
2969          * or if the minimum offset isn't known, then we have to go back
2970          * to the start of the string, e.g. /w+\G/
2971          */
2972
2973         if (prog->intflags & PREGf_ANCH_GPOS) {
2974             if (prog->gofs) {
2975                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
2976                 if (!startpos ||
2977                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
2978                 {
2979                     DEBUG_r(Perl_re_printf( aTHX_
2980                             "fail: ganch-gofs before earliest possible start\n"));
2981                     return 0;
2982                 }
2983             }
2984             else
2985                 startpos = reginfo->ganch;
2986         }
2987         else if (prog->gofs) {
2988             startpos = HOPBACKc(startpos, prog->gofs);
2989             if (!startpos)
2990                 startpos = strbeg;
2991         }
2992         else if (prog->intflags & PREGf_GPOS_FLOAT)
2993             startpos = strbeg;
2994     }
2995
2996     minlen = prog->minlen;
2997     if ((startpos + minlen) > strend || startpos < strbeg) {
2998         DEBUG_r(Perl_re_printf( aTHX_
2999                     "Regex match can't succeed, so not even tried\n"));
3000         return 0;
3001     }
3002
3003     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3004      * which will call destuctors to reset PL_regmatch_state, free higher
3005      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3006      * regmatch_info_aux_eval */
3007
3008     oldsave = PL_savestack_ix;
3009
3010     s = startpos;
3011
3012     if ((prog->extflags & RXf_USE_INTUIT)
3013         && !(flags & REXEC_CHECKED))
3014     {
3015         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3016                                     flags, NULL);
3017         if (!s)
3018             return 0;
3019
3020         if (prog->extflags & RXf_CHECK_ALL) {
3021             /* we can match based purely on the result of INTUIT.
3022              * Set up captures etc just for $& and $-[0]
3023              * (an intuit-only match wont have $1,$2,..) */
3024             assert(!prog->nparens);
3025
3026             /* s/// doesn't like it if $& is earlier than where we asked it to
3027              * start searching (which can happen on something like /.\G/) */
3028             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3029                     && (s < stringarg))
3030             {
3031                 /* this should only be possible under \G */
3032                 assert(prog->intflags & PREGf_GPOS_SEEN);
3033                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3034                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3035                 goto phooey;
3036             }
3037
3038             /* match via INTUIT shouldn't have any captures.
3039              * Let @-, @+, $^N know */
3040             prog->lastparen = prog->lastcloseparen = 0;
3041             RXp_MATCH_UTF8_set(prog, utf8_target);
3042             prog->offs[0].start = s - strbeg;
3043             prog->offs[0].end = utf8_target
3044                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3045                 : s - strbeg + prog->minlenret;
3046             if ( !(flags & REXEC_NOT_FIRST) )
3047                 S_reg_set_capture_string(aTHX_ rx,
3048                                         strbeg, strend,
3049                                         sv, flags, utf8_target);
3050
3051             return 1;
3052         }
3053     }
3054
3055     multiline = prog->extflags & RXf_PMf_MULTILINE;
3056
3057     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3058         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3059                               "String too short [regexec_flags]...\n"));
3060         goto phooey;
3061     }
3062
3063     /* Check validity of program. */
3064     if (UCHARAT(progi->program) != REG_MAGIC) {
3065         Perl_croak(aTHX_ "corrupted regexp program");
3066     }
3067
3068     RXp_MATCH_TAINTED_off(prog);
3069     RXp_MATCH_UTF8_set(prog, utf8_target);
3070
3071     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3072     reginfo->intuit = 0;
3073     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3074     reginfo->warned = FALSE;
3075     reginfo->sv = sv;
3076     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3077     /* see how far we have to get to not match where we matched before */
3078     reginfo->till = stringarg + minend;
3079
3080     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3081         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3082            S_cleanup_regmatch_info_aux has executed (registered by
3083            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3084            magic belonging to this SV.
3085            Not newSVsv, either, as it does not COW.
3086         */
3087         reginfo->sv = newSV(0);
3088         SvSetSV_nosteal(reginfo->sv, sv);
3089         SAVEFREESV(reginfo->sv);
3090     }
3091
3092     /* reserve next 2 or 3 slots in PL_regmatch_state:
3093      * slot N+0: may currently be in use: skip it
3094      * slot N+1: use for regmatch_info_aux struct
3095      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3096      * slot N+3: ready for use by regmatch()
3097      */
3098
3099     {
3100         regmatch_state *old_regmatch_state;
3101         regmatch_slab  *old_regmatch_slab;
3102         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3103
3104         /* on first ever match, allocate first slab */
3105         if (!PL_regmatch_slab) {
3106             Newx(PL_regmatch_slab, 1, regmatch_slab);
3107             PL_regmatch_slab->prev = NULL;
3108             PL_regmatch_slab->next = NULL;
3109             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3110         }
3111
3112         old_regmatch_state = PL_regmatch_state;
3113         old_regmatch_slab  = PL_regmatch_slab;
3114
3115         for (i=0; i <= max; i++) {
3116             if (i == 1)
3117                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3118             else if (i ==2)
3119                 reginfo->info_aux_eval =
3120                 reginfo->info_aux->info_aux_eval =
3121                             &(PL_regmatch_state->u.info_aux_eval);
3122
3123             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3124                 PL_regmatch_state = S_push_slab(aTHX);
3125         }
3126
3127         /* note initial PL_regmatch_state position; at end of match we'll
3128          * pop back to there and free any higher slabs */
3129
3130         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3131         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3132         reginfo->info_aux->poscache = NULL;
3133
3134         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3135
3136         if ((prog->extflags & RXf_EVAL_SEEN))
3137             S_setup_eval_state(aTHX_ reginfo);
3138         else
3139             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3140     }
3141
3142     /* If there is a "must appear" string, look for it. */
3143
3144     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3145         /* We have to be careful. If the previous successful match
3146            was from this regex we don't want a subsequent partially
3147            successful match to clobber the old results.
3148            So when we detect this possibility we add a swap buffer
3149            to the re, and switch the buffer each match. If we fail,
3150            we switch it back; otherwise we leave it swapped.
3151         */
3152         swap = prog->offs;
3153         /* do we need a save destructor here for eval dies? */
3154         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3155         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3156             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3157             0,
3158             PTR2UV(prog),
3159             PTR2UV(swap),
3160             PTR2UV(prog->offs)
3161         ));
3162     }
3163
3164     if (prog->recurse_locinput)
3165         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3166
3167     /* Simplest case: anchored match need be tried only once, or with
3168      * MBOL, only at the beginning of each line.
3169      *
3170      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3171      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3172      * match at the start of the string then it won't match anywhere else
3173      * either; while with /.*.../, if it doesn't match at the beginning,
3174      * the earliest it could match is at the start of the next line */
3175
3176     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3177         char *end;
3178
3179         if (regtry(reginfo, &s))
3180             goto got_it;
3181
3182         if (!(prog->intflags & PREGf_ANCH_MBOL))
3183             goto phooey;
3184
3185         /* didn't match at start, try at other newline positions */
3186
3187         if (minlen)
3188             dontbother = minlen - 1;
3189         end = HOP3c(strend, -dontbother, strbeg) - 1;
3190
3191         /* skip to next newline */
3192
3193         while (s <= end) { /* note it could be possible to match at the end of the string */
3194             /* NB: newlines are the same in unicode as they are in latin */
3195             if (*s++ != '\n')
3196                 continue;
3197             if (prog->check_substr || prog->check_utf8) {
3198             /* note that with PREGf_IMPLICIT, intuit can only fail
3199              * or return the start position, so it's of limited utility.
3200              * Nevertheless, I made the decision that the potential for
3201              * quick fail was still worth it - DAPM */
3202                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3203                 if (!s)
3204                     goto phooey;
3205             }
3206             if (regtry(reginfo, &s))
3207                 goto got_it;
3208         }
3209         goto phooey;
3210     } /* end anchored search */
3211
3212     if (prog->intflags & PREGf_ANCH_GPOS)
3213     {
3214         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3215         assert(prog->intflags & PREGf_GPOS_SEEN);
3216         /* For anchored \G, the only position it can match from is
3217          * (ganch-gofs); we already set startpos to this above; if intuit
3218          * moved us on from there, we can't possibly succeed */
3219         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3220         if (s == startpos && regtry(reginfo, &s))
3221             goto got_it;
3222         goto phooey;
3223     }
3224
3225     /* Messy cases:  unanchored match. */
3226     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3227         /* we have /x+whatever/ */
3228         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3229         char ch;
3230 #ifdef DEBUGGING
3231         int did_match = 0;
3232 #endif
3233         if (utf8_target) {
3234             if (! prog->anchored_utf8) {
3235                 to_utf8_substr(prog);
3236             }
3237             ch = SvPVX_const(prog->anchored_utf8)[0];
3238             REXEC_FBC_SCAN(
3239                 if (*s == ch) {
3240                     DEBUG_EXECUTE_r( did_match = 1 );
3241                     if (regtry(reginfo, &s)) goto got_it;
3242                     s += UTF8SKIP(s);
3243                     while (s < strend && *s == ch)
3244                         s += UTF8SKIP(s);
3245                 }
3246             );
3247
3248         }
3249         else {
3250             if (! prog->anchored_substr) {
3251                 if (! to_byte_substr(prog)) {
3252                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3253                 }
3254             }
3255             ch = SvPVX_const(prog->anchored_substr)[0];
3256             REXEC_FBC_SCAN(
3257                 if (*s == ch) {
3258                     DEBUG_EXECUTE_r( did_match = 1 );
3259                     if (regtry(reginfo, &s)) goto got_it;
3260                     s++;
3261                     while (s < strend && *s == ch)
3262                         s++;
3263                 }
3264             );
3265         }
3266         DEBUG_EXECUTE_r(if (!did_match)
3267                 Perl_re_printf( aTHX_
3268                                   "Did not find anchored character...\n")
3269                );
3270     }
3271     else if (prog->anchored_substr != NULL
3272               || prog->anchored_utf8 != NULL
3273               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3274                   && prog->float_max_offset < strend - s)) {
3275         SV *must;
3276         SSize_t back_max;
3277         SSize_t back_min;
3278         char *last;
3279         char *last1;            /* Last position checked before */
3280 #ifdef DEBUGGING
3281         int did_match = 0;
3282 #endif
3283         if (prog->anchored_substr || prog->anchored_utf8) {
3284             if (utf8_target) {
3285                 if (! prog->anchored_utf8) {
3286                     to_utf8_substr(prog);
3287                 }
3288                 must = prog->anchored_utf8;
3289             }
3290             else {
3291                 if (! prog->anchored_substr) {
3292                     if (! to_byte_substr(prog)) {
3293                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3294                     }
3295                 }
3296                 must = prog->anchored_substr;
3297             }
3298             back_max = back_min = prog->anchored_offset;
3299         } else {
3300             if (utf8_target) {
3301                 if (! prog->float_utf8) {
3302                     to_utf8_substr(prog);
3303                 }
3304                 must = prog->float_utf8;
3305             }
3306             else {
3307                 if (! prog->float_substr) {
3308                     if (! to_byte_substr(prog)) {
3309                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3310                     }
3311                 }
3312                 must = prog->float_substr;
3313             }
3314             back_max = prog->float_max_offset;
3315             back_min = prog->float_min_offset;
3316         }
3317
3318         if (back_min<0) {
3319             last = strend;
3320         } else {
3321             last = HOP3c(strend,        /* Cannot start after this */
3322                   -(SSize_t)(CHR_SVLEN(must)
3323                          - (SvTAIL(must) != 0) + back_min), strbeg);
3324         }
3325         if (s > reginfo->strbeg)
3326             last1 = HOPc(s, -1);
3327         else
3328             last1 = s - 1;      /* bogus */
3329
3330         /* XXXX check_substr already used to find "s", can optimize if
3331            check_substr==must. */
3332         dontbother = 0;
3333         strend = HOPc(strend, -dontbother);
3334         while ( (s <= last) &&
3335                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3336                                   (unsigned char*)strend, must,
3337                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3338             DEBUG_EXECUTE_r( did_match = 1 );
3339             if (HOPc(s, -back_max) > last1) {
3340                 last1 = HOPc(s, -back_min);
3341                 s = HOPc(s, -back_max);
3342             }
3343             else {
3344                 char * const t = (last1 >= reginfo->strbeg)
3345                                     ? HOPc(last1, 1) : last1 + 1;
3346
3347                 last1 = HOPc(s, -back_min);
3348                 s = t;
3349             }
3350             if (utf8_target) {
3351                 while (s <= last1) {
3352                     if (regtry(reginfo, &s))
3353                         goto got_it;
3354                     if (s >= last1) {
3355                         s++; /* to break out of outer loop */
3356                         break;
3357                     }
3358                     s += UTF8SKIP(s);
3359                 }
3360             }
3361             else {
3362                 while (s <= last1) {
3363                     if (regtry(reginfo, &s))
3364                         goto got_it;
3365                     s++;
3366                 }
3367             }
3368         }
3369         DEBUG_EXECUTE_r(if (!did_match) {
3370             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3371                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3372             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3373                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3374                                ? "anchored" : "floating"),
3375                 quoted, RE_SV_TAIL(must));
3376         });
3377         goto phooey;
3378     }
3379     else if ( (c = progi->regstclass) ) {
3380         if (minlen) {
3381             const OPCODE op = OP(progi->regstclass);
3382             /* don't bother with what can't match */
3383             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3384                 strend = HOPc(strend, -(minlen - 1));
3385         }
3386         DEBUG_EXECUTE_r({
3387             SV * const prop = sv_newmortal();
3388             regprop(prog, prop, c, reginfo, NULL);
3389             {
3390                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3391                     s,strend-s,PL_dump_re_max_len);
3392                 Perl_re_printf( aTHX_
3393                     "Matching stclass %.*s against %s (%d bytes)\n",
3394                     (int)SvCUR(prop), SvPVX_const(prop),
3395                      quoted, (int)(strend - s));
3396             }
3397         });
3398         if (find_byclass(prog, c, s, strend, reginfo))
3399             goto got_it;
3400         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3401     }
3402     else {
3403         dontbother = 0;
3404         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3405             /* Trim the end. */
3406             char *last= NULL;
3407             SV* float_real;
3408             STRLEN len;
3409             const char *little;
3410
3411             if (utf8_target) {
3412                 if (! prog->float_utf8) {
3413                     to_utf8_substr(prog);
3414                 }
3415                 float_real = prog->float_utf8;
3416             }
3417             else {
3418                 if (! prog->float_substr) {
3419                     if (! to_byte_substr(prog)) {
3420                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3421                     }
3422                 }
3423                 float_real = prog->float_substr;
3424             }
3425
3426             little = SvPV_const(float_real, len);
3427             if (SvTAIL(float_real)) {
3428                     /* This means that float_real contains an artificial \n on
3429                      * the end due to the presence of something like this:
3430                      * /foo$/ where we can match both "foo" and "foo\n" at the
3431                      * end of the string.  So we have to compare the end of the
3432                      * string first against the float_real without the \n and
3433                      * then against the full float_real with the string.  We
3434                      * have to watch out for cases where the string might be
3435                      * smaller than the float_real or the float_real without
3436                      * the \n. */
3437                     char *checkpos= strend - len;
3438                     DEBUG_OPTIMISE_r(
3439                         Perl_re_printf( aTHX_
3440                             "%sChecking for float_real.%s\n",
3441                             PL_colors[4], PL_colors[5]));
3442                     if (checkpos + 1 < strbeg) {
3443                         /* can't match, even if we remove the trailing \n
3444                          * string is too short to match */
3445                         DEBUG_EXECUTE_r(
3446                             Perl_re_printf( aTHX_
3447                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3448                                 PL_colors[4], PL_colors[5]));
3449                         goto phooey;
3450                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3451                         /* can match, the end of the string matches without the
3452                          * "\n" */
3453                         last = checkpos + 1;
3454                     } else if (checkpos < strbeg) {
3455                         /* cant match, string is too short when the "\n" is
3456                          * included */
3457                         DEBUG_EXECUTE_r(
3458                             Perl_re_printf( aTHX_
3459                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3460                                 PL_colors[4], PL_colors[5]));
3461                         goto phooey;
3462                     } else if (!multiline) {
3463                         /* non multiline match, so compare with the "\n" at the
3464                          * end of the string */
3465                         if (memEQ(checkpos, little, len)) {
3466                             last= checkpos;
3467                         } else {
3468                             DEBUG_EXECUTE_r(
3469                                 Perl_re_printf( aTHX_
3470                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3471                                     PL_colors[4], PL_colors[5]));
3472                             goto phooey;
3473                         }
3474                     } else {
3475                         /* multiline match, so we have to search for a place
3476                          * where the full string is located */
3477                         goto find_last;
3478                     }
3479             } else {
3480                   find_last:
3481                     if (len)
3482                         last = rninstr(s, strend, little, little + len);
3483                     else
3484                         last = strend;  /* matching "$" */
3485             }
3486             if (!last) {
3487                 /* at one point this block contained a comment which was
3488                  * probably incorrect, which said that this was a "should not
3489                  * happen" case.  Even if it was true when it was written I am
3490                  * pretty sure it is not anymore, so I have removed the comment
3491                  * and replaced it with this one. Yves */
3492                 DEBUG_EXECUTE_r(
3493                     Perl_re_printf( aTHX_
3494                         "%sString does not contain required substring, cannot match.%s\n",
3495                         PL_colors[4], PL_colors[5]
3496                     ));
3497                 goto phooey;
3498             }
3499             dontbother = strend - last + prog->float_min_offset;
3500         }
3501         if (minlen && (dontbother < minlen))
3502             dontbother = minlen - 1;
3503         strend -= dontbother;              /* this one's always in bytes! */
3504         /* We don't know much -- general case. */
3505         if (utf8_target) {
3506             for (;;) {
3507                 if (regtry(reginfo, &s))
3508                     goto got_it;
3509                 if (s >= strend)
3510                     break;
3511                 s += UTF8SKIP(s);
3512             };
3513         }
3514         else {
3515             do {
3516                 if (regtry(reginfo, &s))
3517                     goto got_it;
3518             } while (s++ < strend);
3519         }
3520     }
3521
3522     /* Failure. */
3523     goto phooey;
3524
3525   got_it:
3526     /* s/// doesn't like it if $& is earlier than where we asked it to
3527      * start searching (which can happen on something like /.\G/) */
3528     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3529             && (prog->offs[0].start < stringarg - strbeg))
3530     {
3531         /* this should only be possible under \G */
3532         assert(prog->intflags & PREGf_GPOS_SEEN);
3533         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3534             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3535         goto phooey;
3536     }
3537
3538     DEBUG_BUFFERS_r(
3539         if (swap)
3540             Perl_re_exec_indentf( aTHX_
3541                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3542                 0,
3543                 PTR2UV(prog),
3544                 PTR2UV(swap)
3545             );
3546     );
3547     Safefree(swap);
3548
3549     /* clean up; this will trigger destructors that will free all slabs
3550      * above the current one, and cleanup the regmatch_info_aux
3551      * and regmatch_info_aux_eval sructs */
3552
3553     LEAVE_SCOPE(oldsave);
3554
3555     if (RXp_PAREN_NAMES(prog))
3556         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3557
3558     /* make sure $`, $&, $', and $digit will work later */
3559     if ( !(flags & REXEC_NOT_FIRST) )
3560         S_reg_set_capture_string(aTHX_ rx,
3561                                     strbeg, reginfo->strend,
3562                                     sv, flags, utf8_target);
3563
3564     return 1;
3565
3566   phooey:
3567     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3568                           PL_colors[4], PL_colors[5]));
3569
3570     /* clean up; this will trigger destructors that will free all slabs
3571      * above the current one, and cleanup the regmatch_info_aux
3572      * and regmatch_info_aux_eval sructs */
3573
3574     LEAVE_SCOPE(oldsave);
3575
3576     if (swap) {
3577         /* we failed :-( roll it back */
3578         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3579             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3580             0,
3581             PTR2UV(prog),
3582             PTR2UV(prog->offs),
3583             PTR2UV(swap)
3584         ));
3585         Safefree(prog->offs);
3586         prog->offs = swap;
3587     }
3588     return 0;
3589 }
3590
3591
3592 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3593  * Do inc before dec, in case old and new rex are the same */
3594 #define SET_reg_curpm(Re2)                          \
3595     if (reginfo->info_aux_eval) {                   \
3596         (void)ReREFCNT_inc(Re2);                    \
3597         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3598         PM_SETRE((PL_reg_curpm), (Re2));            \
3599     }
3600
3601
3602 /*
3603  - regtry - try match at specific point
3604  */
3605 STATIC bool                     /* 0 failure, 1 success */
3606 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3607 {
3608     CHECKPOINT lastcp;
3609     REGEXP *const rx = reginfo->prog;
3610     regexp *const prog = ReANY(rx);
3611     SSize_t result;
3612 #ifdef DEBUGGING
3613     U32 depth = 0; /* used by REGCP_SET */
3614 #endif
3615     RXi_GET_DECL(prog,progi);
3616     GET_RE_DEBUG_FLAGS_DECL;
3617
3618     PERL_ARGS_ASSERT_REGTRY;
3619
3620     reginfo->cutpoint=NULL;
3621
3622     prog->offs[0].start = *startposp - reginfo->strbeg;
3623     prog->lastparen = 0;
3624     prog->lastcloseparen = 0;
3625
3626     /* XXXX What this code is doing here?!!!  There should be no need
3627        to do this again and again, prog->lastparen should take care of
3628        this!  --ilya*/
3629
3630     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3631      * Actually, the code in regcppop() (which Ilya may be meaning by
3632      * prog->lastparen), is not needed at all by the test suite
3633      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3634      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3635      * Meanwhile, this code *is* needed for the
3636      * above-mentioned test suite tests to succeed.  The common theme
3637      * on those tests seems to be returning null fields from matches.
3638      * --jhi updated by dapm */
3639
3640     /* After encountering a variant of the issue mentioned above I think
3641      * the point Ilya was making is that if we properly unwind whenever
3642      * we set lastparen to a smaller value then we should not need to do
3643      * this every time, only when needed. So if we have tests that fail if
3644      * we remove this, then it suggests somewhere else we are improperly
3645      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3646      * places it is called, and related regcp() routines. - Yves */
3647 #if 1
3648     if (prog->nparens) {
3649         regexp_paren_pair *pp = prog->offs;
3650         I32 i;
3651         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3652             ++pp;
3653             pp->start = -1;
3654             pp->end = -1;
3655         }
3656     }
3657 #endif
3658     REGCP_SET(lastcp);
3659     result = regmatch(reginfo, *startposp, progi->program + 1);
3660     if (result != -1) {
3661         prog->offs[0].end = result;
3662         return 1;
3663     }
3664     if (reginfo->cutpoint)
3665         *startposp= reginfo->cutpoint;
3666     REGCP_UNWIND(lastcp);
3667     return 0;
3668 }
3669
3670
3671 #define sayYES goto yes
3672 #define sayNO goto no
3673 #define sayNO_SILENT goto no_silent
3674
3675 /* we dont use STMT_START/END here because it leads to
3676    "unreachable code" warnings, which are bogus, but distracting. */
3677 #define CACHEsayNO \
3678     if (ST.cache_mask) \
3679        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3680     sayNO
3681
3682 /* this is used to determine how far from the left messages like
3683    'failed...' are printed in regexec.c. It should be set such that
3684    messages are inline with the regop output that created them.
3685 */
3686 #define REPORT_CODE_OFF 29
3687 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3688 #ifdef DEBUGGING
3689 int
3690 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3691 {
3692     va_list ap;
3693     int result;
3694     PerlIO *f= Perl_debug_log;
3695     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3696     va_start(ap, depth);
3697     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3698     result = PerlIO_vprintf(f, fmt, ap);
3699     va_end(ap);
3700     return result;
3701 }
3702 #endif /* DEBUGGING */
3703
3704
3705 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3706 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3707 #define CHRTEST_NOT_A_CP_1 -999
3708 #define CHRTEST_NOT_A_CP_2 -998
3709
3710 /* grab a new slab and return the first slot in it */
3711
3712 STATIC regmatch_state *
3713 S_push_slab(pTHX)
3714 {
3715     regmatch_slab *s = PL_regmatch_slab->next;
3716     if (!s) {
3717         Newx(s, 1, regmatch_slab);
3718         s->prev = PL_regmatch_slab;
3719         s->next = NULL;
3720         PL_regmatch_slab->next = s;
3721     }
3722     PL_regmatch_slab = s;
3723     return SLAB_FIRST(s);
3724 }
3725
3726
3727 /* push a new state then goto it */
3728
3729 #define PUSH_STATE_GOTO(state, node, input) \
3730     pushinput = input; \
3731     scan = node; \
3732     st->resume_state = state; \
3733     goto push_state;
3734
3735 /* push a new state with success backtracking, then goto it */
3736
3737 #define PUSH_YES_STATE_GOTO(state, node, input) \
3738     pushinput = input; \
3739     scan = node; \
3740     st->resume_state = state; \
3741     goto push_yes_state;
3742
3743
3744
3745
3746 /*
3747
3748 regmatch() - main matching routine
3749
3750 This is basically one big switch statement in a loop. We execute an op,
3751 set 'next' to point the next op, and continue. If we come to a point which
3752 we may need to backtrack to on failure such as (A|B|C), we push a
3753 backtrack state onto the backtrack stack. On failure, we pop the top
3754 state, and re-enter the loop at the state indicated. If there are no more
3755 states to pop, we return failure.
3756
3757 Sometimes we also need to backtrack on success; for example /A+/, where
3758 after successfully matching one A, we need to go back and try to
3759 match another one; similarly for lookahead assertions: if the assertion
3760 completes successfully, we backtrack to the state just before the assertion
3761 and then carry on.  In these cases, the pushed state is marked as
3762 'backtrack on success too'. This marking is in fact done by a chain of
3763 pointers, each pointing to the previous 'yes' state. On success, we pop to
3764 the nearest yes state, discarding any intermediate failure-only states.
3765 Sometimes a yes state is pushed just to force some cleanup code to be
3766 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3767 it to free the inner regex.
3768
3769 Note that failure backtracking rewinds the cursor position, while
3770 success backtracking leaves it alone.
3771
3772 A pattern is complete when the END op is executed, while a subpattern
3773 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3774 ops trigger the "pop to last yes state if any, otherwise return true"
3775 behaviour.
3776
3777 A common convention in this function is to use A and B to refer to the two
3778 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3779 the subpattern to be matched possibly multiple times, while B is the entire
3780 rest of the pattern. Variable and state names reflect this convention.
3781
3782 The states in the main switch are the union of ops and failure/success of
3783 substates associated with with that op.  For example, IFMATCH is the op
3784 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3785 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3786 successfully matched A and IFMATCH_A_fail is a state saying that we have
3787 just failed to match A. Resume states always come in pairs. The backtrack
3788 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3789 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3790 on success or failure.
3791
3792 The struct that holds a backtracking state is actually a big union, with
3793 one variant for each major type of op. The variable st points to the
3794 top-most backtrack struct. To make the code clearer, within each
3795 block of code we #define ST to alias the relevant union.
3796
3797 Here's a concrete example of a (vastly oversimplified) IFMATCH
3798 implementation:
3799
3800     switch (state) {
3801     ....
3802
3803 #define ST st->u.ifmatch
3804
3805     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3806         ST.foo = ...; // some state we wish to save
3807         ...
3808         // push a yes backtrack state with a resume value of
3809         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3810         // first node of A:
3811         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3812         // NOTREACHED
3813
3814     case IFMATCH_A: // we have successfully executed A; now continue with B
3815         next = B;
3816         bar = ST.foo; // do something with the preserved value
3817         break;
3818
3819     case IFMATCH_A_fail: // A failed, so the assertion failed
3820         ...;   // do some housekeeping, then ...
3821         sayNO; // propagate the failure
3822
3823 #undef ST
3824
3825     ...
3826     }
3827
3828 For any old-timers reading this who are familiar with the old recursive
3829 approach, the code above is equivalent to:
3830
3831     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3832     {
3833         int foo = ...
3834         ...
3835         if (regmatch(A)) {
3836             next = B;
3837             bar = foo;
3838             break;
3839         }
3840         ...;   // do some housekeeping, then ...
3841         sayNO; // propagate the failure
3842     }
3843
3844 The topmost backtrack state, pointed to by st, is usually free. If you
3845 want to claim it, populate any ST.foo fields in it with values you wish to
3846 save, then do one of
3847
3848         PUSH_STATE_GOTO(resume_state, node, newinput);
3849         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3850
3851 which sets that backtrack state's resume value to 'resume_state', pushes a
3852 new free entry to the top of the backtrack stack, then goes to 'node'.
3853 On backtracking, the free slot is popped, and the saved state becomes the
3854 new free state. An ST.foo field in this new top state can be temporarily
3855 accessed to retrieve values, but once the main loop is re-entered, it
3856 becomes available for reuse.
3857
3858 Note that the depth of the backtrack stack constantly increases during the
3859 left-to-right execution of the pattern, rather than going up and down with
3860 the pattern nesting. For example the stack is at its maximum at Z at the
3861 end of the pattern, rather than at X in the following:
3862
3863     /(((X)+)+)+....(Y)+....Z/
3864
3865 The only exceptions to this are lookahead/behind assertions and the cut,
3866 (?>A), which pop all the backtrack states associated with A before
3867 continuing.
3868
3869 Backtrack state structs are allocated in slabs of about 4K in size.
3870 PL_regmatch_state and st always point to the currently active state,
3871 and PL_regmatch_slab points to the slab currently containing
3872 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3873 allocated, and is never freed until interpreter destruction. When the slab
3874 is full, a new one is allocated and chained to the end. At exit from
3875 regmatch(), slabs allocated since entry are freed.
3876
3877 */
3878
3879
3880 #define DEBUG_STATE_pp(pp)                                  \
3881     DEBUG_STATE_r({                                         \
3882         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
3883         Perl_re_printf( aTHX_                                           \
3884             "%*s" pp " %s%s%s%s%s\n",                       \
3885             INDENT_CHARS(depth), "",                        \
3886             PL_reg_name[st->resume_state],                  \
3887             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3888             ((st==yes_state) ? "Y" : ""),                   \
3889             ((st==mark_state) ? "M" : ""),                  \
3890             ((st==yes_state||st==mark_state) ? "]" : "")    \
3891         );                                                  \
3892     });
3893
3894
3895 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3896
3897 #ifdef DEBUGGING
3898
3899 STATIC void
3900 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3901     const char *start, const char *end, const char *blurb)
3902 {
3903     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3904
3905     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3906
3907     if (!PL_colorset)
3908             reginitcolors();
3909     {
3910         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3911             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
3912
3913         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3914             start, end - start, PL_dump_re_max_len);
3915
3916         Perl_re_printf( aTHX_
3917             "%s%s REx%s %s against %s\n",
3918                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3919
3920         if (utf8_target||utf8_pat)
3921             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
3922                 utf8_pat ? "pattern" : "",
3923                 utf8_pat && utf8_target ? " and " : "",
3924                 utf8_target ? "string" : ""
3925             );
3926     }
3927 }
3928
3929 STATIC void
3930 S_dump_exec_pos(pTHX_ const char *locinput,
3931                       const regnode *scan,
3932                       const char *loc_regeol,
3933                       const char *loc_bostr,
3934                       const char *loc_reg_starttry,
3935                       const bool utf8_target,
3936                       const U32 depth
3937                 )
3938 {
3939     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3940     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3941     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3942     /* The part of the string before starttry has one color
3943        (pref0_len chars), between starttry and current
3944        position another one (pref_len - pref0_len chars),
3945        after the current position the third one.
3946        We assume that pref0_len <= pref_len, otherwise we
3947        decrease pref0_len.  */
3948     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3949         ? (5 + taill) - l : locinput - loc_bostr;
3950     int pref0_len;
3951
3952     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3953
3954     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3955         pref_len++;
3956     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3957     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3958         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3959               ? (5 + taill) - pref_len : loc_regeol - locinput);
3960     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3961         l--;
3962     if (pref0_len < 0)
3963         pref0_len = 0;
3964     if (pref0_len > pref_len)
3965         pref0_len = pref_len;
3966     {
3967         const int is_uni = utf8_target ? 1 : 0;
3968
3969         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3970             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
3971
3972         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3973                     (locinput - pref_len + pref0_len),
3974                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
3975
3976         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3977                     locinput, loc_regeol - locinput, 10, 0, 1);
3978
3979         const STRLEN tlen=len0+len1+len2;
3980         Perl_re_printf( aTHX_
3981                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
3982                     (IV)(locinput - loc_bostr),
3983                     len0, s0,
3984                     len1, s1,
3985                     (docolor ? "" : "> <"),
3986                     len2, s2,
3987                     (int)(tlen > 19 ? 0 :  19 - tlen),
3988                     "",
3989                     depth);
3990     }
3991 }
3992
3993 #endif
3994
3995 /* reg_check_named_buff_matched()
3996  * Checks to see if a named buffer has matched. The data array of
3997  * buffer numbers corresponding to the buffer is expected to reside
3998  * in the regexp->data->data array in the slot stored in the ARG() of
3999  * node involved. Note that this routine doesn't actually care about the
4000  * name, that information is not preserved from compilation to execution.
4001  * Returns the index of the leftmost defined buffer with the given name
4002  * or 0 if non of the buffers matched.
4003  */
4004 STATIC I32
4005 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4006 {
4007     I32 n;
4008     RXi_GET_DECL(rex,rexi);
4009     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4010     I32 *nums=(I32*)SvPVX(sv_dat);
4011
4012     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4013
4014     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4015         if ((I32)rex->lastparen >= nums[n] &&
4016             rex->offs[nums[n]].end != -1)
4017         {
4018             return nums[n];
4019         }
4020     }
4021     return 0;
4022 }
4023
4024
4025 static bool
4026 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4027         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4028 {
4029     /* This function determines if there are one or two characters that match
4030      * the first character of the passed-in EXACTish node <text_node>, and if
4031      * so, returns them in the passed-in pointers.
4032      *
4033      * If it determines that no possible character in the target string can
4034      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4035      * the first character in <text_node> requires UTF-8 to represent, and the
4036      * target string isn't in UTF-8.)
4037      *
4038      * If there are more than two characters that could match the beginning of
4039      * <text_node>, or if more context is required to determine a match or not,
4040      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4041      *
4042      * The motiviation behind this function is to allow the caller to set up
4043      * tight loops for matching.  If <text_node> is of type EXACT, there is
4044      * only one possible character that can match its first character, and so
4045      * the situation is quite simple.  But things get much more complicated if
4046      * folding is involved.  It may be that the first character of an EXACTFish
4047      * node doesn't participate in any possible fold, e.g., punctuation, so it
4048      * can be matched only by itself.  The vast majority of characters that are
4049      * in folds match just two things, their lower and upper-case equivalents.
4050      * But not all are like that; some have multiple possible matches, or match
4051      * sequences of more than one character.  This function sorts all that out.
4052      *
4053      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4054      * loop of trying to match A*, we know we can't exit where the thing
4055      * following it isn't a B.  And something can't be a B unless it is the
4056      * beginning of B.  By putting a quick test for that beginning in a tight
4057      * loop, we can rule out things that can't possibly be B without having to
4058      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4059      * character, we can make a tight loop matching A*, using the outputs of
4060      * this function.
4061      *
4062      * If the target string to match isn't in UTF-8, and there aren't
4063      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4064      * the one or two possible octets (which are characters in this situation)
4065      * that can match.  In all cases, if there is only one character that can
4066      * match, *<c1p> and *<c2p> will be identical.
4067      *
4068      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4069      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4070      * can match the beginning of <text_node>.  They should be declared with at
4071      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4072      * undefined what these contain.)  If one or both of the buffers are
4073      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4074      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4075      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4076      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4077      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4078
4079     const bool utf8_target = reginfo->is_utf8_target;
4080
4081     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4082     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4083     bool use_chrtest_void = FALSE;
4084     const bool is_utf8_pat = reginfo->is_utf8_pat;
4085
4086     /* Used when we have both utf8 input and utf8 output, to avoid converting
4087      * to/from code points */
4088     bool utf8_has_been_setup = FALSE;
4089
4090     dVAR;
4091
4092     U8 *pat = (U8*)STRING(text_node);
4093     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4094
4095     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4096
4097         /* In an exact node, only one thing can be matched, that first
4098          * character.  If both the pat and the target are UTF-8, we can just
4099          * copy the input to the output, avoiding finding the code point of
4100          * that character */
4101         if (!is_utf8_pat) {
4102             c2 = c1 = *pat;
4103         }
4104         else if (utf8_target) {
4105             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4106             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4107             utf8_has_been_setup = TRUE;
4108         }
4109         else {
4110             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4111         }
4112     }
4113     else { /* an EXACTFish node */
4114         U8 *pat_end = pat + STR_LEN(text_node);
4115
4116         /* An EXACTFL node has at least some characters unfolded, because what
4117          * they match is not known until now.  So, now is the time to fold
4118          * the first few of them, as many as are needed to determine 'c1' and
4119          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4120          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4121          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4122          * need to fold as many characters as a single character can fold to,
4123          * so that later we can check if the first ones are such a multi-char
4124          * fold.  But, in such a pattern only locale-problematic characters
4125          * aren't folded, so we can skip this completely if the first character
4126          * in the node isn't one of the tricky ones */
4127         if (OP(text_node) == EXACTFL) {
4128
4129             if (! is_utf8_pat) {
4130                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4131                 {
4132                     folded[0] = folded[1] = 's';
4133                     pat = folded;
4134                     pat_end = folded + 2;
4135                 }
4136             }
4137             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4138                 U8 *s = pat;
4139                 U8 *d = folded;
4140                 int i;
4141
4142                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4143                     if (isASCII(*s)) {
4144                         *(d++) = (U8) toFOLD_LC(*s);
4145                         s++;
4146                     }
4147                     else {
4148                         STRLEN len;
4149                         _toFOLD_utf8_flags(s,
4150                                            pat_end,
4151                                            d,
4152                                            &len,
4153                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4154                         d += len;
4155                         s += UTF8SKIP(s);
4156                     }
4157                 }
4158
4159                 pat = folded;
4160                 pat_end = d;
4161             }
4162         }
4163
4164         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4165              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4166         {
4167             /* Multi-character folds require more context to sort out.  Also
4168              * PL_utf8_foldclosures used below doesn't handle them, so have to
4169              * be handled outside this routine */
4170             use_chrtest_void = TRUE;
4171         }
4172         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4173             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4174             if (c1 > 255) {
4175                 /* Load the folds hash, if not already done */
4176                 SV** listp;
4177                 if (! PL_utf8_foldclosures) {
4178                     _load_PL_utf8_foldclosures();
4179                 }
4180
4181                 /* The fold closures data structure is a hash with the keys
4182                  * being the UTF-8 of every character that is folded to, like
4183                  * 'k', and the values each an array of all code points that
4184                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4185                  * Multi-character folds are not included */
4186                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4187                                         (char *) pat,
4188                                         UTF8SKIP(pat),
4189                                         FALSE))))
4190                 {
4191                     /* Not found in the hash, therefore there are no folds
4192                     * containing it, so there is only a single character that
4193                     * could match */
4194                     c2 = c1;
4195                 }
4196                 else {  /* Does participate in folds */
4197                     AV* list = (AV*) *listp;
4198                     if (av_tindex_skip_len_mg(list) != 1) {
4199
4200                         /* If there aren't exactly two folds to this, it is
4201                          * outside the scope of this function */
4202                         use_chrtest_void = TRUE;
4203                     }
4204                     else {  /* There are two.  Get them */
4205                         SV** c_p = av_fetch(list, 0, FALSE);
4206                         if (c_p == NULL) {
4207                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4208                         }
4209                         c1 = SvUV(*c_p);
4210
4211                         c_p = av_fetch(list, 1, FALSE);
4212                         if (c_p == NULL) {
4213                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4214                         }
4215                         c2 = SvUV(*c_p);
4216
4217                         /* Folds that cross the 255/256 boundary are forbidden
4218                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4219                          * one is ASCIII.  Since the pattern character is above
4220                          * 255, and its only other match is below 256, the only
4221                          * legal match will be to itself.  We have thrown away
4222                          * the original, so have to compute which is the one
4223                          * above 255. */
4224                         if ((c1 < 256) != (c2 < 256)) {
4225                             if ((OP(text_node) == EXACTFL
4226                                  && ! IN_UTF8_CTYPE_LOCALE)
4227                                 || ((OP(text_node) == EXACTFA
4228                                     || OP(text_node) == EXACTFA_NO_TRIE)
4229                                     && (isASCII(c1) || isASCII(c2))))
4230                             {
4231                                 if (c1 < 256) {
4232                                     c1 = c2;
4233                                 }
4234                                 else {
4235                                     c2 = c1;
4236                                 }
4237                             }
4238                         }
4239                     }
4240                 }
4241             }
4242             else /* Here, c1 is <= 255 */
4243                 if (utf8_target
4244                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4245                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4246                     && ((OP(text_node) != EXACTFA
4247                         && OP(text_node) != EXACTFA_NO_TRIE)
4248                         || ! isASCII(c1)))
4249             {
4250                 /* Here, there could be something above Latin1 in the target
4251                  * which folds to this character in the pattern.  All such
4252                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4253                  * than two characters involved in their folds, so are outside
4254                  * the scope of this function */
4255                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4256                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4257                 }
4258                 else {
4259                     use_chrtest_void = TRUE;
4260                 }
4261             }
4262             else { /* Here nothing above Latin1 can fold to the pattern
4263                       character */
4264                 switch (OP(text_node)) {
4265
4266                     case EXACTFL:   /* /l rules */
4267                         c2 = PL_fold_locale[c1];
4268                         break;
4269
4270                     case EXACTF:   /* This node only generated for non-utf8
4271                                     patterns */
4272                         assert(! is_utf8_pat);
4273                         if (! utf8_target) {    /* /d rules */
4274                             c2 = PL_fold[c1];
4275                             break;
4276                         }
4277                         /* FALLTHROUGH */
4278                         /* /u rules for all these.  This happens to work for
4279                         * EXACTFA as nothing in Latin1 folds to ASCII */
4280                     case EXACTFA_NO_TRIE:   /* This node only generated for
4281                                             non-utf8 patterns */
4282                         assert(! is_utf8_pat);
4283                         /* FALLTHROUGH */
4284                     case EXACTFA:
4285                     case EXACTFU_SS:
4286                     case EXACTFU:
4287                         c2 = PL_fold_latin1[c1];
4288                         break;
4289
4290                     default:
4291                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4292                         NOT_REACHED; /* NOTREACHED */
4293                 }
4294             }
4295         }
4296     }
4297
4298     /* Here have figured things out.  Set up the returns */
4299     if (use_chrtest_void) {
4300         *c2p = *c1p = CHRTEST_VOID;
4301     }
4302     else if (utf8_target) {
4303         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4304             uvchr_to_utf8(c1_utf8, c1);
4305             uvchr_to_utf8(c2_utf8, c2);
4306         }
4307
4308         /* Invariants are stored in both the utf8 and byte outputs; Use
4309          * negative numbers otherwise for the byte ones.  Make sure that the
4310          * byte ones are the same iff the utf8 ones are the same */
4311         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4312         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4313                 ? *c2_utf8
4314                 : (c1 == c2)
4315                   ? CHRTEST_NOT_A_CP_1
4316                   : CHRTEST_NOT_A_CP_2;
4317     }
4318     else if (c1 > 255) {
4319        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4320                            can represent */
4321            return FALSE;
4322        }
4323
4324        *c1p = *c2p = c2;    /* c2 is the only representable value */
4325     }
4326     else {  /* c1 is representable; see about c2 */
4327        *c1p = c1;
4328        *c2p = (c2 < 256) ? c2 : c1;
4329     }
4330
4331     return TRUE;
4332 }
4333
4334 STATIC bool
4335 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4336 {
4337     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4338      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4339
4340     PERL_ARGS_ASSERT_ISGCB;
4341
4342     switch (GCB_table[before][after]) {
4343         case GCB_BREAKABLE:
4344             return TRUE;
4345
4346         case GCB_NOBREAK:
4347             return FALSE;
4348
4349         case GCB_RI_then_RI:
4350             {
4351                 int RI_count = 1;
4352                 U8 * temp_pos = (U8 *) curpos;
4353
4354                 /* Do not break within emoji flag sequences. That is, do not
4355                  * break between regional indicator (RI) symbols if there is an
4356                  * odd number of RI characters before the break point.
4357                  *  GB12   sot (RI RI)* RI × RI
4358                  *  GB13 [^RI] (RI RI)* RI × RI */
4359
4360                 while (backup_one_GCB(strbeg,
4361                                     &temp_pos,
4362                                     utf8_target) == GCB_Regional_Indicator)
4363                 {
4364                     RI_count++;
4365                 }
4366
4367                 return RI_count % 2 != 1;
4368             }
4369
4370         case GCB_EX_then_EM:
4371
4372             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4373             {
4374                 U8 * temp_pos = (U8 *) curpos;
4375                 GCB_enum prev;
4376
4377                 do {
4378                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4379                 }
4380                 while (prev == GCB_Extend);
4381
4382                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4383             }
4384
4385         default:
4386             break;
4387     }
4388
4389 #ifdef DEBUGGING
4390     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4391                                   before, after, GCB_table[before][after]);
4392     assert(0);
4393 #endif
4394     return TRUE;
4395 }
4396
4397 STATIC GCB_enum
4398 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4399 {
4400     GCB_enum gcb;
4401
4402     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4403
4404     if (*curpos < strbeg) {
4405         return GCB_EDGE;
4406     }
4407
4408     if (utf8_target) {
4409         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4410         U8 * prev_prev_char_pos;
4411
4412         if (! prev_char_pos) {
4413             return GCB_EDGE;
4414         }
4415
4416         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4417             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4418             *curpos = prev_char_pos;
4419             prev_char_pos = prev_prev_char_pos;
4420         }
4421         else {
4422             *curpos = (U8 *) strbeg;
4423             return GCB_EDGE;
4424         }
4425     }
4426     else {
4427         if (*curpos - 2 < strbeg) {
4428             *curpos = (U8 *) strbeg;
4429             return GCB_EDGE;
4430         }
4431         (*curpos)--;
4432         gcb = getGCB_VAL_CP(*(*curpos - 1));
4433     }
4434
4435     return gcb;
4436 }
4437
4438 /* Combining marks attach to most classes that precede them, but this defines
4439  * the exceptions (from TR14) */
4440 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4441                                      || prev == LB_Mandatory_Break      \
4442                                      || prev == LB_Carriage_Return      \
4443                                      || prev == LB_Line_Feed            \
4444                                      || prev == LB_Next_Line            \
4445                                      || prev == LB_Space                \
4446                                      || prev == LB_ZWSpace))
4447
4448 STATIC bool
4449 S_isLB(pTHX_ LB_enum before,
4450              LB_enum after,
4451              const U8 * const strbeg,
4452              const U8 * const curpos,
4453              const U8 * const strend,
4454              const bool utf8_target)
4455 {
4456     U8 * temp_pos = (U8 *) curpos;
4457     LB_enum prev = before;
4458
4459     /* Is the boundary between 'before' and 'after' line-breakable?
4460      * Most of this is just a table lookup of a generated table from Unicode
4461      * rules.  But some rules require context to decide, and so have to be
4462      * implemented in code */
4463
4464     PERL_ARGS_ASSERT_ISLB;
4465
4466     /* Rule numbers in the comments below are as of Unicode 9.0 */
4467
4468   redo:
4469     before = prev;
4470     switch (LB_table[before][after]) {
4471         case LB_BREAKABLE:
4472             return TRUE;
4473
4474         case LB_NOBREAK:
4475         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4476             return FALSE;
4477
4478         case LB_SP_foo + LB_BREAKABLE:
4479         case LB_SP_foo + LB_NOBREAK:
4480         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4481
4482             /* When we have something following a SP, we have to look at the
4483              * context in order to know what to do.
4484              *
4485              * SP SP should not reach here because LB7: Do not break before
4486              * spaces.  (For two spaces in a row there is nothing that
4487              * overrides that) */
4488             assert(after != LB_Space);
4489
4490             /* Here we have a space followed by a non-space.  Mostly this is a
4491              * case of LB18: "Break after spaces".  But there are complications
4492              * as the handling of spaces is somewhat tricky.  They are in a
4493              * number of rules, which have to be applied in priority order, but
4494              * something earlier in the string can cause a rule to be skipped
4495              * and a lower priority rule invoked.  A prime example is LB7 which
4496              * says don't break before a space.  But rule LB8 (lower priority)
4497              * says that the first break opportunity after a ZW is after any
4498              * span of spaces immediately after it.  If a ZW comes before a SP
4499              * in the input, rule LB8 applies, and not LB7.  Other such rules
4500              * involve combining marks which are rules 9 and 10, but they may
4501              * override higher priority rules if they come earlier in the
4502              * string.  Since we're doing random access into the middle of the
4503              * string, we have to look for rules that should get applied based
4504              * on both string position and priority.  Combining marks do not
4505              * attach to either ZW nor SP, so we don't have to consider them
4506              * until later.
4507              *
4508              * To check for LB8, we have to find the first non-space character
4509              * before this span of spaces */
4510             do {
4511                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4512             }
4513             while (prev == LB_Space);
4514
4515             /* LB8 Break before any character following a zero-width space,
4516              * even if one or more spaces intervene.
4517              *      ZW SP* ÷
4518              * So if we have a ZW just before this span, and to get here this
4519              * is the final space in the span. */
4520             if (prev == LB_ZWSpace) {
4521                 return TRUE;
4522             }
4523
4524             /* Here, not ZW SP+.  There are several rules that have higher
4525              * priority than LB18 and can be resolved now, as they don't depend
4526              * on anything earlier in the string (except ZW, which we have
4527              * already handled).  One of these rules is LB11 Do not break
4528              * before Word joiner, but we have specially encoded that in the
4529              * lookup table so it is caught by the single test below which
4530              * catches the other ones. */
4531             if (LB_table[LB_Space][after] - LB_SP_foo
4532                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4533             {
4534                 return FALSE;
4535             }
4536
4537             /* If we get here, we have to XXX consider combining marks. */
4538             if (prev == LB_Combining_Mark) {
4539
4540                 /* What happens with these depends on the character they
4541                  * follow.  */
4542                 do {
4543                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4544                 }
4545                 while (prev == LB_Combining_Mark);
4546
4547                 /* Most times these attach to and inherit the characteristics
4548                  * of that character, but not always, and when not, they are to
4549                  * be treated as AL by rule LB10. */
4550                 if (! LB_CM_ATTACHES_TO(prev)) {
4551                     prev = LB_Alphabetic;
4552                 }
4553             }
4554
4555             /* Here, we have the character preceding the span of spaces all set
4556              * up.  We follow LB18: "Break after spaces" unless the table shows
4557              * that is overriden */
4558             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4559
4560         case LB_CM_ZWJ_foo:
4561
4562             /* We don't know how to treat the CM except by looking at the first
4563              * non-CM character preceding it.  ZWJ is treated as CM */
4564             do {
4565                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4566             }
4567             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4568
4569             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4570              * attatches to it, then it inherits the behavior of 'prev'.  If it
4571              * doesn't attach, it is to be treated as an AL */
4572             if (! LB_CM_ATTACHES_TO(prev)) {
4573                 prev = LB_Alphabetic;
4574             }
4575
4576             goto redo;
4577
4578         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4579         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4580
4581             /* LB21a Don't break after Hebrew + Hyphen.
4582              * HL (HY | BA) × */
4583
4584             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4585                                                           == LB_Hebrew_Letter)
4586             {
4587                 return FALSE;
4588             }
4589
4590             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4591
4592         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4593         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4594
4595             /* LB25a (PR | PO) × ( OP | HY )? NU */
4596             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4597                 return FALSE;
4598             }
4599
4600             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4601                                                                 == LB_BREAKABLE;
4602
4603         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4604         case LB_SY_or_IS_then_various + LB_NOBREAK:
4605         {
4606             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4607
4608             LB_enum temp = prev;
4609             do {
4610                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4611             }
4612             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4613             if (temp == LB_Numeric) {
4614                 return FALSE;
4615             }
4616
4617             return LB_table[prev][after] - LB_SY_or_IS_then_various
4618                                                                == LB_BREAKABLE;
4619         }
4620
4621         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4622         case LB_various_then_PO_or_PR + LB_NOBREAK:
4623         {
4624             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4625
4626             LB_enum temp = prev;
4627             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4628             {
4629                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4630             }
4631             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4632                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4633             }
4634             if (temp == LB_Numeric) {
4635                 return FALSE;
4636             }
4637             return LB_various_then_PO_or_PR;
4638         }
4639
4640         case LB_RI_then_RI + LB_NOBREAK:
4641         case LB_RI_then_RI + LB_BREAKABLE:
4642             {
4643                 int RI_count = 1;
4644
4645                 /* LB30a Break between two regional indicator symbols if and
4646                  * only if there are an even number of regional indicators
4647                  * preceding the position of the break.
4648                  *
4649                  *    sot (RI RI)* RI × RI
4650                  *  [^RI] (RI RI)* RI × RI */
4651
4652                 while (backup_one_LB(strbeg,
4653                                      &temp_pos,
4654                                      utf8_target) == LB_Regional_Indicator)
4655                 {
4656                     RI_count++;
4657                 }
4658
4659                 return RI_count % 2 == 0;
4660             }
4661
4662         default:
4663             break;
4664     }
4665
4666 #ifdef DEBUGGING
4667     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4668                                   before, after, LB_table[before][after]);
4669     assert(0);
4670 #endif
4671     return TRUE;
4672 }
4673
4674 STATIC LB_enum
4675 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4676 {
4677     LB_enum lb;
4678
4679     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4680
4681     if (*curpos >= strend) {
4682         return LB_EDGE;
4683     }
4684
4685     if (utf8_target) {
4686         *curpos += UTF8SKIP(*curpos);
4687         if (*curpos >= strend) {
4688             return LB_EDGE;
4689         }
4690         lb = getLB_VAL_UTF8(*curpos, strend);
4691     }
4692     else {
4693         (*curpos)++;
4694         if (*curpos >= strend) {
4695             return LB_EDGE;
4696         }
4697         lb = getLB_VAL_CP(**curpos);
4698     }
4699
4700     return lb;
4701 }
4702
4703 STATIC LB_enum
4704 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4705 {
4706     LB_enum lb;
4707
4708     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4709
4710     if (*curpos < strbeg) {
4711         return LB_EDGE;
4712     }
4713
4714     if (utf8_target) {
4715         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4716         U8 * prev_prev_char_pos;
4717
4718         if (! prev_char_pos) {
4719             return LB_EDGE;
4720         }
4721
4722         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4723             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4724             *curpos = prev_char_pos;
4725             prev_char_pos = prev_prev_char_pos;
4726         }
4727         else {
4728             *curpos = (U8 *) strbeg;
4729             return LB_EDGE;
4730         }
4731     }
4732     else {
4733         if (*curpos - 2 < strbeg) {
4734             *curpos = (U8 *) strbeg;
4735             return LB_EDGE;
4736         }
4737         (*curpos)--;
4738         lb = getLB_VAL_CP(*(*curpos - 1));
4739     }
4740
4741     return lb;
4742 }
4743
4744 STATIC bool
4745 S_isSB(pTHX_ SB_enum before,
4746              SB_enum after,
4747              const U8 * const strbeg,
4748              const U8 * const curpos,
4749              const U8 * const strend,
4750              const bool utf8_target)
4751 {
4752     /* returns a boolean indicating if there is a Sentence Boundary Break
4753      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4754
4755     U8 * lpos = (U8 *) curpos;
4756     bool has_para_sep = FALSE;
4757     bool has_sp = FALSE;
4758
4759     PERL_ARGS_ASSERT_ISSB;
4760
4761     /* Break at the start and end of text.
4762         SB1.  sot  ÷
4763         SB2.  ÷  eot
4764       But unstated in Unicode is don't break if the text is empty */
4765     if (before == SB_EDGE || after == SB_EDGE) {
4766         return before != after;
4767     }
4768
4769     /* SB 3: Do not break within CRLF. */
4770     if (before == SB_CR && after == SB_LF) {
4771         return FALSE;
4772     }
4773
4774     /* Break after paragraph separators.  CR and LF are considered
4775      * so because Unicode views text as like word processing text where there
4776      * are no newlines except between paragraphs, and the word processor takes
4777      * care of wrapping without there being hard line-breaks in the text *./
4778        SB4.  Sep | CR | LF  ÷ */
4779     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4780         return TRUE;
4781     }
4782
4783     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4784      * (See Section 6.2, Replacing Ignore Rules.)
4785         SB5.  X (Extend | Format)*  →  X */
4786     if (after == SB_Extend || after == SB_Format) {
4787
4788         /* Implied is that the these characters attach to everything
4789          * immediately prior to them except for those separator-type
4790          * characters.  And the rules earlier have already handled the case
4791          * when one of those immediately precedes the extend char */
4792         return FALSE;
4793     }
4794
4795     if (before == SB_Extend || before == SB_Format) {
4796         U8 * temp_pos = lpos;
4797         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4798         if (   backup != SB_EDGE
4799             && backup != SB_Sep
4800             && backup != SB_CR
4801             && backup != SB_LF)
4802         {
4803             before = backup;
4804             lpos = temp_pos;
4805         }
4806
4807         /* Here, both 'before' and 'backup' are these types; implied is that we
4808          * don't break between them */
4809         if (backup == SB_Extend || backup == SB_Format) {
4810             return FALSE;
4811         }
4812     }
4813
4814     /* Do not break after ambiguous terminators like period, if they are
4815      * immediately followed by a number or lowercase letter, if they are
4816      * between uppercase letters, if the first following letter (optionally
4817      * after certain punctuation) is lowercase, or if they are followed by
4818      * "continuation" punctuation such as comma, colon, or semicolon. For
4819      * example, a period may be an abbreviation or numeric period, and thus may
4820      * not mark the end of a sentence.
4821
4822      * SB6. ATerm  ×  Numeric */
4823     if (before == SB_ATerm && after == SB_Numeric) {
4824         return FALSE;
4825     }
4826
4827     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4828     if (before == SB_ATerm && after == SB_Upper) {
4829         U8 * temp_pos = lpos;
4830         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4831         if (backup == SB_Upper || backup == SB_Lower) {
4832             return FALSE;
4833         }
4834     }
4835
4836     /* The remaining rules that aren't the final one, all require an STerm or
4837      * an ATerm after having backed up over some Close* Sp*, and in one case an
4838      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4839      * So do that backup now, setting flags if either Sp or a paragraph
4840      * separator are found */
4841
4842     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4843         has_para_sep = TRUE;
4844         before = backup_one_SB(strbeg, &lpos, utf8_target);
4845     }
4846
4847     if (before == SB_Sp) {
4848         has_sp = TRUE;
4849         do {
4850             before = backup_one_SB(strbeg, &lpos, utf8_target);
4851         }
4852         while (before == SB_Sp);
4853     }
4854
4855     while (before == SB_Close) {
4856         before = backup_one_SB(strbeg, &lpos, utf8_target);
4857     }
4858
4859     /* The next few rules apply only when the backed-up-to is an ATerm, and in
4860      * most cases an STerm */
4861     if (before == SB_STerm || before == SB_ATerm) {
4862
4863         /* So, here the lhs matches
4864          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
4865          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
4866          * The rules that apply here are:
4867          *
4868          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
4869                                            | LF | STerm | ATerm) )* Lower
4870            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4871            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
4872            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
4873            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
4874          */
4875
4876         /* And all but SB11 forbid having seen a paragraph separator */
4877         if (! has_para_sep) {
4878             if (before == SB_ATerm) {          /* SB8 */
4879                 U8 * rpos = (U8 *) curpos;
4880                 SB_enum later = after;
4881
4882                 while (    later != SB_OLetter
4883                         && later != SB_Upper
4884                         && later != SB_Lower
4885                         && later != SB_Sep
4886                         && later != SB_CR
4887                         && later != SB_LF
4888                         && later != SB_STerm
4889                         && later != SB_ATerm
4890                         && later != SB_EDGE)
4891                 {
4892                     later = advance_one_SB(&rpos, strend, utf8_target);
4893                 }
4894                 if (later == SB_Lower) {
4895                     return FALSE;
4896                 }
4897             }
4898
4899             if (   after == SB_SContinue    /* SB8a */
4900                 || after == SB_STerm
4901                 || after == SB_ATerm)
4902             {
4903                 return FALSE;
4904             }
4905
4906             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
4907                 if (   after == SB_Close
4908                     || after == SB_Sp
4909                     || after == SB_Sep
4910                     || after == SB_CR
4911                     || after == SB_LF)
4912                 {
4913                     return FALSE;
4914                 }
4915             }
4916
4917             /* SB10.  This and SB9 could probably be combined some way, but khw
4918              * has decided to follow the Unicode rule book precisely for
4919              * simplified maintenance */
4920             if (   after == SB_Sp
4921                 || after == SB_Sep
4922                 || after == SB_CR
4923                 || after == SB_LF)
4924             {
4925                 return FALSE;
4926             }
4927         }
4928
4929         /* SB11.  */
4930         return TRUE;
4931     }
4932
4933     /* Otherwise, do not break.
4934     SB12.  Any  ×  Any */
4935
4936     return FALSE;
4937 }
4938
4939 STATIC SB_enum
4940 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4941 {
4942     SB_enum sb;
4943
4944     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4945
4946     if (*curpos >= strend) {
4947         return SB_EDGE;
4948     }
4949
4950     if (utf8_target) {
4951         do {
4952             *curpos += UTF8SKIP(*curpos);
4953             if (*curpos >= strend) {
4954                 return SB_EDGE;
4955             }
4956             sb = getSB_VAL_UTF8(*curpos, strend);
4957         } while (sb == SB_Extend || sb == SB_Format);
4958     }
4959     else {
4960         do {
4961             (*curpos)++;
4962             if (*curpos >= strend) {
4963                 return SB_EDGE;
4964             }
4965             sb = getSB_VAL_CP(**curpos);
4966         } while (sb == SB_Extend || sb == SB_Format);
4967     }
4968
4969     return sb;
4970 }
4971
4972 STATIC SB_enum
4973 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4974 {
4975     SB_enum sb;
4976
4977     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4978
4979     if (*curpos < strbeg) {
4980         return SB_EDGE;
4981     }
4982
4983     if (utf8_target) {
4984         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4985         if (! prev_char_pos) {
4986             return SB_EDGE;
4987         }
4988
4989         /* Back up over Extend and Format.  curpos is always just to the right
4990          * of the characater whose value we are getting */
4991         do {
4992             U8 * prev_prev_char_pos;
4993             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4994                                                                       strbeg)))
4995             {
4996                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4997                 *curpos = prev_char_pos;
4998                 prev_char_pos = prev_prev_char_pos;
4999             }
5000             else {
5001                 *curpos = (U8 *) strbeg;
5002                 return SB_EDGE;
5003             }
5004         } while (sb == SB_Extend || sb == SB_Format);
5005     }
5006     else {
5007         do {
5008             if (*curpos - 2 < strbeg) {
5009                 *curpos = (U8 *) strbeg;
5010                 return SB_EDGE;
5011             }
5012             (*curpos)--;
5013             sb = getSB_VAL_CP(*(*curpos - 1));
5014         } while (sb == SB_Extend || sb == SB_Format);
5015     }
5016
5017     return sb;
5018 }
5019
5020 STATIC bool
5021 S_isWB(pTHX_ WB_enum previous,
5022              WB_enum before,
5023              WB_enum after,
5024              const U8 * const strbeg,
5025              const U8 * const curpos,
5026              const U8 * const strend,
5027              const bool utf8_target)
5028 {
5029     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5030      *  a Unicode word break, using their published algorithm, but tailored for
5031      *  Perl by treating spans of white space as one unit.  Context may be
5032      *  needed to make this determination.  If the value for the character
5033      *  before 'before' is known, it is passed as 'previous'; otherwise that
5034      *  should be set to WB_UNKNOWN.  The other input parameters give the
5035      *  boundaries and current position in the matching of the string.  That
5036      *  is, 'curpos' marks the position where the character whose wb value is
5037      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5038
5039     U8 * before_pos = (U8 *) curpos;
5040     U8 * after_pos = (U8 *) curpos;
5041     WB_enum prev = before;
5042     WB_enum next;
5043
5044     PERL_ARGS_ASSERT_ISWB;
5045
5046     /* Rule numbers in the comments below are as of Unicode 9.0 */
5047
5048   redo:
5049     before = prev;
5050     switch (WB_table[before][after]) {
5051         case WB_BREAKABLE:
5052             return TRUE;
5053
5054         case WB_NOBREAK:
5055             return FALSE;
5056
5057         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5058             next = advance_one_WB(&after_pos, strend, utf8_target,
5059                                  FALSE /* Don't skip Extend nor Format */ );
5060             /* A space immediately preceeding an Extend or Format is attached
5061              * to by them, and hence gets separated from previous spaces.
5062              * Otherwise don't break between horizontal white space */
5063             return next == WB_Extend || next == WB_Format;
5064
5065         /* WB4 Ignore Format and Extend characters, except when they appear at
5066          * the beginning of a region of text.  This code currently isn't
5067          * general purpose, but it works as the rules are currently and likely
5068          * to be laid out.  The reason it works is that when 'they appear at
5069          * the beginning of a region of text', the rule is to break before
5070          * them, just like any other character.  Therefore, the default rule
5071          * applies and we don't have to look in more depth.  Should this ever
5072          * change, we would have to have 2 'case' statements, like in the rules
5073          * below, and backup a single character (not spacing over the extend
5074          * ones) and then see if that is one of the region-end characters and
5075          * go from there */
5076         case WB_Ex_or_FO_or_ZWJ_then_foo:
5077             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5078             goto redo;
5079
5080         case WB_DQ_then_HL + WB_BREAKABLE:
5081         case WB_DQ_then_HL + WB_NOBREAK:
5082
5083             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5084
5085             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5086                                                             == WB_Hebrew_Letter)
5087             {
5088                 return FALSE;
5089             }
5090
5091              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5092
5093         case WB_HL_then_DQ + WB_BREAKABLE:
5094         case WB_HL_then_DQ + WB_NOBREAK:
5095
5096             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5097
5098             if (advance_one_WB(&after_pos, strend, utf8_target,
5099                                        TRUE /* Do skip Extend and Format */ )
5100                                                             == WB_Hebrew_Letter)
5101             {
5102                 return FALSE;
5103             }
5104
5105             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5106
5107         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5108         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5109
5110             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5111              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5112
5113             next = advance_one_WB(&after_pos, strend, utf8_target,
5114                                        TRUE /* Do skip Extend and Format */ );
5115
5116             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5117             {
5118                 return FALSE;
5119             }
5120
5121             return WB_table[before][after]
5122                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5123
5124         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5125         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5126
5127             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5128              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5129
5130             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5131             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5132             {
5133                 return FALSE;
5134             }
5135
5136             return WB_table[before][after]
5137                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5138
5139         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5140         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5141
5142             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5143              * */
5144
5145             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5146                                                             == WB_Numeric)
5147             {
5148                 return FALSE;
5149             }
5150
5151             return WB_table[before][after]
5152                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5153
5154         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5155         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5156
5157             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5158
5159             if (advance_one_WB(&after_pos, strend, utf8_target,
5160                                        TRUE /* Do skip Extend and Format */ )
5161                                                             == WB_Numeric)
5162             {
5163                 return FALSE;
5164             }
5165
5166             return WB_table[before][after]
5167                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5168
5169         case WB_RI_then_RI + WB_NOBREAK:
5170         case WB_RI_then_RI + WB_BREAKABLE:
5171             {
5172                 int RI_count = 1;
5173
5174                 /* Do not break within emoji flag sequences. That is, do not
5175                  * break between regional indicator (RI) symbols if there is an
5176                  * odd number of RI characters before the potential break
5177                  * point.
5178                  *
5179                  * WB15   sot (RI RI)* RI × RI
5180                  * WB16 [^RI] (RI RI)* RI × RI */
5181
5182                 while (backup_one_WB(&previous,
5183                                      strbeg,
5184                                      &before_pos,
5185                                      utf8_target) == WB_Regional_Indicator)
5186                 {
5187                     RI_count++;
5188                 }
5189
5190                 return RI_count % 2 != 1;
5191             }
5192
5193         default:
5194             break;
5195     }
5196
5197 #ifdef DEBUGGING
5198     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5199                                   before, after, WB_table[before][after]);
5200     assert(0);
5201 #endif
5202     return TRUE;
5203 }
5204
5205 STATIC WB_enum
5206 S_advance_one_WB(pTHX_ U8 ** curpos,
5207                        const U8 * const strend,
5208                        const bool utf8_target,
5209                        const bool skip_Extend_Format)
5210 {
5211     WB_enum wb;
5212
5213     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5214
5215     if (*curpos >= strend) {
5216         return WB_EDGE;
5217     }
5218
5219     if (utf8_target) {
5220
5221         /* Advance over Extend and Format */
5222         do {
5223             *curpos += UTF8SKIP(*curpos);
5224             if (*curpos >= strend) {
5225                 return WB_EDGE;
5226             }
5227             wb = getWB_VAL_UTF8(*curpos, strend);
5228         } while (    skip_Extend_Format
5229                  && (wb == WB_Extend || wb == WB_Format));
5230     }
5231     else {
5232         do {
5233             (*curpos)++;
5234             if (*curpos >= strend) {
5235                 return WB_EDGE;
5236             }
5237             wb = getWB_VAL_CP(**curpos);
5238         } while (    skip_Extend_Format
5239                  && (wb == WB_Extend || wb == WB_Format));
5240     }
5241
5242     return wb;
5243 }
5244
5245 STATIC WB_enum
5246 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5247 {
5248     WB_enum wb;
5249
5250     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5251
5252     /* If we know what the previous character's break value is, don't have
5253         * to look it up */
5254     if (*previous != WB_UNKNOWN) {
5255         wb = *previous;
5256
5257         /* But we need to move backwards by one */
5258         if (utf8_target) {
5259             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5260             if (! *curpos) {
5261                 *previous = WB_EDGE;
5262                 *curpos = (U8 *) strbeg;
5263             }
5264             else {
5265                 *previous = WB_UNKNOWN;
5266             }
5267         }
5268         else {
5269             (*curpos)--;
5270             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5271         }
5272
5273         /* And we always back up over these three types */
5274         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5275             return wb;
5276         }
5277     }
5278
5279     if (*curpos < strbeg) {
5280         return WB_EDGE;
5281     }
5282
5283     if (utf8_target) {
5284         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5285         if (! prev_char_pos) {
5286             return WB_EDGE;
5287         }
5288
5289         /* Back up over Extend and Format.  curpos is always just to the right
5290          * of the characater whose value we are getting */
5291         do {
5292             U8 * prev_prev_char_pos;
5293             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5294                                                    -1,
5295                                                    strbeg)))
5296             {
5297                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5298                 *curpos = prev_char_pos;
5299                 prev_char_pos = prev_prev_char_pos;
5300             }
5301             else {
5302                 *curpos = (U8 *) strbeg;
5303                 return WB_EDGE;
5304             }
5305         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5306     }
5307     else {
5308         do {
5309             if (*curpos - 2 < strbeg) {
5310                 *curpos = (U8 *) strbeg;
5311                 return WB_EDGE;
5312             }
5313             (*curpos)--;
5314             wb = getWB_VAL_CP(*(*curpos - 1));
5315         } while (wb == WB_Extend || wb == WB_Format);
5316     }
5317
5318     return wb;
5319 }
5320
5321 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5322 (                                                           \
5323     (   ( st )                                         ) && \
5324     (   ( st )->u.eval.close_paren                     ) && \
5325     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5326 )
5327
5328 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5329 (                                                           \
5330     (   ( st )                                         ) && \
5331     (   ( st )->u.eval.close_paren                     ) && \
5332     (   ( expr )                                       ) && \
5333     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5334 )
5335
5336
5337 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5338     (st)->u.eval.close_paren = ( (expr) + 1 )
5339
5340 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5341     (st)->u.eval.close_paren = 0
5342
5343 /* returns -1 on failure, $+[0] on success */
5344 STATIC SSize_t
5345 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5346 {
5347     dVAR;
5348     const bool utf8_target = reginfo->is_utf8_target;
5349     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5350     REGEXP *rex_sv = reginfo->prog;
5351     regexp *rex = ReANY(rex_sv);
5352     RXi_GET_DECL(rex,rexi);
5353     /* the current state. This is a cached copy of PL_regmatch_state */
5354     regmatch_state *st;
5355     /* cache heavy used fields of st in registers */
5356     regnode *scan;
5357     regnode *next;
5358     U32 n = 0;  /* general value; init to avoid compiler warning */
5359     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5360     SSize_t endref = 0; /* offset of end of backref when ln is start */
5361     char *locinput = startpos;
5362     char *pushinput; /* where to continue after a PUSH */
5363     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5364
5365     bool result = 0;        /* return value of S_regmatch */
5366     U32 depth = 0;            /* depth of backtrack stack */
5367     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5368     const U32 max_nochange_depth =
5369         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5370         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5371     regmatch_state *yes_state = NULL; /* state to pop to on success of
5372                                                             subpattern */
5373     /* mark_state piggy backs on the yes_state logic so that when we unwind
5374        the stack on success we can update the mark_state as we go */
5375     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5376     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5377     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5378     U32 state_num;
5379     bool no_final = 0;      /* prevent failure from backtracking? */
5380     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5381     char *startpoint = locinput;
5382     SV *popmark = NULL;     /* are we looking for a mark? */
5383     SV *sv_commit = NULL;   /* last mark name seen in failure */
5384     SV *sv_yes_mark = NULL; /* last mark name we have seen
5385                                during a successful match */
5386     U32 lastopen = 0;       /* last open we saw */
5387     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5388     SV* const oreplsv = GvSVn(PL_replgv);
5389     /* these three flags are set by various ops to signal information to
5390      * the very next op. They have a useful lifetime of exactly one loop
5391      * iteration, and are not preserved or restored by state pushes/pops
5392      */
5393     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5394     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5395     int logical = 0;        /* the following EVAL is:
5396                                 0: (?{...})
5397                                 1: (?(?{...})X|Y)
5398                                 2: (??{...})
5399                                or the following IFMATCH/UNLESSM is:
5400                                 false: plain (?=foo)
5401                                 true:  used as a condition: (?(?=foo))
5402                             */
5403     PAD* last_pad = NULL;
5404     dMULTICALL;
5405     U8 gimme = G_SCALAR;
5406     CV *caller_cv = NULL;       /* who called us */
5407     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5408     U32 maxopenparen = 0;       /* max '(' index seen so far */
5409     int to_complement;  /* Invert the result? */
5410     _char_class_number classnum;
5411     bool is_utf8_pat = reginfo->is_utf8_pat;
5412     bool match = FALSE;
5413     I32 orig_savestack_ix = PL_savestack_ix;
5414
5415 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5416 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5417 #  define SOLARIS_BAD_OPTIMIZER
5418     const U32 *pl_charclass_dup = PL_charclass;
5419 #  define PL_charclass pl_charclass_dup
5420 #endif
5421
5422 #ifdef DEBUGGING
5423     GET_RE_DEBUG_FLAGS_DECL;
5424 #endif
5425
5426     /* protect against undef(*^R) */
5427     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5428
5429     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5430     multicall_oldcatch = 0;
5431     PERL_UNUSED_VAR(multicall_cop);
5432
5433     PERL_ARGS_ASSERT_REGMATCH;
5434
5435     st = PL_regmatch_state;
5436
5437     /* Note that nextchr is a byte even in UTF */
5438     SET_nextchr;
5439     scan = prog;
5440
5441     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5442             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5443             Perl_re_printf( aTHX_ "regmatch start\n" );
5444     }));
5445
5446     while (scan != NULL) {
5447
5448
5449         next = scan + NEXT_OFF(scan);
5450         if (next == scan)
5451             next = NULL;
5452         state_num = OP(scan);
5453
5454       reenter_switch:
5455         DEBUG_EXECUTE_r(
5456             if (state_num <= REGNODE_MAX) {
5457                 SV * const prop = sv_newmortal();
5458                 regnode *rnext = regnext(scan);
5459
5460                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5461                 regprop(rex, prop, scan, reginfo, NULL);
5462                 Perl_re_printf( aTHX_
5463                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5464                     INDENT_CHARS(depth), "",
5465                     (IV)(scan - rexi->program),
5466                     SvPVX_const(prop),
5467                     (PL_regkind[OP(scan)] == END || !rnext) ?
5468                         0 : (IV)(rnext - rexi->program));
5469             }
5470         );
5471
5472         to_complement = 0;
5473
5474         SET_nextchr;
5475         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5476
5477         switch (state_num) {
5478         case SBOL: /*  /^../ and /\A../  */
5479             if (locinput == reginfo->strbeg)
5480                 break;
5481             sayNO;
5482
5483         case MBOL: /*  /^../m  */
5484             if (locinput == reginfo->strbeg ||
5485                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5486             {
5487                 break;
5488             }
5489             sayNO;
5490
5491         case GPOS: /*  \G  */
5492             if (locinput == reginfo->ganch)
5493                 break;
5494             sayNO;
5495
5496         case KEEPS: /*   \K  */
5497             /* update the startpoint */
5498             st->u.keeper.val = rex->offs[0].start;
5499             rex->offs[0].start = locinput - reginfo->strbeg;
5500             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5501             NOT_REACHED; /* NOTREACHED */
5502
5503         case KEEPS_next_fail:
5504             /* rollback the start point change */
5505             rex->offs[0].start = st->u.keeper.val;
5506             sayNO_SILENT;
5507             NOT_REACHED; /* NOTREACHED */
5508
5509         case MEOL: /* /..$/m  */
5510             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5511                 sayNO;
5512             break;
5513
5514         case SEOL: /* /..$/  */
5515             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5516                 sayNO;
5517             if (reginfo->strend - locinput > 1)
5518                 sayNO;
5519             break;
5520
5521         case EOS: /*  \z  */
5522             if (!NEXTCHR_IS_EOS)
5523                 sayNO;
5524             break;
5525
5526         case SANY: /*  /./s  */
5527             if (NEXTCHR_IS_EOS)
5528                 sayNO;
5529             goto increment_locinput;
5530
5531         case REG_ANY: /*  /./  */
5532             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5533                 sayNO;
5534             goto increment_locinput;
5535
5536
5537 #undef  ST
5538 #define ST st->u.trie
5539         case TRIEC: /* (ab|cd) with known charclass */
5540             /* In this case the charclass data is available inline so
5541                we can fail fast without a lot of extra overhead.
5542              */
5543             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5544                 DEBUG_EXECUTE_r(
5545                     Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5546                               depth, PL_colors[4], PL_colors[5])
5547                 );
5548                 sayNO_SILENT;
5549                 NOT_REACHED; /* NOTREACHED */
5550             }
5551             /* FALLTHROUGH */
5552         case TRIE:  /* (ab|cd)  */
5553             /* the basic plan of execution of the trie is:
5554              * At the beginning, run though all the states, and
5555              * find the longest-matching word. Also remember the position
5556              * of the shortest matching word. For example, this pattern:
5557              *    1  2 3 4    5
5558              *    ab|a|x|abcd|abc
5559              * when matched against the string "abcde", will generate
5560              * accept states for all words except 3, with the longest
5561              * matching word being 4, and the shortest being 2 (with
5562              * the position being after char 1 of the string).
5563              *
5564              * Then for each matching word, in word order (i.e. 1,2,4,5),
5565              * we run the remainder of the pattern; on each try setting
5566              * the current position to the character following the word,
5567              * returning to try the next word on failure.
5568              *
5569              * We avoid having to build a list of words at runtime by
5570              * using a compile-time structure, wordinfo[].prev, which
5571              * gives, for each word, the previous accepting word (if any).
5572              * In the case above it would contain the mappings 1->2, 2->0,
5573              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5574              * the longest word (4 above), a list of all words, by
5575              * following the list of prev pointers; this gives us the
5576              * unordered list 4,5,1,2. Then given the current word we have
5577              * just tried, we can go through the list and find the
5578              * next-biggest word to try (so if we just failed on word 2,
5579              * the next in the list is 4).
5580              *
5581              * Since at runtime we don't record the matching position in
5582              * the string for each word, we have to work that out for
5583              * each word we're about to process. The wordinfo table holds
5584              * the character length of each word; given that we recorded
5585              * at the start: the position of the shortest word and its
5586              * length in chars, we just need to move the pointer the
5587              * difference between the two char lengths. Depending on
5588              * Unicode status and folding, that's cheap or expensive.
5589              *
5590              * This algorithm is optimised for the case where are only a
5591              * small number of accept states, i.e. 0,1, or maybe 2.
5592              * With lots of accepts states, and having to try all of them,
5593              * it becomes quadratic on number of accept states to find all
5594              * the next words.
5595              */
5596
5597             {
5598                 /* what type of TRIE am I? (utf8 makes this contextual) */
5599                 DECL_TRIE_TYPE(scan);
5600
5601                 /* what trie are we using right now */
5602                 reg_trie_data * const trie
5603                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5604                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5605                 U32 state = trie->startstate;
5606
5607                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5608                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5609                     if (utf8_target
5610                         && nextchr >= 0 /* guard against negative EOS value in nextchr */
5611                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5612                         && scan->flags == EXACTL)
5613                     {
5614                         /* We only output for EXACTL, as we let the folder
5615                          * output this message for EXACTFLU8 to avoid
5616                          * duplication */
5617                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5618                                                                reginfo->strend);
5619                     }
5620                 }
5621                 if (   trie->bitmap
5622                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5623                 {
5624                     if (trie->states[ state ].wordnum) {
5625                          DEBUG_EXECUTE_r(
5626                             Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
5627                                           depth, PL_colors[4], PL_colors[5])
5628                         );
5629                         if (!trie->jump)
5630                             break;
5631                     } else {
5632                         DEBUG_EXECUTE_r(
5633                             Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5634                                           depth, PL_colors[4], PL_colors[5])
5635                         );
5636                         sayNO_SILENT;
5637                    }
5638                 }
5639
5640             {
5641                 U8 *uc = ( U8* )locinput;
5642
5643                 STRLEN len = 0;
5644                 STRLEN foldlen = 0;
5645                 U8 *uscan = (U8*)NULL;
5646                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5647                 U32 charcount = 0; /* how many input chars we have matched */
5648                 U32 accepted = 0; /* have we seen any accepting states? */
5649
5650                 ST.jump = trie->jump;
5651                 ST.me = scan;
5652                 ST.firstpos = NULL;
5653                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5654                 ST.nextword = 0;
5655
5656                 /* fully traverse the TRIE; note the position of the
5657                    shortest accept state and the wordnum of the longest
5658                    accept state */
5659
5660                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5661                     U32 base = trie->states[ state ].trans.base;
5662                     UV uvc = 0;
5663                     U16 charid = 0;
5664                     U16 wordnum;
5665                     wordnum = trie->states[ state ].wordnum;
5666
5667                     if (wordnum) { /* it's an accept state */
5668                         if (!accepted) {
5669                             accepted = 1;
5670                             /* record first match position */
5671                             if (ST.longfold) {
5672                                 ST.firstpos = (U8*)locinput;
5673                                 ST.firstchars = 0;
5674                             }
5675                             else {
5676                                 ST.firstpos = uc;
5677                                 ST.firstchars = charcount;
5678                             }
5679                         }
5680                         if (!ST.nextword || wordnum < ST.nextword)
5681                             ST.nextword = wordnum;
5682                         ST.topword = wordnum;
5683                     }
5684
5685                     DEBUG_TRIE_EXECUTE_r({
5686                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5687                                 /* HERE */
5688                                 PerlIO_printf( Perl_debug_log,
5689                                     "%*s%sState: %4" UVxf " Accepted: %c ",
5690                                     INDENT_CHARS(depth), "", PL_colors[4],
5691                                     (UV)state, (accepted ? 'Y' : 'N'));
5692                     });
5693
5694                     /* read a char and goto next state */
5695                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5696                         I32 offset;
5697                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5698                                              uscan, len, uvc, charid, foldlen,
5699                                              foldbuf, uniflags);
5700                         charcount++;
5701                         if (foldlen>0)
5702                             ST.longfold = TRUE;
5703                         if (charid &&
5704                              ( ((offset =
5705                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5706
5707                              && ((U32)offset < trie->lasttrans)
5708                              && trie->trans[offset].check == state)
5709                         {
5710                             state = trie->trans[offset].next;
5711                         }
5712                         else {
5713                             state = 0;
5714                         }
5715                         uc += len;
5716
5717                     }
5718                     else {
5719                         state = 0;
5720                     }
5721                     DEBUG_TRIE_EXECUTE_r(
5722                         Perl_re_printf( aTHX_
5723                             "Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
5724                             charid, uvc, (UV)state, PL_colors[5] );
5725                     );
5726                 }
5727                 if (!accepted)
5728                    sayNO;
5729
5730                 /* calculate total number of accept states */
5731                 {
5732                     U16 w = ST.topword;
5733                     accepted = 0;
5734                     while (w) {
5735                         w = trie->wordinfo[w].prev;
5736                         accepted++;
5737                     }
5738                     ST.accepted = accepted;
5739                 }
5740
5741                 DEBUG_EXECUTE_r(
5742                     Perl_re_exec_indentf( aTHX_  "%sgot %" IVdf " possible matches%s\n",
5743                         depth,
5744                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5745                 );
5746                 goto trie_first_try; /* jump into the fail handler */
5747             }}
5748             NOT_REACHED; /* NOTREACHED */
5749
5750         case TRIE_next_fail: /* we failed - try next alternative */
5751         {
5752             U8 *uc;
5753             if ( ST.jump ) {
5754                 /* undo any captures done in the tail part of a branch,
5755                  * e.g.
5756                  *    /(?:X(.)(.)|Y(.)).../
5757                  * where the trie just matches X then calls out to do the
5758                  * rest of the branch */
5759                 REGCP_UNWIND(ST.cp);
5760                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5761             }
5762             if (!--ST.accepted) {
5763                 DEBUG_EXECUTE_r({
5764                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
5765                         depth,
5766                         PL_colors[4],
5767                         PL_colors[5] );
5768                 });
5769                 sayNO_SILENT;
5770             }
5771             {
5772                 /* Find next-highest word to process.  Note that this code
5773                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5774                 U16 min = 0;
5775                 U16 word;
5776                 U16 const nextword = ST.nextword;
5777                 reg_trie_wordinfo * const wordinfo
5778                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5779                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5780                     if (word > nextword && (!min || word < min))
5781                         min = word;
5782                 }
5783                 ST.nextword = min;
5784             }
5785
5786           trie_first_try:
5787             if (do_cutgroup) {
5788                 do_cutgroup = 0;
5789                 no_final = 0;
5790             }
5791
5792             if ( ST.jump ) {
5793                 ST.lastparen = rex->lastparen;
5794                 ST.lastcloseparen = rex->lastcloseparen;
5795                 REGCP_SET(ST.cp);
5796             }
5797
5798             /* find start char of end of current word */
5799             {
5800                 U32 chars; /* how many chars to skip */
5801                 reg_trie_data * const trie
5802                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5803
5804                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5805                             >=  ST.firstchars);
5806                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5807                             - ST.firstchars;
5808                 uc = ST.firstpos;
5809
5810                 if (ST.longfold) {
5811                     /* the hard option - fold each char in turn and find
5812                      * its folded length (which may be different */
5813                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5814                     STRLEN foldlen;
5815                     STRLEN len;
5816                     UV uvc;
5817                     U8 *uscan;
5818
5819                     while (chars) {
5820                         if (utf8_target) {
5821                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5822                                                     uniflags);
5823                             uc += len;
5824                         }
5825                         else {
5826                             uvc = *uc;
5827                             uc++;
5828                         }
5829                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5830                         uscan = foldbuf;
5831                         while (foldlen) {
5832                             if (!--chars)
5833                                 break;
5834                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5835                                             uniflags);
5836                             uscan += len;
5837                             foldlen -= len;
5838                         }
5839                     }
5840                 }
5841                 else {
5842                     if (utf8_target)
5843                         while (chars--)
5844                             uc += UTF8SKIP(uc);
5845                     else
5846                         uc += chars;
5847                 }
5848             }
5849
5850             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5851                             ? ST.jump[ST.nextword]
5852                             : NEXT_OFF(ST.me));
5853
5854             DEBUG_EXECUTE_r({
5855                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
5856                     depth,
5857                     PL_colors[4],
5858                     ST.nextword,
5859                     PL_colors[5]
5860                     );
5861             });
5862
5863             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
5864                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5865                 NOT_REACHED; /* NOTREACHED */
5866             }
5867             /* only one choice left - just continue */
5868             DEBUG_EXECUTE_r({
5869                 AV *const trie_words
5870                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5871                 SV ** const tmp = trie_words
5872                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5873                 SV *sv= tmp ? sv_newmortal() : NULL;
5874
5875                 Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
5876                     depth, PL_colors[4],
5877                     ST.nextword,
5878                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5879                             PL_colors[0], PL_colors[1],
5880                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5881                         )
5882                     : "not compiled under -Dr",
5883                     PL_colors[5] );
5884             });
5885
5886             locinput = (char*)uc;
5887             continue; /* execute rest of RE */
5888             /* NOTREACHED */
5889         }
5890 #undef  ST
5891
5892         case EXACTL:             /*  /abc/l       */
5893             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5894
5895             /* Complete checking would involve going through every character
5896              * matched by the string to see if any is above latin1.  But the
5897              * comparision otherwise might very well be a fast assembly
5898              * language routine, and I (khw) don't think slowing things down
5899              * just to check for this warning is worth it.  So this just checks
5900              * the first character */
5901             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5902                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5903             }
5904             /* FALLTHROUGH */
5905         case EXACT: {            /*  /abc/        */
5906             char *s = STRING(scan);
5907             ln = STR_LEN(scan);
5908             if (utf8_target != is_utf8_pat) {
5909                 /* The target and the pattern have differing utf8ness. */
5910                 char *l = locinput;
5911                 const char * const e = s + ln;
5912
5913                 if (utf8_target) {
5914                     /* The target is utf8, the pattern is not utf8.
5915                      * Above-Latin1 code points can't match the pattern;
5916                      * invariants match exactly, and the other Latin1 ones need
5917                      * to be downgraded to a single byte in order to do the
5918                      * comparison.  (If we could be confident that the target
5919                      * is not malformed, this could be refactored to have fewer
5920                      * tests by just assuming that if the first bytes match, it
5921                      * is an invariant, but there are tests in the test suite
5922                      * dealing with (??{...}) which violate this) */
5923                     while (s < e) {
5924                         if (l >= reginfo->strend
5925                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5926                         {
5927                             sayNO;
5928                         }
5929                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
5930                             if (*l != *s) {
5931                                 sayNO;
5932                             }
5933                             l++;
5934                         }
5935                         else {
5936                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5937                             {
5938                                 sayNO;
5939                             }
5940                             l += 2;
5941                         }
5942                         s++;
5943                     }
5944                 }
5945                 else {
5946                     /* The target is not utf8, the pattern is utf8. */
5947                     while (s < e) {
5948                         if (l >= reginfo->strend
5949                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5950                         {
5951                             sayNO;
5952                         }
5953                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
5954                             if (*s != *l) {
5955                                 sayNO;
5956                             }
5957                             s++;
5958                         }
5959                         else {
5960                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5961                             {
5962                                 sayNO;
5963                             }
5964                             s += 2;
5965                         }
5966                         l++;
5967                     }
5968                 }
5969                 locinput = l;
5970             }
5971             else {
5972                 /* The target and the pattern have the same utf8ness. */
5973                 /* Inline the first character, for speed. */
5974                 if (reginfo->strend - locinput < ln
5975                     || UCHARAT(s) != nextchr
5976                     || (ln > 1 && memNE(s, locinput, ln)))
5977                 {
5978                     sayNO;
5979                 }
5980                 locinput += ln;
5981             }
5982             break;
5983             }
5984
5985         case EXACTFL: {          /*  /abc/il      */
5986             re_fold_t folder;
5987             const U8 * fold_array;
5988             const char * s;
5989             U32 fold_utf8_flags;
5990
5991             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5992             folder = foldEQ_locale;
5993             fold_array = PL_fold_locale;
5994             fold_utf8_flags = FOLDEQ_LOCALE;
5995             goto do_exactf;
5996
5997         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5998                                       is effectively /u; hence to match, target
5999                                       must be UTF-8. */
6000             if (! utf8_target) {
6001                 sayNO;
6002             }
6003             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6004                                              | FOLDEQ_S1_FOLDS_SANE;
6005             folder = foldEQ_latin1;
6006             fold_array = PL_fold_latin1;
6007             goto do_exactf;
6008
6009         case EXACTFU_SS:         /*  /\x{df}/iu   */
6010         case EXACTFU:            /*  /abc/iu      */
6011             folder = foldEQ_latin1;
6012             fold_array = PL_fold_latin1;
6013             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6014             goto do_exactf;
6015
6016         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
6017                                    patterns */
6018             assert(! is_utf8_pat);
6019             /* FALLTHROUGH */
6020         case EXACTFA:            /*  /abc/iaa     */
6021             folder = foldEQ_latin1;
6022             fold_array = PL_fold_latin1;
6023             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6024             goto do_exactf;
6025
6026         case EXACTF:             /*  /abc/i    This node only generated for
6027                                                non-utf8 patterns */
6028             assert(! is_utf8_pat);
6029             folder = foldEQ;
6030             fold_array = PL_fold;
6031             fold_utf8_flags = 0;
6032
6033           do_exactf:
6034             s = STRING(scan);
6035             ln = STR_LEN(scan);
6036
6037             if (utf8_target
6038                 || is_utf8_pat
6039                 || state_num == EXACTFU_SS
6040                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6041             {
6042               /* Either target or the pattern are utf8, or has the issue where
6043                * the fold lengths may differ. */
6044                 const char * const l = locinput;
6045                 char *e = reginfo->strend;
6046
6047                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6048                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6049                 {
6050                     sayNO;
6051                 }
6052                 locinput = e;
6053                 break;
6054             }
6055
6056             /* Neither the target nor the pattern are utf8 */
6057             if (UCHARAT(s) != nextchr
6058                 && !NEXTCHR_IS_EOS
6059                 && UCHARAT(s) != fold_array[nextchr])
6060             {
6061                 sayNO;
6062             }
6063             if (reginfo->strend - locinput < ln)
6064                 sayNO;
6065             if (ln > 1 && ! folder(s, locinput, ln))
6066                 sayNO;
6067             locinput += ln;
6068             break;
6069         }
6070
6071         case NBOUNDL: /*  /\B/l  */
6072             to_complement = 1;
6073             /* FALLTHROUGH */
6074
6075         case BOUNDL:  /*  /\b/l  */
6076         {
6077             bool b1, b2;
6078             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6079
6080             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6081                 if (! IN_UTF8_CTYPE_LOCALE) {
6082                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6083                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6084                 }
6085                 goto boundu;
6086             }
6087
6088             if (utf8_target) {
6089                 if (locinput == reginfo->strbeg)
6090                     b1 = isWORDCHAR_LC('\n');
6091                 else {
6092                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6093                                                         (U8*)(reginfo->strbeg)),
6094                                                  (U8*)(reginfo->strend));
6095                 }
6096                 b2 = (NEXTCHR_IS_EOS)
6097                     ? isWORDCHAR_LC('\n')
6098                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6099                                               (U8*) reginfo->strend);
6100             }
6101             else { /* Here the string isn't utf8 */
6102                 b1 = (locinput == reginfo->strbeg)
6103                      ? isWORDCHAR_LC('\n')
6104                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6105                 b2 = (NEXTCHR_IS_EOS)
6106                     ? isWORDCHAR_LC('\n')
6107                     : isWORDCHAR_LC(nextchr);
6108             }
6109             if (to_complement ^ (b1 == b2)) {
6110                 sayNO;
6111             }
6112             break;
6113         }
6114
6115         case NBOUND:  /*  /\B/   */
6116             to_complement = 1;
6117             /* FALLTHROUGH */
6118
6119         case BOUND:   /*  /\b/   */
6120             if (utf8_target) {
6121                 goto bound_utf8;
6122             }
6123             goto bound_ascii_match_only;
6124
6125         case NBOUNDA: /*  /\B/a  */
6126             to_complement = 1;
6127             /* FALLTHROUGH */
6128
6129         case BOUNDA:  /*  /\b/a  */
6130         {
6131             bool b1, b2;
6132
6133           bound_ascii_match_only:
6134             /* Here the string isn't utf8, or is utf8 and only ascii characters
6135              * are to match \w.  In the latter case looking at the byte just
6136              * prior to the current one may be just the final byte of a
6137              * multi-byte character.  This is ok.  There are two cases:
6138              * 1) it is a single byte character, and then the test is doing
6139              *    just what it's supposed to.
6140              * 2) it is a multi-byte character, in which case the final byte is
6141              *    never mistakable for ASCII, and so the test will say it is
6142              *    not a word character, which is the correct answer. */
6143             b1 = (locinput == reginfo->strbeg)
6144                  ? isWORDCHAR_A('\n')
6145                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6146             b2 = (NEXTCHR_IS_EOS)
6147                 ? isWORDCHAR_A('\n')
6148                 : isWORDCHAR_A(nextchr);
6149             if (to_complement ^ (b1 == b2)) {
6150                 sayNO;
6151             }
6152             break;
6153         }
6154
6155         case NBOUNDU: /*  /\B/u  */
6156             to_complement = 1;
6157             /* FALLTHROUGH */
6158
6159         case BOUNDU:  /*  /\b/u  */
6160
6161           boundu:
6162             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6163                 match = FALSE;
6164             }
6165             else if (utf8_target) {
6166               bound_utf8:
6167                 switch((bound_type) FLAGS(scan)) {
6168                     case TRADITIONAL_BOUND:
6169                     {
6170                         bool b1, b2;
6171                         b1 = (locinput == reginfo->strbeg)
6172                              ? 0 /* isWORDCHAR_L1('\n') */
6173                              : isWORDCHAR_utf8_safe(
6174                                                reghop3((U8*)locinput,
6175                                                        -1,
6176                                                        (U8*)(reginfo->strbeg)),
6177                                                     (U8*) reginfo->strend);
6178                         b2 = (NEXTCHR_IS_EOS)
6179                             ? 0 /* isWORDCHAR_L1('\n') */
6180                             : isWORDCHAR_utf8_safe((U8*)locinput,
6181                                                    (U8*) reginfo->strend);
6182                         match = cBOOL(b1 != b2);
6183                         break;
6184                     }
6185                     case GCB_BOUND:
6186                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6187                             match = TRUE; /* GCB always matches at begin and
6188                                              end */
6189                         }
6190                         else {
6191                             /* Find the gcb values of previous and current
6192                              * chars, then see if is a break point */
6193                             match = isGCB(getGCB_VAL_UTF8(
6194                                                 reghop3((U8*)locinput,
6195                                                         -1,
6196                                                         (U8*)(reginfo->strbeg)),
6197                                                 (U8*) reginfo->strend),
6198                                           getGCB_VAL_UTF8((U8*) locinput,
6199                                                         (U8*) reginfo->strend),
6200                                           (U8*) reginfo->strbeg,
6201                                           (U8*) locinput,
6202                                           utf8_target);
6203                         }
6204                         break;
6205
6206                     case LB_BOUND:
6207                         if (locinput == reginfo->strbeg) {
6208                             match = FALSE;
6209                         }
6210                         else if (NEXTCHR_IS_EOS) {
6211                             match = TRUE;
6212                         }
6213                         else {
6214                             match = isLB(getLB_VAL_UTF8(
6215                                                 reghop3((U8*)locinput,
6216                                                         -1,
6217                                                         (U8*)(reginfo->strbeg)),
6218                                                 (U8*) reginfo->strend),
6219                                           getLB_VAL_UTF8((U8*) locinput,
6220                                                         (U8*) reginfo->strend),
6221                                           (U8*) reginfo->strbeg,
6222                                           (U8*) locinput,
6223                                           (U8*) reginfo->strend,
6224                                           utf8_target);
6225                         }
6226                         break;
6227
6228                     case SB_BOUND: /* Always matches at begin and end */
6229                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6230                             match = TRUE;
6231                         }
6232                         else {
6233                             match = isSB(getSB_VAL_UTF8(
6234                                                 reghop3((U8*)locinput,
6235                                                         -1,
6236                                                         (U8*)(reginfo->strbeg)),
6237                                                 (U8*) reginfo->strend),
6238                                           getSB_VAL_UTF8((U8*) locinput,
6239                                                         (U8*) reginfo->strend),
6240                                           (U8*) reginfo->strbeg,
6241                                           (U8*) locinput,
6242                                           (U8*) reginfo->strend,
6243                                           utf8_target);
6244                         }
6245                         break;
6246
6247                     case WB_BOUND:
6248                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6249                             match = TRUE;
6250                         }
6251                         else {
6252                             match = isWB(WB_UNKNOWN,
6253                                          getWB_VAL_UTF8(
6254                                                 reghop3((U8*)locinput,
6255                                                         -1,
6256                                                         (U8*)(reginfo->strbeg)),
6257                                                 (U8*) reginfo->strend),
6258                                           getWB_VAL_UTF8((U8*) locinput,
6259                                                         (U8*) reginfo->strend),
6260                                           (U8*) reginfo->strbeg,
6261                                           (U8*) locinput,
6262                                           (U8*) reginfo->strend,
6263                                           utf8_target);
6264                         }
6265                         break;
6266                 }
6267             }
6268             else {  /* Not utf8 target */
6269                 switch((bound_type) FLAGS(scan)) {
6270                     case TRADITIONAL_BOUND:
6271                     {
6272                         bool b1, b2;
6273                         b1 = (locinput == reginfo->strbeg)
6274                             ? 0 /* isWORDCHAR_L1('\n') */
6275                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6276                         b2 = (NEXTCHR_IS_EOS)
6277                             ? 0 /* isWORDCHAR_L1('\n') */
6278                             : isWORDCHAR_L1(nextchr);
6279                         match = cBOOL(b1 != b2);
6280                         break;
6281                     }
6282
6283                     case GCB_BOUND:
6284                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6285                             match = TRUE; /* GCB always matches at begin and
6286                                              end */
6287                         }
6288                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6289                                    range */
6290                             match =    UCHARAT(locinput - 1) != '\r'
6291                                     || UCHARAT(locinput) != '\n';
6292                         }
6293                         break;
6294
6295                     case LB_BOUND:
6296                         if (locinput == reginfo->strbeg) {
6297                             match = FALSE;
6298                         }
6299                         else if (NEXTCHR_IS_EOS) {
6300                             match = TRUE;
6301                         }
6302                         else {
6303                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6304                                          getLB_VAL_CP(UCHARAT(locinput)),
6305                                          (U8*) reginfo->strbeg,
6306                                          (U8*) locinput,
6307                                          (U8*) reginfo->strend,
6308                                          utf8_target);
6309                         }
6310                         break;
6311
6312                     case SB_BOUND: /* Always matches at begin and end */
6313                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6314                             match = TRUE;
6315                         }
6316                         else {
6317                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6318                                          getSB_VAL_CP(UCHARAT(locinput)),
6319                                          (U8*) reginfo->strbeg,
6320                                          (U8*) locinput,
6321                                          (U8*) reginfo->strend,
6322                                          utf8_target);
6323                         }
6324                         break;
6325
6326                     case WB_BOUND:
6327                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6328                             match = TRUE;
6329                         }
6330                         else {
6331                             match = isWB(WB_UNKNOWN,
6332                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6333                                          getWB_VAL_CP(UCHARAT(locinput)),
6334                                          (U8*) reginfo->strbeg,
6335                                          (U8*) locinput,
6336                                          (U8*) reginfo->strend,
6337                                          utf8_target);
6338                         }
6339                         break;
6340                 }
6341             }
6342
6343             if (to_complement ^ ! match) {
6344                 sayNO;
6345             }
6346             break;
6347
6348         case ANYOFL:  /*  /[abc]/l      */
6349             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6350
6351             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6352             {
6353               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6354             }
6355             /* FALLTHROUGH */
6356         case ANYOFD:  /*   /[abc]/d       */
6357         case ANYOF:  /*   /[abc]/       */
6358             if (NEXTCHR_IS_EOS)
6359                 sayNO;
6360             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6361                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6362                                                                    utf8_target))
6363                     sayNO;
6364                 locinput += UTF8SKIP(locinput);
6365             }
6366             else {
6367                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6368                     sayNO;
6369                 locinput++;
6370             }
6371             break;
6372
6373         /* The argument (FLAGS) to all the POSIX node types is the class number
6374          * */
6375
6376         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6377             to_complement = 1;
6378             /* FALLTHROUGH */
6379
6380         case POSIXL:    /* \w or [:punct:] etc. under /l */
6381             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6382             if (NEXTCHR_IS_EOS)
6383                 sayNO;
6384
6385             /* Use isFOO_lc() for characters within Latin1.  (Note that
6386              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6387              * wouldn't be invariant) */
6388             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6389                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6390                     sayNO;
6391                 }
6392
6393                 locinput++;
6394                 break;
6395             }
6396
6397             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6398                 /* An above Latin-1 code point, or malformed */
6399                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6400                                                        reginfo->strend);
6401                 goto utf8_posix_above_latin1;
6402             }
6403
6404             /* Here is a UTF-8 variant code point below 256 and the target is
6405              * UTF-8 */
6406             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6407                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6408                                             *(locinput + 1))))))
6409             {
6410                 sayNO;
6411             }
6412
6413             goto increment_locinput;
6414
6415         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6416             to_complement = 1;
6417             /* FALLTHROUGH */
6418
6419         case POSIXD:    /* \w or [:punct:] etc. under /d */
6420             if (utf8_target) {
6421                 goto utf8_posix;
6422             }
6423             goto posixa;
6424
6425         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6426
6427             if (NEXTCHR_IS_EOS) {
6428                 sayNO;
6429             }
6430
6431             /* All UTF-8 variants match */
6432             if (! UTF8_IS_INVARIANT(nextchr)) {
6433                 goto increment_locinput;
6434             }
6435
6436             to_complement = 1;
6437             goto join_nposixa;
6438
6439         case POSIXA:    /* \w or [:punct:] etc. under /a */
6440
6441           posixa:
6442             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6443              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6444              * character is a single byte */
6445
6446             if (NEXTCHR_IS_EOS) {
6447                 sayNO;
6448             }
6449
6450           join_nposixa:
6451
6452             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6453                                                                 FLAGS(scan)))))
6454             {
6455                 sayNO;
6456             }
6457
6458             /* Here we are either not in utf8, or we matched a utf8-invariant,
6459              * so the next char is the next byte */
6460             locinput++;
6461             break;
6462
6463         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6464             to_complement = 1;
6465             /* FALLTHROUGH */
6466
6467         case POSIXU:    /* \w or [:punct:] etc. under /u */
6468           utf8_posix:
6469             if (NEXTCHR_IS_EOS) {
6470                 sayNO;
6471             }
6472
6473             /* Use _generic_isCC() for characters within Latin1.  (Note that
6474              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6475              * wouldn't be invariant) */
6476             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6477                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6478                                                            FLAGS(scan)))))
6479                 {
6480                     sayNO;
6481                 }
6482                 locinput++;
6483             }
6484             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6485                 if (! (to_complement
6486                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6487                                                                *(locinput + 1)),
6488                                              FLAGS(scan)))))
6489                 {
6490                     sayNO;
6491                 }
6492                 locinput += 2;
6493             }
6494             else {  /* Handle above Latin-1 code points */
6495               utf8_posix_above_latin1:
6496                 classnum = (_char_class_number) FLAGS(scan);
6497                 if (classnum < _FIRST_NON_SWASH_CC) {
6498
6499                     /* Here, uses a swash to find such code points.  Load if if
6500                      * not done already */
6501                     if (! PL_utf8_swash_ptrs[classnum]) {
6502                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6503                         PL_utf8_swash_ptrs[classnum]
6504                                 = _core_swash_init("utf8",
6505                                         "",
6506                                         &PL_sv_undef, 1, 0,
6507                                         PL_XPosix_ptrs[classnum], &flags);
6508                     }
6509                     if (! (to_complement
6510                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6511                                                (U8 *) locinput, TRUE))))
6512                     {
6513                         sayNO;
6514                     }
6515                 }
6516                 else {  /* Here, uses macros to find above Latin-1 code points */
6517                     switch (classnum) {
6518                         case _CC_ENUM_SPACE:
6519                             if (! (to_complement
6520                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
6521                             {
6522                                 sayNO;
6523                             }
6524                             break;
6525                         case _CC_ENUM_BLANK:
6526                             if (! (to_complement
6527                                             ^ cBOOL(is_HORIZWS_high(locinput))))
6528                             {
6529                                 sayNO;
6530                             }
6531                             break;
6532                         case _CC_ENUM_XDIGIT:
6533                             if (! (to_complement
6534                                             ^ cBOOL(is_XDIGIT_high(locinput))))
6535                             {
6536                                 sayNO;
6537                             }
6538                             break;
6539                         case _CC_ENUM_VERTSPACE:
6540                             if (! (to_complement
6541                                             ^ cBOOL(is_VERTWS_high(locinput))))
6542                             {
6543                                 sayNO;
6544                             }
6545                             break;
6546                         default:    /* The rest, e.g. [:cntrl:], can't match
6547                                        above Latin1 */
6548                             if (! to_complement) {
6549                                 sayNO;
6550                             }
6551                             break;
6552                     }
6553                 }
6554                 locinput += UTF8SKIP(locinput);
6555             }
6556             break;
6557
6558         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6559                        a Unicode extended Grapheme Cluster */
6560             if (NEXTCHR_IS_EOS)
6561                 sayNO;
6562             if  (! utf8_target) {
6563
6564                 /* Match either CR LF  or '.', as all the other possibilities
6565                  * require utf8 */
6566                 locinput++;         /* Match the . or CR */
6567                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6568                                        match the LF */
6569                     && locinput < reginfo->strend
6570                     && UCHARAT(locinput) == '\n')
6571                 {
6572                     locinput++;
6573                 }
6574             }
6575             else {
6576
6577                 /* Get the gcb type for the current character */
6578                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6579                                                        (U8*) reginfo->strend);
6580
6581                 /* Then scan through the input until we get to the first
6582                  * character whose type is supposed to be a gcb with the
6583                  * current character.  (There is always a break at the
6584                  * end-of-input) */
6585                 locinput += UTF8SKIP(locinput);
6586                 while (locinput < reginfo->strend) {
6587                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6588                                                          (U8*) reginfo->strend);
6589                     if (isGCB(prev_gcb, cur_gcb,
6590                               (U8*) reginfo->strbeg, (U8*) locinput,
6591                               utf8_target))
6592                     {
6593                         break;
6594                     }
6595
6596                     prev_gcb = cur_gcb;
6597                     locinput += UTF8SKIP(locinput);
6598                 }
6599
6600
6601             }
6602             break;
6603
6604         case NREFFL:  /*  /\g{name}/il  */
6605         {   /* The capture buffer cases.  The ones beginning with N for the
6606                named buffers just convert to the equivalent numbered and
6607                pretend they were called as the corresponding numbered buffer
6608                op.  */
6609             /* don't initialize these in the declaration, it makes C++
6610                unhappy */
6611             const char *s;
6612             char type;
6613             re_fold_t folder;
6614             const U8 *fold_array;
6615             UV utf8_fold_flags;
6616
6617             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6618             folder = foldEQ_locale;
6619             fold_array = PL_fold_locale;
6620             type = REFFL;
6621             utf8_fold_flags = FOLDEQ_LOCALE;
6622             goto do_nref;
6623
6624         case NREFFA:  /*  /\g{name}/iaa  */
6625             folder = foldEQ_latin1;
6626             fold_array = PL_fold_latin1;
6627             type = REFFA;
6628             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6629             goto do_nref;
6630
6631         case NREFFU:  /*  /\g{name}/iu  */
6632             folder = foldEQ_latin1;
6633             fold_array = PL_fold_latin1;
6634             type = REFFU;
6635             utf8_fold_flags = 0;
6636             goto do_nref;
6637
6638         case NREFF:  /*  /\g{name}/i  */
6639             folder = foldEQ;
6640             fold_array = PL_fold;
6641             type = REFF;
6642             utf8_fold_flags = 0;
6643             goto do_nref;
6644
6645         case NREF:  /*  /\g{name}/   */
6646             type = REF;
6647             folder = NULL;
6648             fold_array = NULL;
6649             utf8_fold_flags = 0;
6650           do_nref:
6651
6652             /* For the named back references, find the corresponding buffer
6653              * number */
6654             n = reg_check_named_buff_matched(rex,scan);
6655
6656             if ( ! n ) {
6657                 sayNO;
6658             }
6659             goto do_nref_ref_common;
6660
6661         case REFFL:  /*  /\1/il  */
6662             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6663             folder = foldEQ_locale;
6664             fold_array = PL_fold_locale;
6665             utf8_fold_flags = FOLDEQ_LOCALE;
6666             goto do_ref;
6667
6668         case REFFA:  /*  /\1/iaa  */
6669             folder = foldEQ_latin1;
6670             fold_array = PL_fold_latin1;
6671             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6672             goto do_ref;
6673
6674         case REFFU:  /*  /\1/iu  */
6675             folder = foldEQ_latin1;
6676             fold_array = PL_fold_latin1;
6677             utf8_fold_flags = 0;
6678             goto do_ref;
6679
6680         case REFF:  /*  /\1/i  */
6681             folder = foldEQ;
6682             fold_array = PL_fold;
6683             utf8_fold_flags = 0;
6684             goto do_ref;
6685
6686         case REF:  /*  /\1/    */
6687             folder = NULL;
6688             fold_array = NULL;
6689             utf8_fold_flags = 0;
6690
6691           do_ref:
6692             type = OP(scan);
6693             n = ARG(scan);  /* which paren pair */
6694
6695           do_nref_ref_common:
6696             ln = rex->offs[n].start;
6697             endref = rex->offs[n].end;
6698             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6699             if (rex->lastparen < n || ln == -1 || endref == -1)
6700                 sayNO;                  /* Do not match unless seen CLOSEn. */
6701             if (ln == endref)
6702                 break;
6703
6704             s = reginfo->strbeg + ln;
6705             if (type != REF     /* REF can do byte comparison */
6706                 && (utf8_target || type == REFFU || type == REFFL))
6707             {
6708                 char * limit = reginfo->strend;
6709
6710                 /* This call case insensitively compares the entire buffer
6711                     * at s, with the current input starting at locinput, but
6712                     * not going off the end given by reginfo->strend, and
6713                     * returns in <limit> upon success, how much of the
6714                     * current input was matched */
6715                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
6716                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6717                 {
6718                     sayNO;
6719                 }
6720                 locinput = limit;
6721                 break;
6722             }
6723
6724             /* Not utf8:  Inline the first character, for speed. */
6725             if (!NEXTCHR_IS_EOS &&
6726                 UCHARAT(s) != nextchr &&
6727                 (type == REF ||
6728                  UCHARAT(s) != fold_array[nextchr]))
6729                 sayNO;
6730             ln = endref - ln;
6731             if (locinput + ln > reginfo->strend)
6732                 sayNO;
6733             if (ln > 1 && (type == REF
6734                            ? memNE(s, locinput, ln)
6735                            : ! folder(s, locinput, ln)))
6736                 sayNO;
6737             locinput += ln;
6738             break;
6739         }
6740
6741         case NOTHING: /* null op; e.g. the 'nothing' following
6742                        * the '*' in m{(a+|b)*}' */
6743             break;
6744         case TAIL: /* placeholder while compiling (A|B|C) */
6745             break;
6746
6747 #undef  ST
6748 #define ST st->u.eval
6749 #define CUR_EVAL cur_eval->u.eval
6750
6751         {
6752             SV *ret;
6753             REGEXP *re_sv;
6754             regexp *re;
6755             regexp_internal *rei;
6756             regnode *startpoint;
6757             U32 arg;
6758
6759         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6760             arg= (U32)ARG(scan);
6761             if (cur_eval && cur_eval->locinput == locinput) {
6762                 if ( ++nochange_depth > max_nochange_depth )
6763                     Perl_croak(aTHX_
6764                         "Pattern subroutine nesting without pos change"
6765                         " exceeded limit in regex");
6766             } else {
6767                 nochange_depth = 0;
6768             }
6769             re_sv = rex_sv;
6770             re = rex;
6771             rei = rexi;
6772             startpoint = scan + ARG2L(scan);
6773             EVAL_CLOSE_PAREN_SET( st, arg );
6774             /* Detect infinite recursion
6775              *
6776              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6777              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6778              * So we track the position in the string we are at each time
6779              * we recurse and if we try to enter the same routine twice from
6780              * the same position we throw an error.
6781              */
6782             if ( rex->recurse_locinput[arg] == locinput ) {
6783                 /* FIXME: we should show the regop that is failing as part
6784                  * of the error message. */
6785                 Perl_croak(aTHX_ "Infinite recursion in regex");
6786             } else {
6787                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6788                 rex->recurse_locinput[arg]= locinput;
6789
6790                 DEBUG_r({
6791                     GET_RE_DEBUG_FLAGS_DECL;
6792                     DEBUG_STACK_r({
6793                         Perl_re_exec_indentf( aTHX_
6794                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6795                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6796                         );
6797                     });
6798                 });
6799             }
6800
6801             /* Save all the positions seen so far. */
6802             ST.cp = regcppush(rex, 0, maxopenparen);
6803             REGCP_SET(ST.lastcp);
6804
6805             /* and then jump to the code we share with EVAL */
6806             goto eval_recurse_doit;
6807             /* NOTREACHED */
6808
6809         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
6810             if (cur_eval && cur_eval->locinput==locinput) {
6811                 if ( ++nochange_depth > max_nochange_depth )
6812                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6813             } else {
6814                 nochange_depth = 0;
6815             }
6816             {
6817                 /* execute the code in the {...} */
6818
6819                 dSP;
6820                 IV before;
6821                 OP * const oop = PL_op;
6822                 COP * const ocurcop = PL_curcop;
6823                 OP *nop;
6824                 CV *newcv;
6825
6826                 /* save *all* paren positions */
6827                 regcppush(rex, 0, maxopenparen);
6828                 REGCP_SET(ST.lastcp);
6829
6830                 if (!caller_cv)
6831                     caller_cv = find_runcv(NULL);
6832
6833                 n = ARG(scan);
6834
6835                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6836                     newcv = (ReANY(
6837                                     (REGEXP*)(rexi->data->data[n])
6838                             ))->qr_anoncv;
6839                     nop = (OP*)rexi->data->data[n+1];
6840                 }
6841                 else if (rexi->data->what[n] == 'l') { /* literal code */
6842                     newcv = caller_cv;
6843                     nop = (OP*)rexi->data->data[n];
6844                     assert(CvDEPTH(newcv));
6845                 }
6846                 else {
6847                     /* literal with own CV */
6848                     assert(rexi->data->what[n] == 'L');
6849                     newcv = rex->qr_anoncv;
6850                     nop = (OP*)rexi->data->data[n];
6851                 }
6852
6853                 /* Some notes about MULTICALL and the context and save stacks.
6854                  *
6855                  * In something like
6856                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
6857                  * since codeblocks don't introduce a new scope (so that
6858                  * local() etc accumulate), at the end of a successful
6859                  * match there will be a SAVEt_CLEARSV on the savestack
6860                  * for each of $x, $y, $z. If the three code blocks above
6861                  * happen to have come from different CVs (e.g. via
6862                  * embedded qr//s), then we must ensure that during any
6863                  * savestack unwinding, PL_comppad always points to the
6864                  * right pad at each moment. We achieve this by
6865                  * interleaving SAVEt_COMPPAD's on the savestack whenever
6866                  * there is a change of pad.
6867                  * In theory whenever we call a code block, we should
6868                  * push a CXt_SUB context, then pop it on return from
6869                  * that code block. This causes a bit of an issue in that
6870                  * normally popping a context also clears the savestack
6871                  * back to cx->blk_oldsaveix, but here we specifically
6872                  * don't want to clear the save stack on exit from the
6873                  * code block.
6874                  * Also for efficiency we don't want to keep pushing and
6875                  * popping the single SUB context as we backtrack etc.
6876                  * So instead, we push a single context the first time
6877                  * we need, it, then hang onto it until the end of this
6878                  * function. Whenever we encounter a new code block, we
6879                  * update the CV etc if that's changed. During the times
6880                  * in this function where we're not executing a code
6881                  * block, having the SUB context still there is a bit
6882                  * naughty - but we hope that no-one notices.
6883                  * When the SUB context is initially pushed, we fake up
6884                  * cx->blk_oldsaveix to be as if we'd pushed this context
6885                  * on first entry to S_regmatch rather than at some random
6886                  * point during the regexe execution. That way if we
6887                  * croak, popping the context stack will ensure that
6888                  * *everything* SAVEd by this function is undone and then
6889                  * the context popped, rather than e.g., popping the
6890                  * context (and restoring the original PL_comppad) then
6891                  * popping more of the savestack and restoring a bad
6892                  * PL_comppad.
6893                  */
6894
6895                 /* If this is the first EVAL, push a MULTICALL. On
6896                  * subsequent calls, if we're executing a different CV, or
6897                  * if PL_comppad has got messed up from backtracking
6898                  * through SAVECOMPPADs, then refresh the context.
6899                  */
6900                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6901                 {
6902                     U8 flags = (CXp_SUB_RE |
6903                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6904                     SAVECOMPPAD();
6905                     if (last_pushed_cv) {
6906                         CHANGE_MULTICALL_FLAGS(newcv, flags);
6907                     }
6908                     else {
6909                         PUSH_MULTICALL_FLAGS(newcv, flags);
6910                     }
6911                     /* see notes above */
6912                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
6913
6914                     last_pushed_cv = newcv;
6915                 }
6916                 else {
6917                     /* these assignments are just to silence compiler
6918                      * warnings */
6919                     multicall_cop = NULL;
6920                 }
6921                 last_pad = PL_comppad;
6922
6923                 /* the initial nextstate you would normally execute
6924                  * at the start of an eval (which would cause error
6925                  * messages to come from the eval), may be optimised
6926                  * away from the execution path in the regex code blocks;
6927                  * so manually set PL_curcop to it initially */
6928                 {
6929                     OP *o = cUNOPx(nop)->op_first;
6930                     assert(o->op_type == OP_NULL);
6931                     if (o->op_targ == OP_SCOPE) {
6932                         o = cUNOPo->op_first;
6933                     }
6934                     else {
6935                         assert(o->op_targ == OP_LEAVE);
6936                         o = cUNOPo->op_first;
6937                         assert(o->op_type == OP_ENTER);
6938                         o = OpSIBLING(o);
6939                     }
6940
6941                     if (o->op_type != OP_STUB) {
6942                         assert(    o->op_type == OP_NEXTSTATE
6943                                 || o->op_type == OP_DBSTATE
6944                                 || (o->op_type == OP_NULL
6945                                     &&  (  o->op_targ == OP_NEXTSTATE
6946                                         || o->op_targ == OP_DBSTATE
6947                                         )
6948                                     )
6949                         );
6950                         PL_curcop = (COP*)o;
6951                     }
6952                 }
6953                 nop = nop->op_next;
6954
6955                 DEBUG_STATE_r( Perl_re_printf( aTHX_
6956                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
6957
6958                 rex->offs[0].end = locinput - reginfo->strbeg;
6959                 if (reginfo->info_aux_eval->pos_magic)
6960                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6961                                   reginfo->sv, reginfo->strbeg,
6962                                   locinput - reginfo->strbeg);
6963
6964                 if (sv_yes_mark) {
6965                     SV *sv_mrk = get_sv("REGMARK", 1);
6966                     sv_setsv(sv_mrk, sv_yes_mark);
6967                 }
6968
6969                 /* we don't use MULTICALL here as we want to call the
6970                  * first op of the block of interest, rather than the
6971                  * first op of the sub. Also, we don't want to free
6972                  * the savestack frame */
6973                 before = (IV)(SP-PL_stack_base);
6974                 PL_op = nop;
6975                 CALLRUNOPS(aTHX);                       /* Scalar context. */
6976                 SPAGAIN;
6977                 if ((IV)(SP-PL_stack_base) == before)
6978                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6979                 else {
6980                     ret = POPs;
6981                     PUTBACK;
6982                 }
6983
6984                 /* before restoring everything, evaluate the returned
6985                  * value, so that 'uninit' warnings don't use the wrong
6986                  * PL_op or pad. Also need to process any magic vars
6987                  * (e.g. $1) *before* parentheses are restored */
6988
6989                 PL_op = NULL;
6990
6991                 re_sv = NULL;
6992                 if (logical == 0)        /*   (?{})/   */
6993                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6994                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6995                     sw = cBOOL(SvTRUE_NN(ret));
6996                     logical = 0;
6997                 }
6998                 else {                   /*  /(??{})  */
6999                     /*  if its overloaded, let the regex compiler handle
7000                      *  it; otherwise extract regex, or stringify  */
7001                     if (SvGMAGICAL(ret))
7002                         ret = sv_mortalcopy(ret);
7003                     if (!SvAMAGIC(ret)) {
7004                         SV *sv = ret;
7005                         if (SvROK(sv))
7006                             sv = SvRV(sv);
7007                         if (SvTYPE(sv) == SVt_REGEXP)
7008                             re_sv = (REGEXP*) sv;
7009                         else if (SvSMAGICAL(ret)) {
7010                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7011                             if (mg)
7012                                 re_sv = (REGEXP *) mg->mg_obj;
7013                         }
7014
7015                         /* force any undef warnings here */
7016                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7017                             ret = sv_mortalcopy(ret);
7018                             (void) SvPV_force_nolen(ret);
7019                         }
7020                     }
7021
7022                 }
7023
7024                 /* *** Note that at this point we don't restore
7025                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7026                  * be used again soon. This is safe as long as nothing
7027                  * in the regexp code uses the pad ! */
7028                 PL_op = oop;
7029                 PL_curcop = ocurcop;
7030                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7031                 PL_curpm_under = PL_curpm;
7032                 PL_curpm = PL_reg_curpm;
7033
7034                 if (logical != 2) {
7035                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7036                     /* NOTREACHED */
7037                 }
7038             }
7039
7040                 /* only /(??{})/  from now on */
7041                 logical = 0;
7042                 {
7043                     /* extract RE object from returned value; compiling if
7044                      * necessary */
7045
7046                     if (re_sv) {
7047                         re_sv = reg_temp_copy(NULL, re_sv);
7048                     }
7049                     else {
7050                         U32 pm_flags = 0;
7051
7052                         if (SvUTF8(ret) && IN_BYTES) {
7053                             /* In use 'bytes': make a copy of the octet
7054                              * sequence, but without the flag on */
7055                             STRLEN len;
7056                             const char *const p = SvPV(ret, len);
7057                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7058                         }
7059                         if (rex->intflags & PREGf_USE_RE_EVAL)
7060                             pm_flags |= PMf_USE_RE_EVAL;
7061
7062                         /* if we got here, it should be an engine which
7063                          * supports compiling code blocks and stuff */
7064                         assert(rex->engine && rex->engine->op_comp);
7065                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7066                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7067                                     rex->engine, NULL, NULL,
7068                                     /* copy /msixn etc to inner pattern */
7069                                     ARG2L(scan),
7070                                     pm_flags);
7071
7072                         if (!(SvFLAGS(ret)
7073                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7074                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7075                             /* This isn't a first class regexp. Instead, it's
7076                                caching a regexp onto an existing, Perl visible
7077                                scalar.  */
7078                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7079                         }
7080                     }
7081                     SAVEFREESV(re_sv);
7082                     re = ReANY(re_sv);
7083                 }
7084                 RXp_MATCH_COPIED_off(re);
7085                 re->subbeg = rex->subbeg;
7086                 re->sublen = rex->sublen;
7087                 re->suboffset = rex->suboffset;
7088                 re->subcoffset = rex->subcoffset;
7089                 re->lastparen = 0;
7090                 re->lastcloseparen = 0;
7091                 rei = RXi_GET(re);
7092                 DEBUG_EXECUTE_r(
7093                     debug_start_match(re_sv, utf8_target, locinput,
7094                                     reginfo->strend, "Matching embedded");
7095                 );
7096                 startpoint = rei->program + 1;
7097                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7098                                              * close_paren only for GOSUB */
7099                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7100                 /* Save all the seen positions so far. */
7101                 ST.cp = regcppush(rex, 0, maxopenparen);
7102                 REGCP_SET(ST.lastcp);
7103                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7104                 maxopenparen = 0;
7105                 /* run the pattern returned from (??{...}) */
7106
7107               eval_recurse_doit: /* Share code with GOSUB below this line
7108                             * At this point we expect the stack context to be
7109                             * set up correctly */
7110
7111                 /* invalidate the S-L poscache. We're now executing a
7112                  * different set of WHILEM ops (and their associated
7113                  * indexes) against the same string, so the bits in the
7114                  * cache are meaningless. Setting maxiter to zero forces
7115                  * the cache to be invalidated and zeroed before reuse.
7116                  * XXX This is too dramatic a measure. Ideally we should
7117                  * save the old cache and restore when running the outer
7118                  * pattern again */
7119                 reginfo->poscache_maxiter = 0;
7120
7121                 /* the new regexp might have a different is_utf8_pat than we do */
7122                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7123
7124                 ST.prev_rex = rex_sv;
7125                 ST.prev_curlyx = cur_curlyx;
7126                 rex_sv = re_sv;
7127                 SET_reg_curpm(rex_sv);
7128                 rex = re;
7129                 rexi = rei;
7130                 cur_curlyx = NULL;
7131                 ST.B = next;
7132                 ST.prev_eval = cur_eval;
7133                 cur_eval = st;
7134                 /* now continue from first node in postoned RE */
7135                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7136                 NOT_REACHED; /* NOTREACHED */
7137         }
7138
7139         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7140             /* note: this is called twice; first after popping B, then A */
7141             DEBUG_STACK_r({
7142                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7143                     depth, cur_eval, ST.prev_eval);
7144             });
7145
7146 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7147             if ( cur_eval && CUR_EVAL.close_paren ) {\
7148                 DEBUG_STACK_r({ \
7149                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7150                         depth,    \
7151                         CUR_EVAL.close_paren - 1,\
7152                         cur_eval, \
7153                         VAL);     \
7154                 });               \
7155                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7156             }
7157
7158             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7159
7160             rex_sv = ST.prev_rex;
7161             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7162             SET_reg_curpm(rex_sv);
7163             rex = ReANY(rex_sv);
7164             rexi = RXi_GET(rex);
7165             {
7166                 /* preserve $^R across LEAVE's. See Bug 121070. */
7167                 SV *save_sv= GvSV(PL_replgv);
7168                 SvREFCNT_inc(save_sv);
7169                 regcpblow(ST.cp); /* LEAVE in disguise */
7170                 sv_setsv(GvSV(PL_replgv), save_sv);
7171                 SvREFCNT_dec(save_sv);
7172             }
7173             cur_eval = ST.prev_eval;
7174             cur_curlyx = ST.prev_curlyx;
7175
7176             /* Invalidate cache. See "invalidate" comment above. */
7177             reginfo->poscache_maxiter = 0;
7178             if ( nochange_depth )
7179                 nochange_depth--;
7180
7181             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7182             sayYES;
7183
7184
7185         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7186             REGCP_UNWIND(ST.lastcp);
7187             sayNO;
7188
7189         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7190             /* note: this is called twice; first after popping B, then A */
7191             DEBUG_STACK_r({
7192                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7193                     depth, cur_eval, ST.prev_eval);
7194             });
7195
7196             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7197
7198             rex_sv = ST.prev_rex;
7199             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7200             SET_reg_curpm(rex_sv);
7201             rex = ReANY(rex_sv);
7202             rexi = RXi_GET(rex);
7203
7204             REGCP_UNWIND(ST.lastcp);
7205             regcppop(rex, &maxopenparen);
7206             cur_eval = ST.prev_eval;
7207             cur_curlyx = ST.prev_curlyx;
7208
7209             /* Invalidate cache. See "invalidate" comment above. */
7210             reginfo->poscache_maxiter = 0;
7211             if ( nochange_depth )
7212                 nochange_depth--;
7213
7214             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7215             sayNO_SILENT;
7216 #undef ST
7217
7218         case OPEN: /*  (  */
7219             n = ARG(scan);  /* which paren pair */
7220             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7221             if (n > maxopenparen)
7222                 maxopenparen = n;
7223             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7224                 "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7225                 depth,
7226                 PTR2UV(rex),
7227                 PTR2UV(rex->offs),
7228                 (UV)n,
7229                 (IV)rex->offs[n].start_tmp,
7230                 (UV)maxopenparen
7231             ));
7232             lastopen = n;
7233             break;
7234
7235 /* XXX really need to log other places start/end are set too */
7236 #define CLOSE_CAPTURE                                                      \
7237     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7238     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7239     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
7240         "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
7241         depth,                                                             \
7242         PTR2UV(rex),                                                       \
7243         PTR2UV(rex->offs),                                                 \
7244         (UV)n,                                                             \
7245         (IV)rex->offs[n].start,                                            \
7246         (IV)rex->offs[n].end                                               \
7247     ))
7248
7249         case CLOSE:  /*  )  */
7250             n = ARG(scan);  /* which paren pair */
7251             CLOSE_CAPTURE;
7252             if (n > rex->lastparen)
7253                 rex->lastparen = n;
7254             rex->lastcloseparen = n;
7255             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7256                 goto fake_end;
7257
7258             break;
7259
7260         case ACCEPT:  /*  (*ACCEPT)  */
7261             if (scan->flags)
7262                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7263             if (ARG2L(scan)){
7264                 regnode *cursor;
7265                 for (cursor=scan;
7266                      cursor && OP(cursor)!=END;
7267                      cursor=regnext(cursor))
7268                 {
7269                     if ( OP(cursor)==CLOSE ){
7270                         n = ARG(cursor);
7271                         if ( n <= lastopen ) {
7272                             CLOSE_CAPTURE;
7273                             if (n > rex->lastparen)
7274                                 rex->lastparen = n;
7275                             rex->lastcloseparen = n;
7276                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7277                                 break;
7278                         }
7279                     }
7280                 }
7281             }
7282             goto fake_end;
7283             /* NOTREACHED */
7284
7285         case GROUPP:  /*  (?(1))  */
7286             n = ARG(scan);  /* which paren pair */
7287             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7288             break;
7289
7290         case NGROUPP:  /*  (?(<name>))  */
7291             /* reg_check_named_buff_matched returns 0 for no match */
7292             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7293             break;
7294
7295         case INSUBP:   /*  (?(R))  */
7296             n = ARG(scan);
7297             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7298              * of SCAN is already set up as matches a eval.close_paren */
7299             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7300             break;
7301
7302         case DEFINEP:  /*  (?(DEFINE))  */
7303             sw = 0;
7304             break;
7305
7306         case IFTHEN:   /*  (?(cond)A|B)  */
7307             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7308             if (sw)
7309                 next = NEXTOPER(NEXTOPER(scan));
7310             else {
7311                 next = scan + ARG(scan);
7312                 if (OP(next) == IFTHEN) /* Fake one. */
7313                     next = NEXTOPER(NEXTOPER(next));
7314             }
7315             break;
7316
7317         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7318             logical = scan->flags;
7319             break;
7320
7321 /*******************************************************************
7322
7323 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7324 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7325 STAR/PLUS/CURLY/CURLYN are used instead.)
7326
7327 A*B is compiled as <CURLYX><A><WHILEM><B>
7328
7329 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7330 state, which contains the current count, initialised to -1. It also sets
7331 cur_curlyx to point to this state, with any previous value saved in the
7332 state block.
7333
7334 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7335 since the pattern may possibly match zero times (i.e. it's a while {} loop
7336 rather than a do {} while loop).
7337
7338 Each entry to WHILEM represents a successful match of A. The count in the
7339 CURLYX block is incremented, another WHILEM state is pushed, and execution
7340 passes to A or B depending on greediness and the current count.
7341
7342 For example, if matching against the string a1a2a3b (where the aN are
7343 substrings that match /A/), then the match progresses as follows: (the
7344 pushed states are interspersed with the bits of strings matched so far):
7345
7346     <CURLYX cnt=-1>
7347     <CURLYX cnt=0><WHILEM>
7348     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7349     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7350     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7351     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7352
7353 (Contrast this with something like CURLYM, which maintains only a single
7354 backtrack state:
7355
7356     <CURLYM cnt=0> a1
7357     a1 <CURLYM cnt=1> a2
7358     a1 a2 <CURLYM cnt=2> a3
7359     a1 a2 a3 <CURLYM cnt=3> b
7360 )
7361
7362 Each WHILEM state block marks a point to backtrack to upon partial failure
7363 of A or B, and also contains some minor state data related to that
7364 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7365 overall state, such as the count, and pointers to the A and B ops.
7366
7367 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7368 must always point to the *current* CURLYX block, the rules are:
7369
7370 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7371 and set cur_curlyx to point the new block.
7372
7373 When popping the CURLYX block after a successful or unsuccessful match,
7374 restore the previous cur_curlyx.
7375
7376 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7377 to the outer one saved in the CURLYX block.
7378
7379 When popping the WHILEM block after a successful or unsuccessful B match,
7380 restore the previous cur_curlyx.
7381
7382 Here's an example for the pattern (AI* BI)*BO
7383 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7384
7385 cur_
7386 curlyx backtrack stack
7387 ------ ---------------
7388 NULL
7389 CO     <CO prev=NULL> <WO>
7390 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7391 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7392 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7393
7394 At this point the pattern succeeds, and we work back down the stack to
7395 clean up, restoring as we go:
7396
7397 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7398 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7399 CO     <CO prev=NULL> <WO>
7400 NULL
7401
7402 *******************************************************************/
7403
7404 #define ST st->u.curlyx
7405
7406         case CURLYX:    /* start of /A*B/  (for complex A) */
7407         {
7408             /* No need to save/restore up to this paren */
7409             I32 parenfloor = scan->flags;
7410
7411             assert(next); /* keep Coverity happy */
7412             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7413                 next += ARG(next);
7414
7415             /* XXXX Probably it is better to teach regpush to support
7416                parenfloor > maxopenparen ... */
7417             if (parenfloor > (I32)rex->lastparen)
7418                 parenfloor = rex->lastparen; /* Pessimization... */
7419
7420             ST.prev_curlyx= cur_curlyx;
7421             cur_curlyx = st;
7422             ST.cp = PL_savestack_ix;
7423
7424             /* these fields contain the state of the current curly.
7425              * they are accessed by subsequent WHILEMs */
7426             ST.parenfloor = parenfloor;
7427             ST.me = scan;
7428             ST.B = next;
7429             ST.minmod = minmod;
7430             minmod = 0;
7431             ST.count = -1;      /* this will be updated by WHILEM */
7432             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7433
7434             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7435             NOT_REACHED; /* NOTREACHED */
7436         }
7437
7438         case CURLYX_end: /* just finished matching all of A*B */
7439             cur_curlyx = ST.prev_curlyx;
7440             sayYES;
7441             NOT_REACHED; /* NOTREACHED */
7442
7443         case CURLYX_end_fail: /* just failed to match all of A*B */
7444             regcpblow(ST.cp);
7445             cur_curlyx = ST.prev_curlyx;
7446             sayNO;
7447             NOT_REACHED; /* NOTREACHED */
7448
7449
7450 #undef ST
7451 #define ST st->u.whilem
7452
7453         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7454         {
7455             /* see the discussion above about CURLYX/WHILEM */
7456             I32 n;
7457             int min, max;
7458             regnode *A;
7459
7460             assert(cur_curlyx); /* keep Coverity happy */
7461
7462             min = ARG1(cur_curlyx->u.curlyx.me);
7463             max = ARG2(cur_curlyx->u.curlyx.me);
7464             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7465             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7466             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7467             ST.cache_offset = 0;
7468             ST.cache_mask = 0;
7469
7470
7471             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
7472                   depth, (long)n, min, max)
7473             );
7474
7475             /* First just match a string of min A's. */
7476
7477             if (n < min) {
7478                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7479                 cur_curlyx->u.curlyx.lastloc = locinput;
7480                 REGCP_SET(ST.lastcp);
7481
7482                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7483                 NOT_REACHED; /* NOTREACHED */
7484             }
7485
7486             /* If degenerate A matches "", assume A done. */
7487
7488             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7489                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
7490                    depth)
7491                 );
7492                 goto do_whilem_B_max;
7493             }
7494
7495             /* super-linear cache processing.
7496              *
7497              * The idea here is that for certain types of CURLYX/WHILEM -
7498              * principally those whose upper bound is infinity (and
7499              * excluding regexes that have things like \1 and other very
7500              * non-regular expresssiony things), then if a pattern like
7501              * /....A*.../ fails and we backtrack to the WHILEM, then we
7502              * make a note that this particular WHILEM op was at string
7503              * position 47 (say) when the rest of pattern failed. Then, if
7504              * we ever find ourselves back at that WHILEM, and at string
7505              * position 47 again, we can just fail immediately rather than
7506              * running the rest of the pattern again.
7507              *
7508              * This is very handy when patterns start to go
7509              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7510              * with a combinatorial explosion of backtracking.
7511              *
7512              * The cache is implemented as a bit array, with one bit per
7513              * string byte position per WHILEM op (up to 16) - so its
7514              * between 0.25 and 2x the string size.
7515              *
7516              * To avoid allocating a poscache buffer every time, we do an
7517              * initially countdown; only after we have  executed a WHILEM
7518              * op (string-length x #WHILEMs) times do we allocate the
7519              * cache.
7520              *
7521              * The top 4 bits of scan->flags byte say how many different
7522              * relevant CURLLYX/WHILEM op pairs there are, while the
7523              * bottom 4-bits is the identifying index number of this
7524              * WHILEM.
7525              */
7526
7527             if (scan->flags) {
7528
7529                 if (!reginfo->poscache_maxiter) {
7530                     /* start the countdown: Postpone detection until we
7531                      * know the match is not *that* much linear. */
7532                     reginfo->poscache_maxiter
7533                         =    (reginfo->strend - reginfo->strbeg + 1)
7534                            * (scan->flags>>4);
7535                     /* possible overflow for long strings and many CURLYX's */
7536                     if (reginfo->poscache_maxiter < 0)
7537                         reginfo->poscache_maxiter = I32_MAX;
7538                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7539                 }
7540
7541                 if (reginfo->poscache_iter-- == 0) {
7542                     /* initialise cache */
7543                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7544                     regmatch_info_aux *const aux = reginfo->info_aux;
7545                     if (aux->poscache) {
7546                         if ((SSize_t)reginfo->poscache_size < size) {
7547                             Renew(aux->poscache, size, char);
7548                             reginfo->poscache_size = size;
7549                         }
7550                         Zero(aux->poscache, size, char);
7551                     }
7552                     else {
7553                         reginfo->poscache_size = size;
7554                         Newxz(aux->poscache, size, char);
7555                     }
7556                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7557       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7558                               PL_colors[4], PL_colors[5])
7559                     );
7560                 }
7561
7562                 if (reginfo->poscache_iter < 0) {
7563                     /* have we already failed at this position? */
7564                     SSize_t offset, mask;
7565
7566                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7567                     offset  = (scan->flags & 0xf) - 1
7568                                 +   (locinput - reginfo->strbeg)
7569                                   * (scan->flags>>4);
7570                     mask    = 1 << (offset % 8);
7571                     offset /= 8;
7572                     if (reginfo->info_aux->poscache[offset] & mask) {
7573                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
7574                             depth)
7575                         );
7576                         cur_curlyx->u.curlyx.count--;
7577                         sayNO; /* cache records failure */
7578                     }
7579                     ST.cache_offset = offset;
7580                     ST.cache_mask   = mask;
7581                 }
7582             }
7583
7584             /* Prefer B over A for minimal matching. */
7585
7586             if (cur_curlyx->u.curlyx.minmod) {
7587                 ST.save_curlyx = cur_curlyx;
7588                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7589                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7590                                     locinput);
7591                 NOT_REACHED; /* NOTREACHED */
7592             }
7593
7594             /* Prefer A over B for maximal matching. */
7595
7596             if (n < max) { /* More greed allowed? */
7597                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7598                             maxopenparen);
7599                 cur_curlyx->u.curlyx.lastloc = locinput;
7600                 REGCP_SET(ST.lastcp);
7601                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7602                 NOT_REACHED; /* NOTREACHED */
7603             }
7604             goto do_whilem_B_max;
7605         }
7606         NOT_REACHED; /* NOTREACHED */
7607
7608         case WHILEM_B_min: /* just matched B in a minimal match */
7609         case WHILEM_B_max: /* just matched B in a maximal match */
7610             cur_curlyx = ST.save_curlyx;
7611             sayYES;
7612             NOT_REACHED; /* NOTREACHED */
7613
7614         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7615             cur_curlyx = ST.save_curlyx;
7616             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7617             cur_curlyx->u.curlyx.count--;
7618             CACHEsayNO;
7619             NOT_REACHED; /* NOTREACHED */
7620
7621         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7622             REGCP_UNWIND(ST.lastcp);
7623             regcppop(rex, &maxopenparen);
7624             /* FALLTHROUGH */
7625         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7626             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7627             cur_curlyx->u.curlyx.count--;
7628             CACHEsayNO;
7629             NOT_REACHED; /* NOTREACHED */
7630
7631         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7632             REGCP_UNWIND(ST.lastcp);
7633             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7634             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
7635                 depth)
7636             );
7637           do_whilem_B_max:
7638             if (cur_curlyx->u.curlyx.count >= REG_INFTY
7639                 && ckWARN(WARN_REGEXP)
7640                 && !reginfo->warned)
7641             {
7642                 reginfo->warned = TRUE;
7643                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7644                      "Complex regular subexpression recursion limit (%d) "
7645                      "exceeded",
7646                      REG_INFTY - 1);
7647             }
7648
7649             /* now try B */
7650             ST.save_curlyx = cur_curlyx;
7651             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7652             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7653                                 locinput);
7654             NOT_REACHED; /* NOTREACHED */
7655
7656         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7657             cur_curlyx = ST.save_curlyx;
7658
7659             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7660                 /* Maximum greed exceeded */
7661                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7662                     && ckWARN(WARN_REGEXP)
7663                     && !reginfo->warned)
7664                 {
7665                     reginfo->warned     = TRUE;
7666                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7667                         "Complex regular subexpression recursion "
7668                         "limit (%d) exceeded",
7669                         REG_INFTY - 1);
7670                 }
7671                 cur_curlyx->u.curlyx.count--;
7672                 CACHEsayNO;
7673             }
7674
7675             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
7676             );
7677             /* Try grabbing another A and see if it helps. */
7678             cur_curlyx->u.curlyx.lastloc = locinput;
7679             PUSH_STATE_GOTO(WHILEM_A_min,
7680                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7681                 locinput);
7682             NOT_REACHED; /* NOTREACHED */
7683
7684 #undef  ST
7685 #define ST st->u.branch
7686
7687         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
7688             next = scan + ARG(scan);
7689             if (next == scan)
7690                 next = NULL;
7691             scan = NEXTOPER(scan);
7692             /* FALLTHROUGH */
7693
7694         case BRANCH:        /*  /(...|A|...)/ */
7695             scan = NEXTOPER(scan); /* scan now points to inner node */
7696             ST.lastparen = rex->lastparen;
7697             ST.lastcloseparen = rex->lastcloseparen;
7698             ST.next_branch = next;
7699             REGCP_SET(ST.cp);
7700
7701             /* Now go into the branch */
7702             if (has_cutgroup) {
7703                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7704             } else {
7705                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7706             }
7707             NOT_REACHED; /* NOTREACHED */
7708
7709         case CUTGROUP:  /*  /(*THEN)/  */
7710             sv_yes_mark = st->u.mark.mark_name = scan->flags
7711                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7712                 : NULL;
7713             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7714             NOT_REACHED; /* NOTREACHED */
7715
7716         case CUTGROUP_next_fail:
7717             do_cutgroup = 1;
7718             no_final = 1;
7719             if (st->u.mark.mark_name)
7720                 sv_commit = st->u.mark.mark_name;
7721             sayNO;
7722             NOT_REACHED; /* NOTREACHED */
7723
7724         case BRANCH_next:
7725             sayYES;
7726             NOT_REACHED; /* NOTREACHED */
7727
7728         case BRANCH_next_fail: /* that branch failed; try the next, if any */
7729             if (do_cutgroup) {
7730                 do_cutgroup = 0;
7731                 no_final = 0;
7732             }
7733             REGCP_UNWIND(ST.cp);
7734             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7735             scan = ST.next_branch;
7736             /* no more branches? */
7737             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7738                 DEBUG_EXECUTE_r({
7739                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
7740                         depth,
7741                         PL_colors[4],
7742                         PL_colors[5] );
7743                 });
7744                 sayNO_SILENT;
7745             }
7746             continue; /* execute next BRANCH[J] op */
7747             /* NOTREACHED */
7748
7749         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7750             minmod = 1;
7751             break;
7752
7753 #undef  ST
7754 #define ST st->u.curlym
7755
7756         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7757
7758             /* This is an optimisation of CURLYX that enables us to push
7759              * only a single backtracking state, no matter how many matches
7760              * there are in {m,n}. It relies on the pattern being constant
7761              * length, with no parens to influence future backrefs
7762              */
7763
7764             ST.me = scan;
7765             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7766
7767             ST.lastparen      = rex->lastparen;
7768             ST.lastcloseparen = rex->lastcloseparen;
7769
7770             /* if paren positive, emulate an OPEN/CLOSE around A */
7771             if (ST.me->flags) {
7772                 U32 paren = ST.me->flags;
7773                 if (paren > maxopenparen)
7774                     maxopenparen = paren;
7775                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7776             }
7777             ST.A = scan;
7778             ST.B = next;
7779             ST.alen = 0;
7780             ST.count = 0;
7781             ST.minmod = minmod;
7782             minmod = 0;
7783             ST.c1 = CHRTEST_UNINIT;
7784             REGCP_SET(ST.cp);
7785
7786             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7787                 goto curlym_do_B;
7788
7789           curlym_do_A: /* execute the A in /A{m,n}B/  */
7790             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7791             NOT_REACHED; /* NOTREACHED */
7792
7793         case CURLYM_A: /* we've just matched an A */
7794             ST.count++;
7795             /* after first match, determine A's length: u.curlym.alen */
7796             if (ST.count == 1) {
7797                 if (reginfo->is_utf8_target) {
7798                     char *s = st->locinput;
7799                     while (s < locinput) {
7800                         ST.alen++;
7801                         s += UTF8SKIP(s);
7802                     }
7803                 }
7804                 else {
7805                     ST.alen = locinput - st->locinput;
7806                 }
7807                 if (ST.alen == 0)
7808                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7809             }
7810             DEBUG_EXECUTE_r(
7811                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
7812                           depth, (IV) ST.count, (IV)ST.alen)
7813             );
7814
7815             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7816                 goto fake_end;
7817
7818             {
7819                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7820                 if ( max == REG_INFTY || ST.count < max )
7821                     goto curlym_do_A; /* try to match another A */
7822             }
7823             goto curlym_do_B; /* try to match B */
7824
7825         case CURLYM_A_fail: /* just failed to match an A */
7826             REGCP_UNWIND(ST.cp);
7827
7828
7829             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7830                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7831                 sayNO;
7832
7833           curlym_do_B: /* execute the B in /A{m,n}B/  */
7834             if (ST.c1 == CHRTEST_UNINIT) {
7835                 /* calculate c1 and c2 for possible match of 1st char
7836                  * following curly */
7837                 ST.c1 = ST.c2 = CHRTEST_VOID;
7838                 assert(ST.B);
7839                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7840                     regnode *text_node = ST.B;
7841                     if (! HAS_TEXT(text_node))
7842                         FIND_NEXT_IMPT(text_node);
7843                     /* this used to be
7844
7845                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7846
7847                         But the former is redundant in light of the latter.
7848
7849                         if this changes back then the macro for
7850                         IS_TEXT and friends need to change.
7851                      */
7852                     if (PL_regkind[OP(text_node)] == EXACT) {
7853                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7854                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7855                            reginfo))
7856                         {
7857                             sayNO;
7858                         }
7859                     }
7860                 }
7861             }
7862
7863             DEBUG_EXECUTE_r(
7864                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
7865                     depth, (IV)ST.count)
7866                 );
7867             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7868                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7869                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7870                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7871                     {
7872                         /* simulate B failing */
7873                         DEBUG_OPTIMISE_r(
7874                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
7875                                 depth,
7876                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7877                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7878                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7879                         );
7880                         state_num = CURLYM_B_fail;
7881                         goto reenter_switch;
7882                     }
7883                 }
7884                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7885                     /* simulate B failing */
7886                     DEBUG_OPTIMISE_r(
7887                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7888                             depth,
7889                             (int) nextchr, ST.c1, ST.c2)
7890                     );
7891                     state_num = CURLYM_B_fail;
7892                     goto reenter_switch;
7893                 }
7894             }
7895
7896             if (ST.me->flags) {
7897                 /* emulate CLOSE: mark current A as captured */
7898                 I32 paren = ST.me->flags;
7899                 if (ST.count) {
7900                     rex->offs[paren].start
7901                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7902                     rex->offs[paren].end = locinput - reginfo->strbeg;
7903                     if ((U32)paren > rex->lastparen)
7904                         rex->lastparen = paren;
7905                     rex->lastcloseparen = paren;
7906                 }
7907                 else
7908                     rex->offs[paren].end = -1;
7909
7910                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7911                 {
7912                     if (ST.count)
7913                         goto fake_end;
7914                     else
7915                         sayNO;
7916                 }
7917             }
7918
7919             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7920             NOT_REACHED; /* NOTREACHED */
7921
7922         case CURLYM_B_fail: /* just failed to match a B */
7923             REGCP_UNWIND(ST.cp);
7924             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7925             if (ST.minmod) {
7926                 I32 max = ARG2(ST.me);
7927                 if (max != REG_INFTY && ST.count == max)
7928                     sayNO;
7929                 goto curlym_do_A; /* try to match a further A */
7930             }
7931             /* backtrack one A */
7932             if (ST.count == ARG1(ST.me) /* min */)
7933                 sayNO;
7934             ST.count--;
7935             SET_locinput(HOPc(locinput, -ST.alen));
7936             goto curlym_do_B; /* try to match B */
7937
7938 #undef ST
7939 #define ST st->u.curly
7940
7941 #define CURLY_SETPAREN(paren, success) \
7942     if (paren) { \
7943         if (success) { \
7944             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7945             rex->offs[paren].end = locinput - reginfo->strbeg; \
7946             if (paren > rex->lastparen) \
7947                 rex->lastparen = paren; \
7948             rex->lastcloseparen = paren; \
7949         } \
7950         else { \
7951             rex->offs[paren].end = -1; \
7952             rex->lastparen      = ST.lastparen; \
7953             rex->lastcloseparen = ST.lastcloseparen; \
7954         } \
7955     }
7956
7957         case STAR:              /*  /A*B/ where A is width 1 char */
7958             ST.paren = 0;
7959             ST.min = 0;
7960             ST.max = REG_INFTY;
7961             scan = NEXTOPER(scan);
7962             goto repeat;
7963
7964         case PLUS:              /*  /A+B/ where A is width 1 char */
7965             ST.paren = 0;
7966             ST.min = 1;
7967             ST.max = REG_INFTY;
7968             scan = NEXTOPER(scan);
7969             goto repeat;
7970
7971         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
7972             ST.paren = scan->flags;     /* Which paren to set */
7973             ST.lastparen      = rex->lastparen;
7974             ST.lastcloseparen = rex->lastcloseparen;
7975             if (ST.paren > maxopenparen)
7976                 maxopenparen = ST.paren;
7977             ST.min = ARG1(scan);  /* min to match */
7978             ST.max = ARG2(scan);  /* max to match */
7979             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
7980             {
7981                 ST.min=1;
7982                 ST.max=1;
7983             }
7984             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7985             goto repeat;
7986
7987         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
7988             ST.paren = 0;
7989             ST.min = ARG1(scan);  /* min to match */
7990             ST.max = ARG2(scan);  /* max to match */
7991             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7992           repeat:
7993             /*
7994             * Lookahead to avoid useless match attempts
7995             * when we know what character comes next.
7996             *
7997             * Used to only do .*x and .*?x, but now it allows
7998             * for )'s, ('s and (?{ ... })'s to be in the way
7999             * of the quantifier and the EXACT-like node.  -- japhy
8000             */
8001
8002             assert(ST.min <= ST.max);
8003             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8004                 ST.c1 = ST.c2 = CHRTEST_VOID;
8005             }
8006             else {
8007                 regnode *text_node = next;
8008
8009                 if (! HAS_TEXT(text_node))
8010                     FIND_NEXT_IMPT(text_node);
8011
8012                 if (! HAS_TEXT(text_node))
8013                     ST.c1 = ST.c2 = CHRTEST_VOID;
8014                 else {
8015                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8016                         ST.c1 = ST.c2 = CHRTEST_VOID;
8017                     }
8018                     else {
8019
8020                     /*  Currently we only get here when
8021
8022                         PL_rekind[OP(text_node)] == EXACT
8023
8024                         if this changes back then the macro for IS_TEXT and
8025                         friends need to change. */
8026                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8027                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8028                            reginfo))
8029                         {
8030                             sayNO;
8031                         }
8032                     }
8033                 }
8034             }
8035
8036             ST.A = scan;
8037             ST.B = next;
8038             if (minmod) {
8039                 char *li = locinput;
8040                 minmod = 0;
8041                 if (ST.min &&
8042                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8043                             < ST.min)
8044                     sayNO;
8045                 SET_locinput(li);
8046                 ST.count = ST.min;
8047                 REGCP_SET(ST.cp);
8048                 if (ST.c1 == CHRTEST_VOID)
8049                     goto curly_try_B_min;
8050
8051                 ST.oldloc = locinput;
8052
8053                 /* set ST.maxpos to the furthest point along the
8054                  * string that could possibly match */
8055                 if  (ST.max == REG_INFTY) {
8056                     ST.maxpos = reginfo->strend - 1;
8057                     if (utf8_target)
8058                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8059                             ST.maxpos--;
8060                 }
8061                 else if (utf8_target) {
8062                     int m = ST.max - ST.min;
8063                     for (ST.maxpos = locinput;
8064                          m >0 && ST.maxpos < reginfo->strend; m--)
8065                         ST.maxpos += UTF8SKIP(ST.maxpos);
8066                 }
8067                 else {
8068                     ST.maxpos = locinput + ST.max - ST.min;
8069                     if (ST.maxpos >= reginfo->strend)
8070                         ST.maxpos = reginfo->strend - 1;
8071                 }
8072                 goto curly_try_B_min_known;
8073
8074             }
8075             else {
8076                 /* avoid taking address of locinput, so it can remain
8077                  * a register var */
8078                 char *li = locinput;
8079                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8080                 if (ST.count < ST.min)
8081                     sayNO;
8082                 SET_locinput(li);
8083                 if ((ST.count > ST.min)
8084                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8085                 {
8086                     /* A{m,n} must come at the end of the string, there's
8087                      * no point in backing off ... */
8088                     ST.min = ST.count;
8089                     /* ...except that $ and \Z can match before *and* after
8090                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8091                        We may back off by one in this case. */
8092                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8093                         ST.min--;
8094                 }
8095                 REGCP_SET(ST.cp);
8096                 goto curly_try_B_max;
8097             }
8098             NOT_REACHED; /* NOTREACHED */
8099
8100         case CURLY_B_min_known_fail:
8101             /* failed to find B in a non-greedy match where c1,c2 valid */
8102
8103             REGCP_UNWIND(ST.cp);
8104             if (ST.paren) {
8105                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8106             }
8107             /* Couldn't or didn't -- move forward. */
8108             ST.oldloc = locinput;
8109             if (utf8_target)
8110                 locinput += UTF8SKIP(locinput);
8111             else
8112                 locinput++;
8113             ST.count++;
8114           curly_try_B_min_known:
8115              /* find the next place where 'B' could work, then call B */
8116             {
8117                 int n;
8118                 if (utf8_target) {
8119                     n = (ST.oldloc == locinput) ? 0 : 1;
8120                     if (ST.c1 == ST.c2) {
8121                         /* set n to utf8_distance(oldloc, locinput) */
8122                         while (locinput <= ST.maxpos
8123                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8124                         {
8125                             locinput += UTF8SKIP(locinput);
8126                             n++;
8127                         }
8128                     }
8129                     else {
8130                         /* set n to utf8_distance(oldloc, locinput) */
8131                         while (locinput <= ST.maxpos
8132                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8133                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8134                         {
8135                             locinput += UTF8SKIP(locinput);
8136                             n++;
8137                         }
8138                     }
8139                 }
8140                 else {  /* Not utf8_target */
8141                     if (ST.c1 == ST.c2) {
8142                         while (locinput <= ST.maxpos &&
8143                                UCHARAT(locinput) != ST.c1)
8144                             locinput++;
8145                     }
8146                     else {
8147                         while (locinput <= ST.maxpos
8148                                && UCHARAT(locinput) != ST.c1
8149                                && UCHARAT(locinput) != ST.c2)
8150                             locinput++;
8151                     }
8152                     n = locinput - ST.oldloc;
8153                 }
8154                 if (locinput > ST.maxpos)
8155                     sayNO;
8156                 if (n) {
8157                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8158                      * at b; check that everything between oldloc and
8159                      * locinput matches */
8160                     char *li = ST.oldloc;
8161                     ST.count += n;
8162                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8163                         sayNO;
8164                     assert(n == REG_INFTY || locinput == li);
8165                 }
8166                 CURLY_SETPAREN(ST.paren, ST.count);
8167                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8168                     goto fake_end;
8169                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8170             }
8171             NOT_REACHED; /* NOTREACHED */
8172
8173         case CURLY_B_min_fail:
8174             /* failed to find B in a non-greedy match where c1,c2 invalid */
8175
8176             REGCP_UNWIND(ST.cp);
8177             if (ST.paren) {
8178                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8179             }
8180             /* failed -- move forward one */
8181             {
8182                 char *li = locinput;
8183                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8184                     sayNO;
8185                 }
8186                 locinput = li;
8187             }
8188             {
8189                 ST.count++;
8190                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8191                         ST.count > 0)) /* count overflow ? */
8192                 {
8193                   curly_try_B_min:
8194                     CURLY_SETPAREN(ST.paren, ST.count);
8195                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8196                         goto fake_end;
8197                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8198                 }
8199             }
8200             sayNO;
8201             NOT_REACHED; /* NOTREACHED */
8202
8203           curly_try_B_max:
8204             /* a successful greedy match: now try to match B */
8205             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8206                 goto fake_end;
8207             {
8208                 bool could_match = locinput < reginfo->strend;
8209
8210                 /* If it could work, try it. */
8211                 if (ST.c1 != CHRTEST_VOID && could_match) {
8212                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8213                     {
8214                         could_match = memEQ(locinput,
8215                                             ST.c1_utf8,
8216                                             UTF8SKIP(locinput))
8217                                     || memEQ(locinput,
8218                                              ST.c2_utf8,
8219                                              UTF8SKIP(locinput));
8220                     }
8221                     else {
8222                         could_match = UCHARAT(locinput) == ST.c1
8223                                       || UCHARAT(locinput) == ST.c2;
8224                     }
8225                 }
8226                 if (ST.c1 == CHRTEST_VOID || could_match) {
8227                     CURLY_SETPAREN(ST.paren, ST.count);
8228                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8229                     NOT_REACHED; /* NOTREACHED */
8230                 }
8231             }
8232             /* FALLTHROUGH */
8233
8234         case CURLY_B_max_fail:
8235             /* failed to find B in a greedy match */
8236
8237             REGCP_UNWIND(ST.cp);
8238             if (ST.paren) {
8239                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8240             }
8241             /*  back up. */
8242             if (--ST.count < ST.min)
8243                 sayNO;
8244             locinput = HOPc(locinput, -1);
8245             goto curly_try_B_max;
8246
8247 #undef ST
8248
8249         case END: /*  last op of main pattern  */
8250           fake_end:
8251             if (cur_eval) {
8252                 /* we've just finished A in /(??{A})B/; now continue with B */
8253                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8254                 st->u.eval.prev_rex = rex_sv;           /* inner */
8255
8256                 /* Save *all* the positions. */
8257                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8258                 rex_sv = CUR_EVAL.prev_rex;
8259                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8260                 SET_reg_curpm(rex_sv);
8261                 rex = ReANY(rex_sv);
8262                 rexi = RXi_GET(rex);
8263
8264                 st->u.eval.prev_curlyx = cur_curlyx;
8265                 cur_curlyx = CUR_EVAL.prev_curlyx;
8266
8267                 REGCP_SET(st->u.eval.lastcp);
8268
8269                 /* Restore parens of the outer rex without popping the
8270                  * savestack */
8271                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8272
8273                 st->u.eval.prev_eval = cur_eval;
8274                 cur_eval = CUR_EVAL.prev_eval;
8275                 DEBUG_EXECUTE_r(
8276                     Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
8277                                       depth, cur_eval););
8278                 if ( nochange_depth )
8279                     nochange_depth--;
8280
8281                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8282
8283                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8284                                     locinput); /* match B */
8285             }
8286
8287             if (locinput < reginfo->till) {
8288                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8289                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8290                                       PL_colors[4],
8291                                       (long)(locinput - startpos),
8292                                       (long)(reginfo->till - startpos),
8293                                       PL_colors[5]));
8294
8295                 sayNO_SILENT;           /* Cannot match: too short. */
8296             }
8297             sayYES;                     /* Success! */
8298
8299         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8300             DEBUG_EXECUTE_r(
8301             Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
8302                 depth, PL_colors[4], PL_colors[5]));
8303             sayYES;                     /* Success! */
8304
8305 #undef  ST
8306 #define ST st->u.ifmatch
8307
8308         {
8309             char *newstart;
8310
8311         case SUSPEND:   /* (?>A) */
8312             ST.wanted = 1;
8313             newstart = locinput;
8314             goto do_ifmatch;
8315
8316         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8317             ST.wanted = 0;
8318             goto ifmatch_trivial_fail_test;
8319
8320         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8321             ST.wanted = 1;
8322           ifmatch_trivial_fail_test:
8323             if (scan->flags) {
8324                 char * const s = HOPBACKc(locinput, scan->flags);
8325                 if (!s) {
8326                     /* trivial fail */
8327                     if (logical) {
8328                         logical = 0;
8329                         sw = 1 - cBOOL(ST.wanted);
8330                     }
8331                     else if (ST.wanted)
8332                         sayNO;
8333                     next = scan + ARG(scan);
8334                     if (next == scan)
8335                         next = NULL;
8336                     break;
8337                 }
8338                 newstart = s;
8339             }
8340             else
8341                 newstart = locinput;
8342
8343           do_ifmatch:
8344             ST.me = scan;
8345             ST.logical = logical;
8346             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8347
8348             /* execute body of (?...A) */
8349             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8350             NOT_REACHED; /* NOTREACHED */
8351         }
8352
8353         case IFMATCH_A_fail: /* body of (?...A) failed */
8354             ST.wanted = !ST.wanted;
8355             /* FALLTHROUGH */
8356
8357         case IFMATCH_A: /* body of (?...A) succeeded */
8358             if (ST.logical) {
8359                 sw = cBOOL(ST.wanted);
8360             }
8361             else if (!ST.wanted)
8362                 sayNO;
8363
8364             if (OP(ST.me) != SUSPEND) {
8365                 /* restore old position except for (?>...) */
8366                 locinput = st->locinput;
8367             }
8368             scan = ST.me + ARG(ST.me);
8369             if (scan == ST.me)
8370                 scan = NULL;
8371             continue; /* execute B */
8372
8373 #undef ST
8374
8375         case LONGJMP: /*  alternative with many branches compiles to
8376                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8377             next = scan + ARG(scan);
8378             if (next == scan)
8379                 next = NULL;
8380             break;
8381
8382         case COMMIT:  /*  (*COMMIT)  */
8383             reginfo->cutpoint = reginfo->strend;
8384             /* FALLTHROUGH */
8385
8386         case PRUNE:   /*  (*PRUNE)   */
8387             if (scan->flags)
8388                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8389             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8390             NOT_REACHED; /* NOTREACHED */
8391
8392         case COMMIT_next_fail:
8393             no_final = 1;
8394             /* FALLTHROUGH */
8395             sayNO;
8396             NOT_REACHED; /* NOTREACHED */
8397
8398         case OPFAIL:   /* (*FAIL)  */
8399             if (scan->flags)
8400                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8401             if (logical) {
8402                 /* deal with (?(?!)X|Y) properly,
8403                  * make sure we trigger the no branch
8404                  * of the trailing IFTHEN structure*/
8405                 sw= 0;
8406                 break;
8407             } else {
8408                 sayNO;
8409             }
8410             NOT_REACHED; /* NOTREACHED */
8411
8412 #define ST st->u.mark
8413         case MARKPOINT: /*  (*MARK:foo)  */
8414             ST.prev_mark = mark_state;
8415             ST.mark_name = sv_commit = sv_yes_mark
8416                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8417             mark_state = st;
8418             ST.mark_loc = locinput;
8419             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8420             NOT_REACHED; /* NOTREACHED */
8421
8422         case MARKPOINT_next:
8423             mark_state = ST.prev_mark;
8424             sayYES;
8425             NOT_REACHED; /* NOTREACHED */
8426
8427         case MARKPOINT_next_fail:
8428             if (popmark && sv_eq(ST.mark_name,popmark))
8429             {
8430                 if (ST.mark_loc > startpoint)
8431                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8432                 popmark = NULL; /* we found our mark */
8433                 sv_commit = ST.mark_name;
8434
8435                 DEBUG_EXECUTE_r({
8436                         Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%" SVf "...%s\n",
8437                             depth,
8438                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8439                 });
8440             }
8441             mark_state = ST.prev_mark;
8442             sv_yes_mark = mark_state ?
8443                 mark_state->u.mark.mark_name : NULL;
8444             sayNO;
8445             NOT_REACHED; /* NOTREACHED */
8446
8447         case SKIP:  /*  (*SKIP)  */
8448             if (!scan->flags) {
8449                 /* (*SKIP) : if we fail we cut here*/
8450                 ST.mark_name = NULL;
8451                 ST.mark_loc = locinput;
8452                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8453             } else {
8454                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8455                    otherwise do nothing.  Meaning we need to scan
8456                  */
8457                 regmatch_state *cur = mark_state;
8458                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8459
8460                 while (cur) {
8461                     if ( sv_eq( cur->u.mark.mark_name,
8462                                 find ) )
8463                     {
8464                         ST.mark_name = find;
8465                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8466                     }
8467                     cur = cur->u.mark.prev_mark;
8468                 }
8469             }
8470             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8471             break;
8472
8473         case SKIP_next_fail:
8474             if (ST.mark_name) {
8475                 /* (*CUT:NAME) - Set up to search for the name as we
8476                    collapse the stack*/
8477                 popmark = ST.mark_name;
8478             } else {
8479                 /* (*CUT) - No name, we cut here.*/
8480                 if (ST.mark_loc > startpoint)
8481                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8482                 /* but we set sv_commit to latest mark_name if there
8483                    is one so they can test to see how things lead to this
8484                    cut */
8485                 if (mark_state)
8486                     sv_commit=mark_state->u.mark.mark_name;
8487             }
8488             no_final = 1;
8489             sayNO;
8490             NOT_REACHED; /* NOTREACHED */
8491 #undef ST
8492
8493         case LNBREAK: /* \R */
8494             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8495                 locinput += n;
8496             } else
8497                 sayNO;
8498             break;
8499
8500         default:
8501             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8502                           PTR2UV(scan), OP(scan));
8503             Perl_croak(aTHX_ "regexp memory corruption");
8504
8505         /* this is a point to jump to in order to increment
8506          * locinput by one character */
8507           increment_locinput:
8508             assert(!NEXTCHR_IS_EOS);
8509             if (utf8_target) {
8510                 locinput += PL_utf8skip[nextchr];
8511                 /* locinput is allowed to go 1 char off the end (signifying
8512                  * EOS), but not 2+ */
8513                 if (locinput > reginfo->strend)
8514                     sayNO;
8515             }
8516             else
8517                 locinput++;
8518             break;
8519
8520         } /* end switch */
8521
8522         /* switch break jumps here */
8523         scan = next; /* prepare to execute the next op and ... */
8524         continue;    /* ... jump back to the top, reusing st */
8525         /* NOTREACHED */
8526
8527       push_yes_state:
8528         /* push a state that backtracks on success */
8529         st->u.yes.prev_yes_state = yes_state;
8530         yes_state = st;
8531         /* FALLTHROUGH */
8532       push_state:
8533         /* push a new regex state, then continue at scan  */
8534         {
8535             regmatch_state *newst;
8536
8537             DEBUG_STACK_r({
8538                 regmatch_state *cur = st;
8539                 regmatch_state *curyes = yes_state;
8540                 U32 i;
8541                 regmatch_slab *slab = PL_regmatch_slab;
8542                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8543                     if (cur < SLAB_FIRST(slab)) {
8544                         slab = slab->prev;
8545                         cur = SLAB_LAST(slab);
8546                     }
8547                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8548                         depth,
8549                         i ? "    " : "push",
8550                         depth - i, PL_reg_name[cur->resume_state],
8551                         (curyes == cur) ? "yes" : ""
8552                     );
8553                     if (curyes == cur)
8554                         curyes = cur->u.yes.prev_yes_state;
8555                 }
8556             } else
8557                 DEBUG_STATE_pp("push")
8558             );
8559             depth++;
8560             st->locinput = locinput;
8561             newst = st+1;
8562             if (newst >  SLAB_LAST(PL_regmatch_slab))
8563                 newst = S_push_slab(aTHX);
8564             PL_regmatch_state = newst;
8565
8566             locinput = pushinput;
8567             st = newst;
8568             continue;
8569             /* NOTREACHED */
8570         }
8571     }
8572 #ifdef SOLARIS_BAD_OPTIMIZER
8573 #  undef PL_charclass
8574 #endif
8575
8576     /*
8577     * We get here only if there's trouble -- normally "case END" is
8578     * the terminating point.
8579     */
8580     Perl_croak(aTHX_ "corrupted regexp pointers");
8581     NOT_REACHED; /* NOTREACHED */
8582
8583   yes:
8584     if (yes_state) {
8585         /* we have successfully completed a subexpression, but we must now
8586          * pop to the state marked by yes_state and continue from there */
8587         assert(st != yes_state);
8588 #ifdef DEBUGGING
8589         while (st != yes_state) {
8590             st--;
8591             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8592                 PL_regmatch_slab = PL_regmatch_slab->prev;
8593                 st = SLAB_LAST(PL_regmatch_slab);
8594             }
8595             DEBUG_STATE_r({
8596                 if (no_final) {
8597                     DEBUG_STATE_pp("pop (no final)");
8598                 } else {
8599                     DEBUG_STATE_pp("pop (yes)");
8600                 }
8601             });
8602             depth--;
8603         }
8604 #else
8605         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8606             || yes_state > SLAB_LAST(PL_regmatch_slab))
8607         {
8608             /* not in this slab, pop slab */
8609             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8610             PL_regmatch_slab = PL_regmatch_slab->prev;
8611             st = SLAB_LAST(PL_regmatch_slab);
8612         }
8613         depth -= (st - yes_state);
8614 #endif
8615         st = yes_state;
8616         yes_state = st->u.yes.prev_yes_state;
8617         PL_regmatch_state = st;
8618
8619         if (no_final)
8620             locinput= st->locinput;
8621         state_num = st->resume_state + no_final;
8622         goto reenter_switch;
8623     }
8624
8625     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
8626                           PL_colors[4], PL_colors[5]));
8627
8628     if (reginfo->info_aux_eval) {
8629         /* each successfully executed (?{...}) block does the equivalent of
8630          *   local $^R = do {...}
8631          * When popping the save stack, all these locals would be undone;
8632          * bypass this by setting the outermost saved $^R to the latest
8633          * value */
8634         /* I dont know if this is needed or works properly now.
8635          * see code related to PL_replgv elsewhere in this file.
8636          * Yves
8637          */
8638         if (oreplsv != GvSV(PL_replgv))
8639             sv_setsv(oreplsv, GvSV(PL_replgv));
8640     }
8641     result = 1;
8642     goto final_exit;
8643
8644   no:
8645     DEBUG_EXECUTE_r(
8646         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
8647             depth,
8648             PL_colors[4], PL_colors[5])
8649         );
8650
8651   no_silent:
8652     if (no_final) {
8653         if (yes_state) {
8654             goto yes;
8655         } else {
8656             goto final_exit;
8657         }
8658     }
8659     if (depth) {
8660         /* there's a previous state to backtrack to */
8661         st--;
8662         if (st < SLAB_FIRST(PL_regmatch_slab)) {
8663             PL_regmatch_slab = PL_regmatch_slab->prev;
8664             st = SLAB_LAST(PL_regmatch_slab);
8665         }
8666         PL_regmatch_state = st;
8667         locinput= st->locinput;
8668
8669         DEBUG_STATE_pp("pop");
8670         depth--;
8671         if (yes_state == st)
8672             yes_state = st->u.yes.prev_yes_state;
8673
8674         state_num = st->resume_state + 1; /* failure = success + 1 */
8675         PERL_ASYNC_CHECK();
8676         goto reenter_switch;
8677     }
8678     result = 0;
8679
8680   final_exit:
8681     if (rex->intflags & PREGf_VERBARG_SEEN) {
8682         SV *sv_err = get_sv("REGERROR", 1);
8683         SV *sv_mrk = get_sv("REGMARK", 1);
8684         if (result) {
8685             sv_commit = &PL_sv_no;
8686             if (!sv_yes_mark)
8687                 sv_yes_mark = &PL_sv_yes;
8688         } else {
8689             if (!sv_commit)
8690                 sv_commit = &PL_sv_yes;
8691             sv_yes_mark = &PL_sv_no;
8692         }
8693         assert(sv_err);
8694         assert(sv_mrk);
8695         sv_setsv(sv_err, sv_commit);
8696         sv_setsv(sv_mrk, sv_yes_mark);
8697     }
8698
8699
8700     if (last_pushed_cv) {
8701         dSP;
8702         /* see "Some notes about MULTICALL" above */
8703         POP_MULTICALL;
8704         PERL_UNUSED_VAR(SP);
8705     }
8706     else
8707         LEAVE_SCOPE(orig_savestack_ix);
8708
8709     assert(!result ||  locinput - reginfo->strbeg >= 0);
8710     return result ?  locinput - reginfo->strbeg : -1;
8711 }
8712
8713 /*
8714  - regrepeat - repeatedly match something simple, report how many
8715  *
8716  * What 'simple' means is a node which can be the operand of a quantifier like
8717  * '+', or {1,3}
8718  *
8719  * startposp - pointer a pointer to the start position.  This is updated
8720  *             to point to the byte following the highest successful
8721  *             match.
8722  * p         - the regnode to be repeatedly matched against.
8723  * reginfo   - struct holding match state, such as strend
8724  * max       - maximum number of things to match.
8725  * depth     - (for debugging) backtracking depth.
8726  */
8727 STATIC I32
8728 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8729             regmatch_info *const reginfo, I32 max _pDEPTH)
8730 {
8731     char *scan;     /* Pointer to current position in target string */
8732     I32 c;
8733     char *loceol = reginfo->strend;   /* local version */
8734     I32 hardcount = 0;  /* How many matches so far */
8735     bool utf8_target = reginfo->is_utf8_target;
8736     unsigned int to_complement = 0;  /* Invert the result? */
8737     UV utf8_flags;
8738     _char_class_number classnum;
8739
8740     PERL_ARGS_ASSERT_REGREPEAT;
8741
8742     scan = *startposp;
8743     if (max == REG_INFTY)
8744         max = I32_MAX;
8745     else if (! utf8_target && loceol - scan > max)
8746         loceol = scan + max;
8747
8748     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8749      * to the maximum of how far we should go in it (leaving it set to the real
8750      * end, if the maximum permissible would take us beyond that).  This allows
8751      * us to make the loop exit condition that we haven't gone past <loceol> to
8752      * also mean that we haven't exceeded the max permissible count, saving a
8753      * test each time through the loop.  But it assumes that the OP matches a
8754      * single byte, which is true for most of the OPs below when applied to a
8755      * non-UTF-8 target.  Those relatively few OPs that don't have this
8756      * characteristic will have to compensate.
8757      *
8758      * There is no adjustment for UTF-8 targets, as the number of bytes per
8759      * character varies.  OPs will have to test both that the count is less
8760      * than the max permissible (using <hardcount> to keep track), and that we
8761      * are still within the bounds of the string (using <loceol>.  A few OPs
8762      * match a single byte no matter what the encoding.  They can omit the max
8763      * test if, for the UTF-8 case, they do the adjustment that was skipped
8764      * above.
8765      *
8766      * Thus, the code above sets things up for the common case; and exceptional
8767      * cases need extra work; the common case is to make sure <scan> doesn't
8768      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8769      * count doesn't exceed the maximum permissible */
8770
8771     switch (OP(p)) {
8772     case REG_ANY:
8773         if (utf8_target) {
8774             while (scan < loceol && hardcount < max && *scan != '\n') {
8775                 scan += UTF8SKIP(scan);
8776                 hardcount++;
8777             }
8778         } else {
8779             while (scan < loceol && *scan != '\n')
8780                 scan++;
8781         }
8782         break;
8783     case SANY:
8784         if (utf8_target) {
8785             while (scan < loceol && hardcount < max) {
8786                 scan += UTF8SKIP(scan);
8787                 hardcount++;
8788             }
8789         }
8790         else
8791             scan = loceol;
8792         break;
8793     case EXACTL:
8794         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8795         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8796             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8797         }
8798         /* FALLTHROUGH */
8799     case EXACT:
8800         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8801
8802         c = (U8)*STRING(p);
8803
8804         /* Can use a simple loop if the pattern char to match on is invariant
8805          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8806          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8807          * true iff it doesn't matter if the argument is in UTF-8 or not */
8808         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8809             if (utf8_target && loceol - scan > max) {
8810                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8811                  * since here, to match at all, 1 char == 1 byte */
8812                 loceol = scan + max;
8813             }
8814             while (scan < loceol && UCHARAT(scan) == c) {
8815                 scan++;
8816             }
8817         }
8818         else if (reginfo->is_utf8_pat) {
8819             if (utf8_target) {
8820                 STRLEN scan_char_len;
8821
8822                 /* When both target and pattern are UTF-8, we have to do
8823                  * string EQ */
8824                 while (hardcount < max
8825                        && scan < loceol
8826                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8827                        && memEQ(scan, STRING(p), scan_char_len))
8828                 {
8829                     scan += scan_char_len;
8830                     hardcount++;
8831                 }
8832             }
8833             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8834
8835                 /* Target isn't utf8; convert the character in the UTF-8
8836                  * pattern to non-UTF8, and do a simple loop */
8837                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8838                 while (scan < loceol && UCHARAT(scan) == c) {
8839                     scan++;
8840                 }
8841             } /* else pattern char is above Latin1, can't possibly match the
8842                  non-UTF-8 target */
8843         }
8844         else {
8845
8846             /* Here, the string must be utf8; pattern isn't, and <c> is
8847              * different in utf8 than not, so can't compare them directly.
8848              * Outside the loop, find the two utf8 bytes that represent c, and
8849              * then look for those in sequence in the utf8 string */
8850             U8 high = UTF8_TWO_BYTE_HI(c);
8851             U8 low = UTF8_TWO_BYTE_LO(c);
8852
8853             while (hardcount < max
8854                     && scan + 1 < loceol
8855                     && UCHARAT(scan) == high
8856                     && UCHARAT(scan + 1) == low)
8857             {
8858                 scan += 2;
8859                 hardcount++;
8860             }
8861         }
8862         break;
8863
8864     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8865         assert(! reginfo->is_utf8_pat);
8866         /* FALLTHROUGH */
8867     case EXACTFA:
8868         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8869         goto do_exactf;
8870
8871     case EXACTFL:
8872         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8873         utf8_flags = FOLDEQ_LOCALE;
8874         goto do_exactf;
8875
8876     case EXACTF:   /* This node only generated for non-utf8 patterns */
8877         assert(! reginfo->is_utf8_pat);
8878         utf8_flags = 0;
8879         goto do_exactf;
8880
8881     case EXACTFLU8:
8882         if (! utf8_target) {
8883             break;
8884         }
8885         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8886                                     | FOLDEQ_S2_FOLDS_SANE;
8887         goto do_exactf;
8888
8889     case EXACTFU_SS:
8890     case EXACTFU:
8891         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8892
8893       do_exactf: {
8894         int c1, c2;
8895         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8896
8897         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8898
8899         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8900                                         reginfo))
8901         {
8902             if (c1 == CHRTEST_VOID) {
8903                 /* Use full Unicode fold matching */
8904                 char *tmpeol = reginfo->strend;
8905                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8906                 while (hardcount < max
8907                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8908                                              STRING(p), NULL, pat_len,
8909                                              reginfo->is_utf8_pat, utf8_flags))
8910                 {
8911                     scan = tmpeol;
8912                     tmpeol = reginfo->strend;
8913                     hardcount++;
8914                 }
8915             }
8916             else if (utf8_target) {
8917                 if (c1 == c2) {
8918                     while (scan < loceol
8919                            && hardcount < max
8920                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8921                     {
8922                         scan += UTF8SKIP(scan);
8923                         hardcount++;
8924                     }
8925                 }
8926                 else {
8927                     while (scan < loceol
8928                            && hardcount < max
8929                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8930                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8931                     {
8932                         scan += UTF8SKIP(scan);
8933                         hardcount++;
8934                     }
8935                 }
8936             }
8937             else if (c1 == c2) {
8938                 while (scan < loceol && UCHARAT(scan) == c1) {
8939                     scan++;
8940                 }
8941             }
8942             else {
8943                 while (scan < loceol &&
8944                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8945                 {
8946                     scan++;
8947                 }
8948             }
8949         }
8950         break;
8951     }
8952     case ANYOFL:
8953         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8954
8955         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
8956             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8957         }
8958         /* FALLTHROUGH */
8959     case ANYOFD:
8960     case ANYOF:
8961         if (utf8_target) {
8962             while (hardcount < max
8963                    && scan < loceol
8964                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8965             {
8966                 scan += UTF8SKIP(scan);
8967                 hardcount++;
8968             }
8969         }
8970         else if (ANYOF_FLAGS(p)) {
8971             while (scan < loceol
8972                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
8973                 scan++;
8974         }
8975         else {
8976             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
8977                 scan++;
8978         }
8979         break;
8980
8981     /* The argument (FLAGS) to all the POSIX node types is the class number */
8982
8983     case NPOSIXL:
8984         to_complement = 1;
8985         /* FALLTHROUGH */
8986
8987     case POSIXL:
8988         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8989         if (! utf8_target) {
8990             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8991                                                                    *scan)))
8992             {
8993                 scan++;
8994             }
8995         } else {
8996             while (hardcount < max && scan < loceol
8997                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8998                                                                   (U8 *) scan)))
8999             {
9000                 scan += UTF8SKIP(scan);
9001                 hardcount++;
9002             }
9003         }
9004         break;
9005
9006     case POSIXD:
9007         if (utf8_target) {
9008             goto utf8_posix;
9009         }
9010         /* FALLTHROUGH */
9011
9012     case POSIXA:
9013         if (utf8_target && loceol - scan > max) {
9014
9015             /* We didn't adjust <loceol> at the beginning of this routine
9016              * because is UTF-8, but it is actually ok to do so, since here, to
9017              * match, 1 char == 1 byte. */
9018             loceol = scan + max;
9019         }
9020         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9021             scan++;
9022         }
9023         break;
9024
9025     case NPOSIXD:
9026         if (utf8_target) {
9027             to_complement = 1;
9028             goto utf8_posix;
9029         }
9030         /* FALLTHROUGH */
9031
9032     case NPOSIXA:
9033         if (! utf8_target) {
9034             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9035                 scan++;
9036             }
9037         }
9038         else {
9039
9040             /* The complement of something that matches only ASCII matches all
9041              * non-ASCII, plus everything in ASCII that isn't in the class. */
9042             while (hardcount < max && scan < loceol
9043                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9044                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9045             {
9046                 scan += UTF8SKIP(scan);
9047                 hardcount++;
9048             }
9049         }
9050         break;
9051
9052     case NPOSIXU:
9053         to_complement = 1;
9054         /* FALLTHROUGH */
9055
9056     case POSIXU:
9057         if (! utf8_target) {
9058             while (scan < loceol && to_complement
9059                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9060             {
9061                 scan++;
9062             }
9063         }
9064         else {
9065           utf8_posix:
9066             classnum = (_char_class_number) FLAGS(p);
9067             if (classnum < _FIRST_NON_SWASH_CC) {
9068
9069                 /* Here, a swash is needed for above-Latin1 code points.
9070                  * Process as many Latin1 code points using the built-in rules.
9071                  * Go to another loop to finish processing upon encountering
9072                  * the first Latin1 code point.  We could do that in this loop
9073                  * as well, but the other way saves having to test if the swash
9074                  * has been loaded every time through the loop: extra space to
9075                  * save a test. */
9076                 while (hardcount < max && scan < loceol) {
9077                     if (UTF8_IS_INVARIANT(*scan)) {
9078                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9079                                                                    classnum))))
9080                         {
9081                             break;
9082                         }
9083                         scan++;
9084                     }
9085                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9086                         if (! (to_complement
9087                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9088                                                                      *(scan + 1)),
9089                                                     classnum))))
9090                         {
9091                             break;
9092                         }
9093                         scan += 2;
9094                     }
9095                     else {
9096                         goto found_above_latin1;
9097                     }
9098
9099                     hardcount++;
9100                 }
9101             }
9102             else {
9103                 /* For these character classes, the knowledge of how to handle
9104                  * every code point is compiled in to Perl via a macro.  This
9105                  * code is written for making the loops as tight as possible.
9106                  * It could be refactored to save space instead */
9107                 switch (classnum) {
9108                     case _CC_ENUM_SPACE:
9109                         while (hardcount < max
9110                                && scan < loceol
9111                                && (to_complement
9112                                    ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9113                         {
9114                             scan += UTF8SKIP(scan);
9115                             hardcount++;
9116                         }
9117                         break;
9118                     case _CC_ENUM_BLANK:
9119                         while (hardcount < max
9120                                && scan < loceol
9121                                && (to_complement
9122                                     ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9123                         {
9124                             scan += UTF8SKIP(scan);
9125                             hardcount++;
9126                         }
9127                         break;
9128                     case _CC_ENUM_XDIGIT:
9129                         while (hardcount < max
9130                                && scan < loceol
9131                                && (to_complement
9132                                    ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9133                         {
9134                             scan += UTF8SKIP(scan);
9135                             hardcount++;
9136                         }
9137                         break;
9138                     case _CC_ENUM_VERTSPACE:
9139                         while (hardcount < max
9140                                && scan < loceol
9141                                && (to_complement
9142                                    ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9143                         {
9144                             scan += UTF8SKIP(scan);
9145                             hardcount++;
9146                         }
9147                         break;
9148                     case _CC_ENUM_CNTRL:
9149                         while (hardcount < max
9150                                && scan < loceol
9151                                && (to_complement
9152                                    ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9153                         {
9154                             scan += UTF8SKIP(scan);
9155                             hardcount++;
9156                         }
9157                         break;
9158                     default:
9159                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9160                 }
9161             }
9162         }
9163         break;
9164
9165       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
9166
9167         /* Load the swash if not already present */
9168         if (! PL_utf8_swash_ptrs[classnum]) {
9169             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9170             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9171                                         "utf8",
9172                                         "",
9173                                         &PL_sv_undef, 1, 0,
9174                                         PL_XPosix_ptrs[classnum], &flags);
9175         }
9176
9177         while (hardcount < max && scan < loceol
9178                && to_complement ^ cBOOL(_generic_utf8_safe(
9179                                        classnum,
9180                                        scan,
9181                                        loceol,
9182                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
9183                                                    (U8 *) scan,
9184                                                    TRUE))))
9185         {
9186             scan += UTF8SKIP(scan);
9187             hardcount++;
9188         }
9189         break;
9190
9191     case LNBREAK:
9192         if (utf8_target) {
9193             while (hardcount < max && scan < loceol &&
9194                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9195                 scan += c;
9196                 hardcount++;
9197             }
9198         } else {
9199             /* LNBREAK can match one or two latin chars, which is ok, but we
9200              * have to use hardcount in this situation, and throw away the
9201              * adjustment to <loceol> done before the switch statement */
9202             loceol = reginfo->strend;
9203             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9204                 scan+=c;
9205                 hardcount++;
9206             }
9207         }
9208         break;
9209
9210     case BOUNDL:
9211     case NBOUNDL:
9212         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9213         /* FALLTHROUGH */
9214     case BOUND:
9215     case BOUNDA:
9216     case BOUNDU:
9217     case EOS:
9218     case GPOS:
9219     case KEEPS:
9220     case NBOUND:
9221     case NBOUNDA:
9222     case NBOUNDU:
9223     case OPFAIL:
9224     case SBOL:
9225     case SEOL:
9226         /* These are all 0 width, so match right here or not at all. */
9227         break;
9228
9229     default:
9230         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9231         NOT_REACHED; /* NOTREACHED */
9232
9233     }
9234
9235     if (hardcount)
9236         c = hardcount;
9237     else
9238         c = scan - *startposp;
9239     *startposp = scan;
9240
9241     DEBUG_r({
9242         GET_RE_DEBUG_FLAGS_DECL;
9243         DEBUG_EXECUTE_r({
9244             SV * const prop = sv_newmortal();
9245             regprop(prog, prop, p, reginfo, NULL);
9246             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9247                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9248         });
9249     });
9250
9251     return(c);
9252 }
9253
9254
9255 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9256 /*
9257 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9258 create a copy so that changes the caller makes won't change the shared one.
9259 If <altsvp> is non-null, will return NULL in it, for back-compat.
9260  */
9261 SV *
9262 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9263 {
9264     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9265
9266     if (altsvp) {
9267         *altsvp = NULL;
9268     }
9269
9270     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9271 }
9272
9273 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9274
9275 /*
9276  - reginclass - determine if a character falls into a character class
9277
9278   n is the ANYOF-type regnode
9279   p is the target string
9280   p_end points to one byte beyond the end of the target string
9281   utf8_target tells whether p is in UTF-8.
9282
9283   Returns true if matched; false otherwise.
9284
9285   Note that this can be a synthetic start class, a combination of various
9286   nodes, so things you think might be mutually exclusive, such as locale,
9287   aren't.  It can match both locale and non-locale
9288
9289  */
9290
9291 STATIC bool
9292 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9293 {
9294     dVAR;
9295     const char flags = ANYOF_FLAGS(n);
9296     bool match = FALSE;
9297     UV c = *p;
9298
9299     PERL_ARGS_ASSERT_REGINCLASS;
9300
9301     /* If c is not already the code point, get it.  Note that
9302      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9303     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9304         STRLEN c_len = 0;
9305         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9306         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9307         if (c_len == (STRLEN)-1) {
9308             _force_out_malformed_utf8_message(p, p_end,
9309                                               utf8n_flags,
9310                                               1 /* 1 means die */ );
9311             NOT_REACHED; /* NOTREACHED */
9312         }
9313         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9314             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9315         }
9316     }
9317
9318     /* If this character is potentially in the bitmap, check it */
9319     if (c < NUM_ANYOF_CODE_POINTS) {
9320         if (ANYOF_BITMAP_TEST(n, c))
9321             match = TRUE;
9322         else if ((flags
9323                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9324                   && OP(n) == ANYOFD
9325                   && ! utf8_target
9326                   && ! isASCII(c))
9327         {
9328             match = TRUE;
9329         }
9330         else if (flags & ANYOF_LOCALE_FLAGS) {
9331             if ((flags & ANYOFL_FOLD)
9332                 && c < 256
9333                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9334             {
9335                 match = TRUE;
9336             }
9337             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9338                      && c < 256
9339             ) {
9340
9341                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9342                  * if the class includes the Posix character class given by
9343                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9344                  * complemented Posix class given by int(bit/2).  So we loop
9345                  * through the bits, each time changing whether we complement
9346                  * the result or not.  Suppose for the sake of illustration
9347                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9348                  * is set, it means there is a match for this ANYOF node if the
9349                  * character is in the class given by the expression (0 / 2 = 0
9350                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9351                  * and since 'to_complement' is 0, the result will stay TRUE,
9352                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9353                  * bit 1 is 1.  That means there is a match if the character
9354                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9355                  * but will on bit 1.  On the second iteration 'to_complement'
9356                  * will be 1, so the exclusive or will reverse things, so we
9357                  * are testing for \W.  On the third iteration, 'to_complement'
9358                  * will be 0, and we would be testing for \s; the fourth
9359                  * iteration would test for \S, etc.
9360                  *
9361                  * Note that this code assumes that all the classes are closed
9362                  * under folding.  For example, if a character matches \w, then
9363                  * its fold does too; and vice versa.  This should be true for
9364                  * any well-behaved locale for all the currently defined Posix
9365                  * classes, except for :lower: and :upper:, which are handled
9366                  * by the pseudo-class :cased: which matches if either of the
9367                  * other two does.  To get rid of this assumption, an outer
9368                  * loop could be used below to iterate over both the source
9369                  * character, and its fold (if different) */
9370
9371                 int count = 0;
9372                 int to_complement = 0;
9373
9374                 while (count < ANYOF_MAX) {
9375                     if (ANYOF_POSIXL_TEST(n, count)
9376                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9377                     {
9378                         match = TRUE;
9379                         break;
9380                     }
9381                     count++;
9382                     to_complement ^= 1;
9383                 }
9384             }
9385         }
9386     }
9387
9388
9389     /* If the bitmap didn't (or couldn't) match, and something outside the
9390      * bitmap could match, try that. */
9391     if (!match) {
9392         if (c >= NUM_ANYOF_CODE_POINTS
9393             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9394         {
9395             match = TRUE;       /* Everything above the bitmap matches */
9396         }
9397             /* Here doesn't match everything above the bitmap.  If there is
9398              * some information available beyond the bitmap, we may find a
9399              * match in it.  If so, this is most likely because the code point
9400              * is outside the bitmap range.  But rarely, it could be because of
9401              * some other reason.  If so, various flags are set to indicate
9402              * this possibility.  On ANYOFD nodes, there may be matches that
9403              * happen only when the target string is UTF-8; or for other node
9404              * types, because runtime lookup is needed, regardless of the
9405              * UTF-8ness of the target string.  Finally, under /il, there may
9406              * be some matches only possible if the locale is a UTF-8 one. */
9407         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9408                  && (   c >= NUM_ANYOF_CODE_POINTS
9409                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9410                          && (   UNLIKELY(OP(n) != ANYOFD)
9411                              || (utf8_target && ! isASCII_uni(c)
9412 #                               if NUM_ANYOF_CODE_POINTS > 256
9413                                                                  && c < 256
9414 #                               endif
9415                                 )))
9416                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9417                          && IN_UTF8_CTYPE_LOCALE)))
9418         {
9419             SV* only_utf8_locale = NULL;
9420             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9421                                                        &only_utf8_locale, NULL);
9422             if (sw) {
9423                 U8 utf8_buffer[2];
9424                 U8 * utf8_p;
9425                 if (utf8_target) {
9426                     utf8_p = (U8 *) p;
9427                 } else { /* Convert to utf8 */
9428                     utf8_p = utf8_buffer;
9429                     append_utf8_from_native_byte(*p, &utf8_p);
9430                     utf8_p = utf8_buffer;
9431                 }
9432
9433                 if (swash_fetch(sw, utf8_p, TRUE)) {
9434                     match = TRUE;
9435                 }
9436             }
9437             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9438                 match = _invlist_contains_cp(only_utf8_locale, c);
9439             }
9440         }
9441
9442         if (UNICODE_IS_SUPER(c)
9443             && (flags
9444                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9445             && OP(n) != ANYOFD
9446             && ckWARN_d(WARN_NON_UNICODE))
9447         {
9448             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9449                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9450         }
9451     }
9452
9453 #if ANYOF_INVERT != 1
9454     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9455      * use it, don't */
9456 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9457 #endif
9458
9459     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9460     return (flags & ANYOF_INVERT) ^ match;
9461 }
9462
9463 STATIC U8 *
9464 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9465 {
9466     /* return the position 'off' UTF-8 characters away from 's', forward if
9467      * 'off' >= 0, backwards if negative.  But don't go outside of position
9468      * 'lim', which better be < s  if off < 0 */
9469
9470     PERL_ARGS_ASSERT_REGHOP3;
9471
9472     if (off >= 0) {
9473         while (off-- && s < lim) {
9474             /* XXX could check well-formedness here */
9475             U8 *new_s = s + UTF8SKIP(s);
9476             if (new_s > lim) /* lim may be in the middle of a long character */
9477                 return s;
9478             s = new_s;
9479         }
9480     }
9481     else {
9482         while (off++ && s > lim) {
9483             s--;
9484             if (UTF8_IS_CONTINUED(*s)) {
9485                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9486                     s--;
9487                 if (! UTF8_IS_START(*s)) {
9488                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9489                 }
9490             }
9491             /* XXX could check well-formedness here */
9492         }
9493     }
9494     return s;
9495 }
9496
9497 STATIC U8 *
9498 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9499 {
9500     PERL_ARGS_ASSERT_REGHOP4;
9501
9502     if (off >= 0) {
9503         while (off-- && s < rlim) {
9504             /* XXX could check well-formedness here */
9505             s += UTF8SKIP(s);
9506         }
9507     }
9508     else {
9509         while (off++ && s > llim) {
9510             s--;
9511             if (UTF8_IS_CONTINUED(*s)) {
9512                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9513                     s--;
9514                 if (! UTF8_IS_START(*s)) {
9515                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9516                 }
9517             }
9518             /* XXX could check well-formedness here */
9519         }
9520     }
9521     return s;
9522 }
9523
9524 /* like reghop3, but returns NULL on overrun, rather than returning last
9525  * char pos */
9526
9527 STATIC U8 *
9528 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9529 {
9530     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9531
9532     if (off >= 0) {
9533         while (off-- && s < lim) {
9534             /* XXX could check well-formedness here */
9535             s += UTF8SKIP(s);
9536         }
9537         if (off >= 0)
9538             return NULL;
9539     }
9540     else {
9541         while (off++ && s > lim) {
9542             s--;
9543             if (UTF8_IS_CONTINUED(*s)) {
9544                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9545                     s--;
9546                 if (! UTF8_IS_START(*s)) {
9547                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9548                 }
9549             }
9550             /* XXX could check well-formedness here */
9551         }
9552         if (off <= 0)
9553             return NULL;
9554     }
9555     return s;
9556 }
9557
9558
9559 /* when executing a regex that may have (?{}), extra stuff needs setting
9560    up that will be visible to the called code, even before the current
9561    match has finished. In particular:
9562
9563    * $_ is localised to the SV currently being matched;
9564    * pos($_) is created if necessary, ready to be updated on each call-out
9565      to code;
9566    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9567      isn't set until the current pattern is successfully finished), so that
9568      $1 etc of the match-so-far can be seen;
9569    * save the old values of subbeg etc of the current regex, and  set then
9570      to the current string (again, this is normally only done at the end
9571      of execution)
9572 */
9573
9574 static void
9575 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9576 {
9577     MAGIC *mg;
9578     regexp *const rex = ReANY(reginfo->prog);
9579     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9580
9581     eval_state->rex = rex;
9582
9583     if (reginfo->sv) {
9584         /* Make $_ available to executed code. */
9585         if (reginfo->sv != DEFSV) {
9586             SAVE_DEFSV;
9587             DEFSV_set(reginfo->sv);
9588         }
9589
9590         if (!(mg = mg_find_mglob(reginfo->sv))) {
9591             /* prepare for quick setting of pos */
9592             mg = sv_magicext_mglob(reginfo->sv);
9593             mg->mg_len = -1;
9594         }
9595         eval_state->pos_magic = mg;
9596         eval_state->pos       = mg->mg_len;
9597         eval_state->pos_flags = mg->mg_flags;
9598     }
9599     else
9600         eval_state->pos_magic = NULL;
9601
9602     if (!PL_reg_curpm) {
9603         /* PL_reg_curpm is a fake PMOP that we can attach the current
9604          * regex to and point PL_curpm at, so that $1 et al are visible
9605          * within a /(?{})/. It's just allocated once per interpreter the
9606          * first time its needed */
9607         Newxz(PL_reg_curpm, 1, PMOP);
9608 #ifdef USE_ITHREADS
9609         {
9610             SV* const repointer = &PL_sv_undef;
9611             /* this regexp is also owned by the new PL_reg_curpm, which
9612                will try to free it.  */
9613             av_push(PL_regex_padav, repointer);
9614             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9615             PL_regex_pad = AvARRAY(PL_regex_padav);
9616         }
9617 #endif
9618     }
9619     SET_reg_curpm(reginfo->prog);
9620     eval_state->curpm = PL_curpm;
9621     PL_curpm_under = PL_curpm;
9622     PL_curpm = PL_reg_curpm;
9623     if (RXp_MATCH_COPIED(rex)) {
9624         /*  Here is a serious problem: we cannot rewrite subbeg,
9625             since it may be needed if this match fails.  Thus
9626             $` inside (?{}) could fail... */
9627         eval_state->subbeg     = rex->subbeg;
9628         eval_state->sublen     = rex->sublen;
9629         eval_state->suboffset  = rex->suboffset;
9630         eval_state->subcoffset = rex->subcoffset;
9631 #ifdef PERL_ANY_COW
9632         eval_state->saved_copy = rex->saved_copy;
9633 #endif
9634         RXp_MATCH_COPIED_off(rex);
9635     }
9636     else
9637         eval_state->subbeg = NULL;
9638     rex->subbeg = (char *)reginfo->strbeg;
9639     rex->suboffset = 0;
9640     rex->subcoffset = 0;
9641     rex->sublen = reginfo->strend - reginfo->strbeg;
9642 }
9643
9644
9645 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
9646
9647 static void
9648 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
9649 {
9650     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9651     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
9652     regmatch_slab *s;
9653
9654     Safefree(aux->poscache);
9655
9656     if (eval_state) {
9657
9658         /* undo the effects of S_setup_eval_state() */
9659
9660         if (eval_state->subbeg) {
9661             regexp * const rex = eval_state->rex;
9662             rex->subbeg     = eval_state->subbeg;
9663             rex->sublen     = eval_state->sublen;
9664             rex->suboffset  = eval_state->suboffset;
9665             rex->subcoffset = eval_state->subcoffset;
9666 #ifdef PERL_ANY_COW
9667             rex->saved_copy = eval_state->saved_copy;
9668 #endif
9669             RXp_MATCH_COPIED_on(rex);
9670         }
9671         if (eval_state->pos_magic)
9672         {
9673             eval_state->pos_magic->mg_len = eval_state->pos;
9674             eval_state->pos_magic->mg_flags =
9675                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9676                | (eval_state->pos_flags & MGf_BYTES);
9677         }
9678
9679         PL_curpm = eval_state->curpm;
9680     }
9681
9682     PL_regmatch_state = aux->old_regmatch_state;
9683     PL_regmatch_slab  = aux->old_regmatch_slab;
9684
9685     /* free all slabs above current one - this must be the last action
9686      * of this function, as aux and eval_state are allocated within
9687      * slabs and may be freed here */
9688
9689     s = PL_regmatch_slab->next;
9690     if (s) {
9691         PL_regmatch_slab->next = NULL;
9692         while (s) {
9693             regmatch_slab * const osl = s;
9694             s = s->next;
9695             Safefree(osl);
9696         }
9697     }
9698 }
9699
9700
9701 STATIC void
9702 S_to_utf8_substr(pTHX_ regexp *prog)
9703 {
9704     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9705      * on the converted value */
9706
9707     int i = 1;
9708
9709     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9710
9711     do {
9712         if (prog->substrs->data[i].substr
9713             && !prog->substrs->data[i].utf8_substr) {
9714             SV* const sv = newSVsv(prog->substrs->data[i].substr);
9715             prog->substrs->data[i].utf8_substr = sv;
9716             sv_utf8_upgrade(sv);
9717             if (SvVALID(prog->substrs->data[i].substr)) {
9718                 if (SvTAIL(prog->substrs->data[i].substr)) {
9719                     /* Trim the trailing \n that fbm_compile added last
9720                        time.  */
9721                     SvCUR_set(sv, SvCUR(sv) - 1);
9722                     /* Whilst this makes the SV technically "invalid" (as its
9723                        buffer is no longer followed by "\0") when fbm_compile()
9724                        adds the "\n" back, a "\0" is restored.  */
9725                     fbm_compile(sv, FBMcf_TAIL);
9726                 } else
9727                     fbm_compile(sv, 0);
9728             }
9729             if (prog->substrs->data[i].substr == prog->check_substr)
9730                 prog->check_utf8 = sv;
9731         }
9732     } while (i--);
9733 }
9734
9735 STATIC bool
9736 S_to_byte_substr(pTHX_ regexp *prog)
9737 {
9738     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9739      * on the converted value; returns FALSE if can't be converted. */
9740
9741     int i = 1;
9742
9743     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9744
9745     do {
9746         if (prog->substrs->data[i].utf8_substr
9747             && !prog->substrs->data[i].substr) {
9748             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9749             if (! sv_utf8_downgrade(sv, TRUE)) {
9750                 return FALSE;
9751             }
9752             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9753                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9754                     /* Trim the trailing \n that fbm_compile added last
9755                         time.  */
9756                     SvCUR_set(sv, SvCUR(sv) - 1);
9757                     fbm_compile(sv, FBMcf_TAIL);
9758                 } else
9759                     fbm_compile(sv, 0);
9760             }
9761             prog->substrs->data[i].substr = sv;
9762             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9763                 prog->check_substr = sv;
9764         }
9765     } while (i--);
9766
9767     return TRUE;
9768 }
9769
9770 #ifndef PERL_IN_XSUB_RE
9771
9772 bool
9773 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
9774 {
9775     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
9776      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
9777      * the larger string bounded by 'strbeg' and 'strend'.
9778      *
9779      * 'cp' needs to be assigned (if not a future version of the Unicode
9780      * Standard could make it something that combines with adjacent characters,
9781      * so code using it would then break), and there has to be a GCB break
9782      * before and after the character. */
9783
9784     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
9785     const U8 * prev_cp_start;
9786
9787     PERL_ARGS_ASSERT__IS_GRAPHEME;
9788
9789     /* Unassigned code points are forbidden */
9790     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
9791                                     _invlist_search(PL_Assigned_invlist, cp))))
9792     {
9793         return FALSE;
9794     }
9795
9796     cp_gcb_val = getGCB_VAL_CP(cp);
9797
9798     /* Find the GCB value of the previous code point in the input */
9799     prev_cp_start = utf8_hop_back(s, -1, strbeg);
9800     if (UNLIKELY(prev_cp_start == s)) {
9801         prev_cp_gcb_val = GCB_EDGE;
9802     }
9803     else {
9804         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
9805     }
9806
9807     /* And check that is a grapheme boundary */
9808     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
9809                 TRUE /* is UTF-8 encoded */ ))
9810     {
9811         return FALSE;
9812     }
9813
9814     /* Similarly verify there is a break between the current character and the
9815      * following one */
9816     s += UTF8SKIP(s);
9817     if (s >= strend) {
9818         next_cp_gcb_val = GCB_EDGE;
9819     }
9820     else {
9821         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
9822     }
9823
9824     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
9825 }
9826
9827 #endif
9828
9829
9830
9831 /*
9832  * ex: set ts=8 sts=4 sw=4 et:
9833  */