regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid only if 'c', the character being looke-up, is an invariant under
 111  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 112  * everything matchable is straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 114                                 ? reginclass(prog,p,c,c+1,u)                \
 115                                 : ANYOF_BITMAP_TEST(p,*(c)))
 116
 117 /*
 118  * Forwards.
 119  */
 120
 121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 122
 123 #define HOPc(pos,off) \
 124         (char *)(reginfo->is_utf8_target \
 125             ? reghop3((U8*)pos, off, \
 126                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 127             : (U8*)(pos + off))
 128
 129 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 130 #define HOPBACK3(pos, off, lim) \
 131         (reginfo->is_utf8_target                          \
 132             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 133             : (pos - off >= lim)                                 \
 134                 ? (U8*)pos - off                                 \
 135                 : NULL)
 136
 137 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 138
 139 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 140 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 141
 142 /* lim must be +ve. Returns NULL on overshoot */
 143 #define HOPMAYBE3(pos,off,lim) \
 144         (reginfo->is_utf8_target                        \
 145             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 146             : ((U8*)pos + off <= lim)                   \
 147                 ? (U8*)pos + off                        \
 148                 : NULL)
 149
 150 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 151  * off must be >=0; args should be vars rather than expressions */
 152 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 153     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 154     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 155 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 156
 157 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 158     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 159     : (U8*)(pos + off))
 160 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 161
 162 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 163 #define NEXTCHR_IS_EOS (nextchr < 0)
 164
 165 #define SET_nextchr \
 166     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 167
 168 #define SET_locinput(p) \
 169     locinput = (p);  \
 170     SET_nextchr
 171
 172
 173 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 174         if (!swash_ptr) {                                                     \
 175             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 176             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 177                                          1, 0, invlist, &flags);              \
 178             assert(swash_ptr);                                                \
 179         }                                                                     \
 180     } STMT_END
 181
 182 /* If in debug mode, we test that a known character properly matches */
 183 #ifdef DEBUGGING
 184 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 185                                           property_name,                      \
 186                                           invlist,                            \
 187                                           utf8_char_in_property)              \
 188         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 189         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 190 #else
 191 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 192                                           property_name,                      \
 193                                           invlist,                            \
 194                                           utf8_char_in_property)              \
 195         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 196 #endif
 197
 198 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 199                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 200                                         "",                                   \
 201                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 202                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 203
 204 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 205 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 206
 207 /* for use after a quantifier and before an EXACT-like node -- japhy */
 208 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 209  *
 210  * NOTE that *nothing* that affects backtracking should be in here, specifically
 211  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 212  * node that is in between two EXACT like nodes when ascertaining what the required
 213  * "follow" character is. This should probably be moved to regex compile time
 214  * although it may be done at run time beause of the REF possibility - more
 215  * investigation required. -- demerphq
 216 */
 217 #define JUMPABLE(rn) (                                                             \
 218     OP(rn) == OPEN ||                                                              \
 219     (OP(rn) == CLOSE &&                                                            \
 220      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 221     OP(rn) == EVAL ||                                                              \
 222     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 223     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 224     OP(rn) == KEEPS ||                                                             \
 225     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 226 )
 227 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 228
 229 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 230
 231 #if 0
 232 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 233    we don't need this definition.  XXX These are now out-of-sync*/
 234 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 235 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 236 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 237
 238 #else
 239 /* ... so we use this as its faster. */
 240 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 241 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 242 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 243 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 244
 245 #endif
 246
 247 /*
 248   Search for mandatory following text node; for lookahead, the text must
 249   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 250 */
 251 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 252     while (JUMPABLE(rn)) { \
 253         const OPCODE type = OP(rn); \
 254         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 255             rn = NEXTOPER(NEXTOPER(rn)); \
 256         else if (type == PLUS) \
 257             rn = NEXTOPER(rn); \
 258         else if (type == IFMATCH) \
 259             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 260         else rn += NEXT_OFF(rn); \
 261     } \
 262 } STMT_END
 263
 264 #define SLAB_FIRST(s) (&(s)->states[0])
 265 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 266
 267 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 268 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 269 static regmatch_state * S_push_slab(pTHX);
 270
 271 #define REGCP_PAREN_ELEMS 3
 272 #define REGCP_OTHER_ELEMS 3
 273 #define REGCP_FRAME_ELEMS 1
 274 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 275  * are needed for the regexp context stack bookkeeping. */
 276
 277 STATIC CHECKPOINT
 278 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 279 {
 280     const int retval = PL_savestack_ix;
 281     const int paren_elems_to_push =
 282                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 283     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 284     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 285     I32 p;
 286     GET_RE_DEBUG_FLAGS_DECL;
 287
 288     PERL_ARGS_ASSERT_REGCPPUSH;
 289
 290     if (paren_elems_to_push < 0)
 291         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 292                    (int)paren_elems_to_push, (int)maxopenparen,
 293                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 294
 295     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 296         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 297                    " out of range (%lu-%ld)",
 298                    total_elems,
 299                    (unsigned long)maxopenparen,
 300                    (long)parenfloor);
 301
 302     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 303
 304     DEBUG_BUFFERS_r(
 305         if ((int)maxopenparen > (int)parenfloor)
 306             Perl_re_exec_indentf( aTHX_
 307                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 308                 depth,
 309                 PTR2UV(rex),
 310                 PTR2UV(rex->offs)
 311             );
 312     );
 313     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 314 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 315         SSPUSHIV(rex->offs[p].end);
 316         SSPUSHIV(rex->offs[p].start);
 317         SSPUSHINT(rex->offs[p].start_tmp);
 318         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 319             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 320             depth,
 321             (UV)p,
 322             (IV)rex->offs[p].start,
 323             (IV)rex->offs[p].start_tmp,
 324             (IV)rex->offs[p].end
 325         ));
 326     }
 327 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 328     SSPUSHINT(maxopenparen);
 329     SSPUSHINT(rex->lastparen);
 330     SSPUSHINT(rex->lastcloseparen);
 331     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 332
 333     return retval;
 334 }
 335
 336 /* These are needed since we do not localize EVAL nodes: */
 337 #define REGCP_SET(cp)                                           \
 338     DEBUG_STATE_r(                                              \
 339         Perl_re_exec_indentf( aTHX_                             \
 340             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 341             depth, (IV)PL_savestack_ix                          \
 342         )                                                       \
 343     );                                                          \
 344     cp = PL_savestack_ix
 345
 346 #define REGCP_UNWIND(cp)                                        \
 347     DEBUG_STATE_r(                                              \
 348         if (cp != PL_savestack_ix)                              \
 349             Perl_re_exec_indentf( aTHX_                         \
 350                 "Clearing an EVAL scope, savestack=%"           \
 351                 IVdf "..%" IVdf "\n",                           \
 352                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 353             )                                                   \
 354     );                                                          \
 355     regcpblow(cp)
 356
 357 #define UNWIND_PAREN(lp, lcp)               \
 358     for (n = rex->lastparen; n > lp; n--)   \
 359         rex->offs[n].end = -1;              \
 360     rex->lastparen = n;                     \
 361     rex->lastcloseparen = lcp;
 362
 363
 364 STATIC void
 365 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 366 {
 367     UV i;
 368     U32 paren;
 369     GET_RE_DEBUG_FLAGS_DECL;
 370
 371     PERL_ARGS_ASSERT_REGCPPOP;
 372
 373     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 374     i = SSPOPUV;
 375     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 376     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 377     rex->lastcloseparen = SSPOPINT;
 378     rex->lastparen = SSPOPINT;
 379     *maxopenparen_p = SSPOPINT;
 380
 381     i -= REGCP_OTHER_ELEMS;
 382     /* Now restore the parentheses context. */
 383     DEBUG_BUFFERS_r(
 384         if (i || rex->lastparen + 1 <= rex->nparens)
 385             Perl_re_exec_indentf( aTHX_
 386                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 387                 depth,
 388                 PTR2UV(rex),
 389                 PTR2UV(rex->offs)
 390             );
 391     );
 392     paren = *maxopenparen_p;
 393     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 394         SSize_t tmps;
 395         rex->offs[paren].start_tmp = SSPOPINT;
 396         rex->offs[paren].start = SSPOPIV;
 397         tmps = SSPOPIV;
 398         if (paren <= rex->lastparen)
 399             rex->offs[paren].end = tmps;
 400         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 401             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 402             depth,
 403             (UV)paren,
 404             (IV)rex->offs[paren].start,
 405             (IV)rex->offs[paren].start_tmp,
 406             (IV)rex->offs[paren].end,
 407             (paren > rex->lastparen ? "(skipped)" : ""));
 408         );
 409         paren--;
 410     }
 411 #if 1
 412     /* It would seem that the similar code in regtry()
 413      * already takes care of this, and in fact it is in
 414      * a better location to since this code can #if 0-ed out
 415      * but the code in regtry() is needed or otherwise tests
 416      * requiring null fields (pat.t#187 and split.t#{13,14}
 417      * (as of patchlevel 7877)  will fail.  Then again,
 418      * this code seems to be necessary or otherwise
 419      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 420      * --jhi updated by dapm */
 421     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 422         if (i > *maxopenparen_p)
 423             rex->offs[i].start = -1;
 424         rex->offs[i].end = -1;
 425         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 426             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 427             depth,
 428             (UV)i,
 429             (i > *maxopenparen_p) ? "-1" : "  "
 430         ));
 431     }
 432 #endif
 433 }
 434
 435 /* restore the parens and associated vars at savestack position ix,
 436  * but without popping the stack */
 437
 438 STATIC void
 439 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 440 {
 441     I32 tmpix = PL_savestack_ix;
 442     PERL_ARGS_ASSERT_REGCP_RESTORE;
 443
 444     PL_savestack_ix = ix;
 445     regcppop(rex, maxopenparen_p);
 446     PL_savestack_ix = tmpix;
 447 }
 448
 449 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 450
 451 #ifndef PERL_IN_XSUB_RE
 452
 453 bool
 454 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 455 {
 456     /* Returns a boolean as to whether or not 'character' is a member of the
 457      * Posix character class given by 'classnum' that should be equivalent to a
 458      * value in the typedef '_char_class_number'.
 459      *
 460      * Ideally this could be replaced by a just an array of function pointers
 461      * to the C library functions that implement the macros this calls.
 462      * However, to compile, the precise function signatures are required, and
 463      * these may vary from platform to to platform.  To avoid having to figure
 464      * out what those all are on each platform, I (khw) am using this method,
 465      * which adds an extra layer of function call overhead (unless the C
 466      * optimizer strips it away).  But we don't particularly care about
 467      * performance with locales anyway. */
 468
 469     switch ((_char_class_number) classnum) {
 470         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 471         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 472         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 473         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 474         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 475                                         || isUPPER_LC(character);
 476         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 477         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 478         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 479         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 480         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 481         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 482         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 483         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 484         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 485         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 486         default:    /* VERTSPACE should never occur in locales */
 487             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 488     }
 489
 490     NOT_REACHED; /* NOTREACHED */
 491     return FALSE;
 492 }
 493
 494 #endif
 495
 496 STATIC bool
 497 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 498 {
 499     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 500      * 'character' is a member of the Posix character class given by 'classnum'
 501      * that should be equivalent to a value in the typedef
 502      * '_char_class_number'.
 503      *
 504      * This just calls isFOO_lc on the code point for the character if it is in
 505      * the range 0-255.  Outside that range, all characters use Unicode
 506      * rules, ignoring any locale.  So use the Unicode function if this class
 507      * requires a swash, and use the Unicode macro otherwise. */
 508
 509     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 510
 511     if (UTF8_IS_INVARIANT(*character)) {
 512         return isFOO_lc(classnum, *character);
 513     }
 514     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 515         return isFOO_lc(classnum,
 516                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 517     }
 518
 519     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 520
 521     if (classnum < _FIRST_NON_SWASH_CC) {
 522
 523         /* Initialize the swash unless done already */
 524         if (! PL_utf8_swash_ptrs[classnum]) {
 525             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 526             PL_utf8_swash_ptrs[classnum] =
 527                     _core_swash_init("utf8",
 528                                      "",
 529                                      &PL_sv_undef, 1, 0,
 530                                      PL_XPosix_ptrs[classnum], &flags);
 531         }
 532
 533         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 534                                  character,
 535                                  TRUE /* is UTF */ ));
 536     }
 537
 538     switch ((_char_class_number) classnum) {
 539         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 540         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 541         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 542         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 543         default:                 break;
 544     }
 545
 546     return FALSE; /* Things like CNTRL are always below 256 */
 547 }
 548
 549 /*
 550  * pregexec and friends
 551  */
 552
 553 #ifndef PERL_IN_XSUB_RE
 554 /*
 555  - pregexec - match a regexp against a string
 556  */
 557 I32
 558 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 559          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 560 /* stringarg: the point in the string at which to begin matching */
 561 /* strend:    pointer to null at end of string */
 562 /* strbeg:    real beginning of string */
 563 /* minend:    end of match must be >= minend bytes after stringarg. */
 564 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 565  *            itself is accessed via the pointers above */
 566 /* nosave:    For optimizations. */
 567 {
 568     PERL_ARGS_ASSERT_PREGEXEC;
 569
 570     return
 571         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 572                       nosave ? 0 : REXEC_COPY_STR);
 573 }
 574 #endif
 575
 576
 577
 578 /* re_intuit_start():
 579  *
 580  * Based on some optimiser hints, try to find the earliest position in the
 581  * string where the regex could match.
 582  *
 583  *   rx:     the regex to match against
 584  *   sv:     the SV being matched: only used for utf8 flag; the string
 585  *           itself is accessed via the pointers below. Note that on
 586  *           something like an overloaded SV, SvPOK(sv) may be false
 587  *           and the string pointers may point to something unrelated to
 588  *           the SV itself.
 589  *   strbeg: real beginning of string
 590  *   strpos: the point in the string at which to begin matching
 591  *   strend: pointer to the byte following the last char of the string
 592  *   flags   currently unused; set to 0
 593  *   data:   currently unused; set to NULL
 594  *
 595  * The basic idea of re_intuit_start() is to use some known information
 596  * about the pattern, namely:
 597  *
 598  *   a) the longest known anchored substring (i.e. one that's at a
 599  *      constant offset from the beginning of the pattern; but not
 600  *      necessarily at a fixed offset from the beginning of the
 601  *      string);
 602  *   b) the longest floating substring (i.e. one that's not at a constant
 603  *      offset from the beginning of the pattern);
 604  *   c) Whether the pattern is anchored to the string; either
 605  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 606  *      or anchored to pos(): /\G/;
 607  *   d) A start class: a real or synthetic character class which
 608  *      represents which characters are legal at the start of the pattern;
 609  *
 610  * to either quickly reject the match, or to find the earliest position
 611  * within the string at which the pattern might match, thus avoiding
 612  * running the full NFA engine at those earlier locations, only to
 613  * eventually fail and retry further along.
 614  *
 615  * Returns NULL if the pattern can't match, or returns the address within
 616  * the string which is the earliest place the match could occur.
 617  *
 618  * The longest of the anchored and floating substrings is called 'check'
 619  * and is checked first. The other is called 'other' and is checked
 620  * second. The 'other' substring may not be present.  For example,
 621  *
 622  *    /(abc|xyz)ABC\d{0,3}DEFG/
 623  *
 624  * will have
 625  *
 626  *   check substr (float)    = "DEFG", offset 6..9 chars
 627  *   other substr (anchored) = "ABC",  offset 3..3 chars
 628  *   stclass = [ax]
 629  *
 630  * Be aware that during the course of this function, sometimes 'anchored'
 631  * refers to a substring being anchored relative to the start of the
 632  * pattern, and sometimes to the pattern itself being anchored relative to
 633  * the string. For example:
 634  *
 635  *   /\dabc/:   "abc" is anchored to the pattern;
 636  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 637  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 638  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 639  *                    but the pattern is anchored to the string.
 640  */
 641
 642 char *
 643 Perl_re_intuit_start(pTHX_
 644                     REGEXP * const rx,
 645                     SV *sv,
 646                     const char * const strbeg,
 647                     char *strpos,
 648                     char *strend,
 649                     const U32 flags,
 650                     re_scream_pos_data *data)
 651 {
 652     struct regexp *const prog = ReANY(rx);
 653     SSize_t start_shift = prog->check_offset_min;
 654     /* Should be nonnegative! */
 655     SSize_t end_shift   = 0;
 656     /* current lowest pos in string where the regex can start matching */
 657     char *rx_origin = strpos;
 658     SV *check;
 659     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 660     U8   other_ix = 1 - prog->substrs->check_ix;
 661     bool ml_anch = 0;
 662     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 663     char *check_at = NULL;              /* check substr found at this pos */
 664     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 665     RXi_GET_DECL(prog,progi);
 666     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 667     regmatch_info *const reginfo = &reginfo_buf;
 668     GET_RE_DEBUG_FLAGS_DECL;
 669
 670     PERL_ARGS_ASSERT_RE_INTUIT_START;
 671     PERL_UNUSED_ARG(flags);
 672     PERL_UNUSED_ARG(data);
 673
 674     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 675                 "Intuit: trying to determine minimum start position...\n"));
 676
 677     /* for now, assume that all substr offsets are positive. If at some point
 678      * in the future someone wants to do clever things with lookbehind and
 679      * -ve offsets, they'll need to fix up any code in this function
 680      * which uses these offsets. See the thread beginning
 681      * <20140113145929.GF27210@iabyn.com>
 682      */
 683     assert(prog->substrs->data[0].min_offset >= 0);
 684     assert(prog->substrs->data[0].max_offset >= 0);
 685     assert(prog->substrs->data[1].min_offset >= 0);
 686     assert(prog->substrs->data[1].max_offset >= 0);
 687     assert(prog->substrs->data[2].min_offset >= 0);
 688     assert(prog->substrs->data[2].max_offset >= 0);
 689
 690     /* for now, assume that if both present, that the floating substring
 691      * doesn't start before the anchored substring.
 692      * If you break this assumption (e.g. doing better optimisations
 693      * with lookahead/behind), then you'll need to audit the code in this
 694      * function carefully first
 695      */
 696     assert(
 697             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 698               && (prog->float_utf8    || prog->float_substr))
 699            || (prog->float_min_offset >= prog->anchored_offset));
 700
 701     /* byte rather than char calculation for efficiency. It fails
 702      * to quickly reject some cases that can't match, but will reject
 703      * them later after doing full char arithmetic */
 704     if (prog->minlen > strend - strpos) {
 705         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 706                               "  String too short...\n"));
 707         goto fail;
 708     }
 709
 710     RXp_MATCH_UTF8_set(prog, utf8_target);
 711     reginfo->is_utf8_target = cBOOL(utf8_target);
 712     reginfo->info_aux = NULL;
 713     reginfo->strbeg = strbeg;
 714     reginfo->strend = strend;
 715     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 716     reginfo->intuit = 1;
 717     /* not actually used within intuit, but zero for safety anyway */
 718     reginfo->poscache_maxiter = 0;
 719
 720     if (utf8_target) {
 721         if ((!prog->anchored_utf8 && prog->anchored_substr)
 722                 || (!prog->float_utf8 && prog->float_substr))
 723             to_utf8_substr(prog);
 724         check = prog->check_utf8;
 725     } else {
 726         if (!prog->check_substr && prog->check_utf8) {
 727             if (! to_byte_substr(prog)) {
 728                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 729             }
 730         }
 731         check = prog->check_substr;
 732     }
 733
 734     /* dump the various substring data */
 735     DEBUG_OPTIMISE_MORE_r({
 736         int i;
 737         for (i=0; i<=2; i++) {
 738             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 739                                   : prog->substrs->data[i].substr);
 740             if (!sv)
 741                 continue;
 742
 743             Perl_re_printf( aTHX_
 744                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 745                 " useful=%" IVdf " utf8=%d [%s]\n",
 746                 i,
 747                 (IV)prog->substrs->data[i].min_offset,
 748                 (IV)prog->substrs->data[i].max_offset,
 749                 (IV)prog->substrs->data[i].end_shift,
 750                 BmUSEFUL(sv),
 751                 utf8_target ? 1 : 0,
 752                 SvPEEK(sv));
 753         }
 754     });
 755
 756     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 757
 758         /* ml_anch: check after \n?
 759          *
 760          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 761          * with /.*.../, these flags will have been added by the
 762          * compiler:
 763          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 764          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 765          */
 766         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 767                    && !(prog->intflags & PREGf_IMPLICIT);
 768
 769         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 770             /* we are only allowed to match at BOS or \G */
 771
 772             /* trivially reject if there's a BOS anchor and we're not at BOS.
 773              *
 774              * Note that we don't try to do a similar quick reject for
 775              * \G, since generally the caller will have calculated strpos
 776              * based on pos() and gofs, so the string is already correctly
 777              * anchored by definition; and handling the exceptions would
 778              * be too fiddly (e.g. REXEC_IGNOREPOS).
 779              */
 780             if (   strpos != strbeg
 781                 && (prog->intflags & PREGf_ANCH_SBOL))
 782             {
 783                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 784                                 "  Not at start...\n"));
 785                 goto fail;
 786             }
 787
 788             /* in the presence of an anchor, the anchored (relative to the
 789              * start of the regex) substr must also be anchored relative
 790              * to strpos. So quickly reject if substr isn't found there.
 791              * This works for \G too, because the caller will already have
 792              * subtracted gofs from pos, and gofs is the offset from the
 793              * \G to the start of the regex. For example, in /.abc\Gdef/,
 794              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 795              * caller will have set strpos=pos()-4; we look for the substr
 796              * at position pos()-4+1, which lines up with the "a" */
 797
 798             if (prog->check_offset_min == prog->check_offset_max) {
 799                 /* Substring at constant offset from beg-of-str... */
 800                 SSize_t slen = SvCUR(check);
 801                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 802
 803                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 804                     "  Looking for check substr at fixed offset %" IVdf "...\n",
 805                     (IV)prog->check_offset_min));
 806
 807                 if (SvTAIL(check)) {
 808                     /* In this case, the regex is anchored at the end too.
 809                      * Unless it's a multiline match, the lengths must match
 810                      * exactly, give or take a \n.  NB: slen >= 1 since
 811                      * the last char of check is \n */
 812                     if (!multiline
 813                         && (   strend - s > slen
 814                             || strend - s < slen - 1
 815                             || (strend - s == slen && strend[-1] != '\n')))
 816                     {
 817                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 818                                             "  String too long...\n"));
 819                         goto fail_finish;
 820                     }
 821                     /* Now should match s[0..slen-2] */
 822                     slen--;
 823                 }
 824                 if (slen && (strend - s < slen
 825                     || *SvPVX_const(check) != *s
 826                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
 827                 {
 828                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 829                                     "  String not equal...\n"));
 830                     goto fail_finish;
 831                 }
 832
 833                 check_at = s;
 834                 goto success_at_start;
 835             }
 836         }
 837     }
 838
 839     end_shift = prog->check_end_shift;
 840
 841 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 842     if (end_shift < 0)
 843         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
 844                    (IV)end_shift, RX_PRECOMP(rx));
 845 #endif
 846
 847   restart:
 848
 849     /* This is the (re)entry point of the main loop in this function.
 850      * The goal of this loop is to:
 851      * 1) find the "check" substring in the region rx_origin..strend
 852      *    (adjusted by start_shift / end_shift). If not found, reject
 853      *    immediately.
 854      * 2) If it exists, look for the "other" substr too if defined; for
 855      *    example, if the check substr maps to the anchored substr, then
 856      *    check the floating substr, and vice-versa. If not found, go
 857      *    back to (1) with rx_origin suitably incremented.
 858      * 3) If we find an rx_origin position that doesn't contradict
 859      *    either of the substrings, then check the possible additional
 860      *    constraints on rx_origin of /^.../m or a known start class.
 861      *    If these fail, then depending on which constraints fail, jump
 862      *    back to here, or to various other re-entry points further along
 863      *    that skip some of the first steps.
 864      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 865      *    substring. If the start position was determined to be at the
 866      *    beginning of the string  - so, not rejected, but not optimised,
 867      *    since we have to run regmatch from position 0 - decrement the
 868      *    BmUSEFUL() count. Otherwise increment it.
 869      */
 870
 871
 872     /* first, look for the 'check' substring */
 873
 874     {
 875         U8* start_point;
 876         U8* end_point;
 877
 878         DEBUG_OPTIMISE_MORE_r({
 879             Perl_re_printf( aTHX_
 880                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
 881                 " Start shift: %" IVdf " End shift %" IVdf
 882                 " Real end Shift: %" IVdf "\n",
 883                 (IV)(rx_origin - strbeg),
 884                 (IV)prog->check_offset_min,
 885                 (IV)start_shift,
 886                 (IV)end_shift,
 887                 (IV)prog->check_end_shift);
 888         });
 889
 890         end_point = HOPBACK3(strend, end_shift, rx_origin);
 891         if (!end_point)
 892             goto fail_finish;
 893         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 894         if (!start_point)
 895             goto fail_finish;
 896
 897
 898         /* If the regex is absolutely anchored to either the start of the
 899          * string (SBOL) or to pos() (ANCH_GPOS), then
 900          * check_offset_max represents an upper bound on the string where
 901          * the substr could start. For the ANCH_GPOS case, we assume that
 902          * the caller of intuit will have already set strpos to
 903          * pos()-gofs, so in this case strpos + offset_max will still be
 904          * an upper bound on the substr.
 905          */
 906         if (!ml_anch
 907             && prog->intflags & PREGf_ANCH
 908             && prog->check_offset_max != SSize_t_MAX)
 909         {
 910             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 911             const char * const anchor =
 912                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 913
 914             /* do a bytes rather than chars comparison. It's conservative;
 915              * so it skips doing the HOP if the result can't possibly end
 916              * up earlier than the old value of end_point.
 917              */
 918             if ((char*)end_point - anchor > prog->check_offset_max) {
 919                 end_point = HOP3lim((U8*)anchor,
 920                                 prog->check_offset_max,
 921                                 end_point -len)
 922                             + len;
 923             }
 924         }
 925
 926         check_at = fbm_instr( start_point, end_point,
 927                       check, multiline ? FBMrf_MULTILINE : 0);
 928
 929         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 930             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
 931             (IV)((char*)start_point - strbeg),
 932             (IV)((char*)end_point   - strbeg),
 933             (IV)(check_at ? check_at - strbeg : -1)
 934         ));
 935
 936         /* Update the count-of-usability, remove useless subpatterns,
 937             unshift s.  */
 938
 939         DEBUG_EXECUTE_r({
 940             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 941                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 942             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
 943                               (check_at ? "Found" : "Did not find"),
 944                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 945                     ? "anchored" : "floating"),
 946                 quoted,
 947                 RE_SV_TAIL(check),
 948                 (check_at ? " at offset " : "...\n") );
 949         });
 950
 951         if (!check_at)
 952             goto fail_finish;
 953         /* set rx_origin to the minimum position where the regex could start
 954          * matching, given the constraint of the just-matched check substring.
 955          * But don't set it lower than previously.
 956          */
 957
 958         if (check_at - rx_origin > prog->check_offset_max)
 959             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 960         /* Finish the diagnostic message */
 961         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 962             "%ld (rx_origin now %" IVdf ")...\n",
 963             (long)(check_at - strbeg),
 964             (IV)(rx_origin - strbeg)
 965         ));
 966     }
 967
 968
 969     /* now look for the 'other' substring if defined */
 970
 971     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 972                     : prog->substrs->data[other_ix].substr)
 973     {
 974         /* Take into account the "other" substring. */
 975         char *last, *last1;
 976         char *s;
 977         SV* must;
 978         struct reg_substr_datum *other;
 979
 980       do_other_substr:
 981         other = &prog->substrs->data[other_ix];
 982
 983         /* if "other" is anchored:
 984          * we've previously found a floating substr starting at check_at.
 985          * This means that the regex origin must lie somewhere
 986          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 987          * and max:                 HOP3(check_at, -check_offset_min)
 988          * (except that min will be >= strpos)
 989          * So the fixed  substr must lie somewhere between
 990          *  HOP3(min, anchored_offset)
 991          *  HOP3(max, anchored_offset) + SvCUR(substr)
 992          */
 993
 994         /* if "other" is floating
 995          * Calculate last1, the absolute latest point where the
 996          * floating substr could start in the string, ignoring any
 997          * constraints from the earlier fixed match. It is calculated
 998          * as follows:
 999          *
1000          * strend - prog->minlen (in chars) is the absolute latest
1001          * position within the string where the origin of the regex
1002          * could appear. The latest start point for the floating
1003          * substr is float_min_offset(*) on from the start of the
1004          * regex.  last1 simply combines thee two offsets.
1005          *
1006          * (*) You might think the latest start point should be
1007          * float_max_offset from the regex origin, and technically
1008          * you'd be correct. However, consider
1009          *    /a\d{2,4}bcd\w/
1010          * Here, float min, max are 3,5 and minlen is 7.
1011          * This can match either
1012          *    /a\d\dbcd\w/
1013          *    /a\d\d\dbcd\w/
1014          *    /a\d\d\d\dbcd\w/
1015          * In the first case, the regex matches minlen chars; in the
1016          * second, minlen+1, in the third, minlen+2.
1017          * In the first case, the floating offset is 3 (which equals
1018          * float_min), in the second, 4, and in the third, 5 (which
1019          * equals float_max). In all cases, the floating string bcd
1020          * can never start more than 4 chars from the end of the
1021          * string, which equals minlen - float_min. As the substring
1022          * starts to match more than float_min from the start of the
1023          * regex, it makes the regex match more than minlen chars,
1024          * and the two cancel each other out. So we can always use
1025          * float_min - minlen, rather than float_max - minlen for the
1026          * latest position in the string.
1027          *
1028          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1029          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1030          */
1031
1032         assert(prog->minlen >= other->min_offset);
1033         last1 = HOP3c(strend,
1034                         other->min_offset - prog->minlen, strbeg);
1035
1036         if (other_ix) {/* i.e. if (other-is-float) */
1037             /* last is the latest point where the floating substr could
1038              * start, *given* any constraints from the earlier fixed
1039              * match. This constraint is that the floating string starts
1040              * <= float_max_offset chars from the regex origin (rx_origin).
1041              * If this value is less than last1, use it instead.
1042              */
1043             assert(rx_origin <= last1);
1044             last =
1045                 /* this condition handles the offset==infinity case, and
1046                  * is a short-cut otherwise. Although it's comparing a
1047                  * byte offset to a char length, it does so in a safe way,
1048                  * since 1 char always occupies 1 or more bytes,
1049                  * so if a string range is  (last1 - rx_origin) bytes,
1050                  * it will be less than or equal to  (last1 - rx_origin)
1051                  * chars; meaning it errs towards doing the accurate HOP3
1052                  * rather than just using last1 as a short-cut */
1053                 (last1 - rx_origin) < other->max_offset
1054                     ? last1
1055                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1056         }
1057         else {
1058             assert(strpos + start_shift <= check_at);
1059             last = HOP4c(check_at, other->min_offset - start_shift,
1060                         strbeg, strend);
1061         }
1062
1063         s = HOP3c(rx_origin, other->min_offset, strend);
1064         if (s < other_last)     /* These positions already checked */
1065             s = other_last;
1066
1067         must = utf8_target ? other->utf8_substr : other->substr;
1068         assert(SvPOK(must));
1069         {
1070             char *from = s;
1071             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1072
1073             if (to > strend)
1074                 to = strend;
1075             if (from > to) {
1076                 s = NULL;
1077                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1078                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1079                     (IV)(from - strbeg),
1080                     (IV)(to   - strbeg)
1081                 ));
1082             }
1083             else {
1084                 s = fbm_instr(
1085                     (unsigned char*)from,
1086                     (unsigned char*)to,
1087                     must,
1088                     multiline ? FBMrf_MULTILINE : 0
1089                 );
1090                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1091                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1092                     (IV)(from - strbeg),
1093                     (IV)(to   - strbeg),
1094                     (IV)(s ? s - strbeg : -1)
1095                 ));
1096             }
1097         }
1098
1099         DEBUG_EXECUTE_r({
1100             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1101                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1102             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1103                 s ? "Found" : "Contradicts",
1104                 other_ix ? "floating" : "anchored",
1105                 quoted, RE_SV_TAIL(must));
1106         });
1107
1108
1109         if (!s) {
1110             /* last1 is latest possible substr location. If we didn't
1111              * find it before there, we never will */
1112             if (last >= last1) {
1113                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1114                                         "; giving up...\n"));
1115                 goto fail_finish;
1116             }
1117
1118             /* try to find the check substr again at a later
1119              * position. Maybe next time we'll find the "other" substr
1120              * in range too */
1121             other_last = HOP3c(last, 1, strend) /* highest failure */;
1122             rx_origin =
1123                 other_ix /* i.e. if other-is-float */
1124                     ? HOP3c(rx_origin, 1, strend)
1125                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1126             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1127                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1128                 (other_ix ? "floating" : "anchored"),
1129                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1130                 (IV)(rx_origin - strbeg)
1131             ));
1132             goto restart;
1133         }
1134         else {
1135             if (other_ix) { /* if (other-is-float) */
1136                 /* other_last is set to s, not s+1, since its possible for
1137                  * a floating substr to fail first time, then succeed
1138                  * second time at the same floating position; e.g.:
1139                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1140                  * The first time round, anchored and float match at
1141                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1142                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1143                  */
1144                 other_last = s;
1145             }
1146             else {
1147                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1148                 other_last = HOP3c(s, 1, strend);
1149             }
1150             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1151                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1152                   (long)(s - strbeg),
1153                 (IV)(rx_origin - strbeg)
1154               ));
1155
1156         }
1157     }
1158     else {
1159         DEBUG_OPTIMISE_MORE_r(
1160             Perl_re_printf( aTHX_
1161                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1162                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1163                 " strend:%" IVdf "\n",
1164                 (IV)prog->check_offset_min,
1165                 (IV)prog->check_offset_max,
1166                 (IV)(check_at-strbeg),
1167                 (IV)(rx_origin-strbeg),
1168                 (IV)(rx_origin-check_at),
1169                 (IV)(strend-strbeg)
1170             )
1171         );
1172     }
1173
1174   postprocess_substr_matches:
1175
1176     /* handle the extra constraint of /^.../m if present */
1177
1178     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1179         char *s;
1180
1181         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1182                         "  looking for /^/m anchor"));
1183
1184         /* we have failed the constraint of a \n before rx_origin.
1185          * Find the next \n, if any, even if it's beyond the current
1186          * anchored and/or floating substrings. Whether we should be
1187          * scanning ahead for the next \n or the next substr is debatable.
1188          * On the one hand you'd expect rare substrings to appear less
1189          * often than \n's. On the other hand, searching for \n means
1190          * we're effectively flipping between check_substr and "\n" on each
1191          * iteration as the current "rarest" string candidate, which
1192          * means for example that we'll quickly reject the whole string if
1193          * hasn't got a \n, rather than trying every substr position
1194          * first
1195          */
1196
1197         s = HOP3c(strend, - prog->minlen, strpos);
1198         if (s <= rx_origin ||
1199             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1200         {
1201             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1202                             "  Did not find /%s^%s/m...\n",
1203                             PL_colors[0], PL_colors[1]));
1204             goto fail_finish;
1205         }
1206
1207         /* earliest possible origin is 1 char after the \n.
1208          * (since *rx_origin == '\n', it's safe to ++ here rather than
1209          * HOP(rx_origin, 1)) */
1210         rx_origin++;
1211
1212         if (prog->substrs->check_ix == 0  /* check is anchored */
1213             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1214         {
1215             /* Position contradicts check-string; either because
1216              * check was anchored (and thus has no wiggle room),
1217              * or check was float and rx_origin is above the float range */
1218             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1219                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1220                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1221             goto restart;
1222         }
1223
1224         /* if we get here, the check substr must have been float,
1225          * is in range, and we may or may not have had an anchored
1226          * "other" substr which still contradicts */
1227         assert(prog->substrs->check_ix); /* check is float */
1228
1229         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1230             /* whoops, the anchored "other" substr exists, so we still
1231              * contradict. On the other hand, the float "check" substr
1232              * didn't contradict, so just retry the anchored "other"
1233              * substr */
1234             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1235                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1236                 PL_colors[0], PL_colors[1],
1237                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1238                 (IV)(rx_origin - strbeg)
1239             ));
1240             goto do_other_substr;
1241         }
1242
1243         /* success: we don't contradict the found floating substring
1244          * (and there's no anchored substr). */
1245         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1246             "  Found /%s^%s/m with rx_origin %ld...\n",
1247             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1248     }
1249     else {
1250         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1251             "  (multiline anchor test skipped)\n"));
1252     }
1253
1254   success_at_start:
1255
1256
1257     /* if we have a starting character class, then test that extra constraint.
1258      * (trie stclasses are too expensive to use here, we are better off to
1259      * leave it to regmatch itself) */
1260
1261     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1262         const U8* const str = (U8*)STRING(progi->regstclass);
1263
1264         /* XXX this value could be pre-computed */
1265         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1266                     ?  (reginfo->is_utf8_pat
1267                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1268                         : STR_LEN(progi->regstclass))
1269                     : 1);
1270         char * endpos;
1271         char *s;
1272         /* latest pos that a matching float substr constrains rx start to */
1273         char *rx_max_float = NULL;
1274
1275         /* if the current rx_origin is anchored, either by satisfying an
1276          * anchored substring constraint, or a /^.../m constraint, then we
1277          * can reject the current origin if the start class isn't found
1278          * at the current position. If we have a float-only match, then
1279          * rx_origin is constrained to a range; so look for the start class
1280          * in that range. if neither, then look for the start class in the
1281          * whole rest of the string */
1282
1283         /* XXX DAPM it's not clear what the minlen test is for, and why
1284          * it's not used in the floating case. Nothing in the test suite
1285          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1286          * Here are some old comments, which may or may not be correct:
1287          *
1288          *   minlen == 0 is possible if regstclass is \b or \B,
1289          *   and the fixed substr is ''$.
1290          *   Since minlen is already taken into account, rx_origin+1 is
1291          *   before strend; accidentally, minlen >= 1 guaranties no false
1292          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1293          *   0) below assumes that regstclass does not come from lookahead...
1294          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1295          *   This leaves EXACTF-ish only, which are dealt with in
1296          *   find_byclass().
1297          */
1298
1299         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1300             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1301         else if (prog->float_substr || prog->float_utf8) {
1302             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1303             endpos = HOP3clim(rx_max_float, cl_l, strend);
1304         }
1305         else
1306             endpos= strend;
1307
1308         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1309             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1310             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1311               (IV)start_shift, (IV)(check_at - strbeg),
1312               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1313
1314         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1315                             reginfo);
1316         if (!s) {
1317             if (endpos == strend) {
1318                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1319                                 "  Could not match STCLASS...\n") );
1320                 goto fail;
1321             }
1322             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1323                                "  This position contradicts STCLASS...\n") );
1324             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1325                         && !(prog->intflags & PREGf_IMPLICIT))
1326                 goto fail;
1327
1328             /* Contradict one of substrings */
1329             if (prog->anchored_substr || prog->anchored_utf8) {
1330                 if (prog->substrs->check_ix == 1) { /* check is float */
1331                     /* Have both, check_string is floating */
1332                     assert(rx_origin + start_shift <= check_at);
1333                     if (rx_origin + start_shift != check_at) {
1334                         /* not at latest position float substr could match:
1335                          * Recheck anchored substring, but not floating.
1336                          * The condition above is in bytes rather than
1337                          * chars for efficiency. It's conservative, in
1338                          * that it errs on the side of doing 'goto
1339                          * do_other_substr'. In this case, at worst,
1340                          * an extra anchored search may get done, but in
1341                          * practice the extra fbm_instr() is likely to
1342                          * get skipped anyway. */
1343                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1344                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1345                             (long)(other_last - strbeg),
1346                             (IV)(rx_origin - strbeg)
1347                         ));
1348                         goto do_other_substr;
1349                     }
1350                 }
1351             }
1352             else {
1353                 /* float-only */
1354
1355                 if (ml_anch) {
1356                     /* In the presence of ml_anch, we might be able to
1357                      * find another \n without breaking the current float
1358                      * constraint. */
1359
1360                     /* strictly speaking this should be HOP3c(..., 1, ...),
1361                      * but since we goto a block of code that's going to
1362                      * search for the next \n if any, its safe here */
1363                     rx_origin++;
1364                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1365                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1366                               PL_colors[0], PL_colors[1],
1367                               (long)(rx_origin - strbeg)) );
1368                     goto postprocess_substr_matches;
1369                 }
1370
1371                 /* strictly speaking this can never be true; but might
1372                  * be if we ever allow intuit without substrings */
1373                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1374                     goto fail;
1375
1376                 rx_origin = rx_max_float;
1377             }
1378
1379             /* at this point, any matching substrings have been
1380              * contradicted. Start again... */
1381
1382             rx_origin = HOP3c(rx_origin, 1, strend);
1383
1384             /* uses bytes rather than char calculations for efficiency.
1385              * It's conservative: it errs on the side of doing 'goto restart',
1386              * where there is code that does a proper char-based test */
1387             if (rx_origin + start_shift + end_shift > strend) {
1388                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1389                                        "  Could not match STCLASS...\n") );
1390                 goto fail;
1391             }
1392             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1393                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1394                 (prog->substrs->check_ix ? "floating" : "anchored"),
1395                 (long)(rx_origin + start_shift - strbeg),
1396                 (IV)(rx_origin - strbeg)
1397             ));
1398             goto restart;
1399         }
1400
1401         /* Success !!! */
1402
1403         if (rx_origin != s) {
1404             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1405                         "  By STCLASS: moving %ld --> %ld\n",
1406                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1407                    );
1408         }
1409         else {
1410             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1411                                   "  Does not contradict STCLASS...\n");
1412                    );
1413         }
1414     }
1415
1416     /* Decide whether using the substrings helped */
1417
1418     if (rx_origin != strpos) {
1419         /* Fixed substring is found far enough so that the match
1420            cannot start at strpos. */
1421
1422         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1423         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1424     }
1425     else {
1426         /* The found rx_origin position does not prohibit matching at
1427          * strpos, so calling intuit didn't gain us anything. Decrement
1428          * the BmUSEFUL() count on the check substring, and if we reach
1429          * zero, free it.  */
1430         if (!(prog->intflags & PREGf_NAUGHTY)
1431             && (utf8_target ? (
1432                 prog->check_utf8                /* Could be deleted already */
1433                 && --BmUSEFUL(prog->check_utf8) < 0
1434                 && (prog->check_utf8 == prog->float_utf8)
1435             ) : (
1436                 prog->check_substr              /* Could be deleted already */
1437                 && --BmUSEFUL(prog->check_substr) < 0
1438                 && (prog->check_substr == prog->float_substr)
1439             )))
1440         {
1441             /* If flags & SOMETHING - do not do it many times on the same match */
1442             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1443             /* XXX Does the destruction order has to change with utf8_target? */
1444             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1445             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1446             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1447             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1448             check = NULL;                       /* abort */
1449             /* XXXX This is a remnant of the old implementation.  It
1450                     looks wasteful, since now INTUIT can use many
1451                     other heuristics. */
1452             prog->extflags &= ~RXf_USE_INTUIT;
1453         }
1454     }
1455
1456     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1457             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1458              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1459
1460     return rx_origin;
1461
1462   fail_finish:                          /* Substring not found */
1463     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1464         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1465   fail:
1466     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1467                           PL_colors[4], PL_colors[5]));
1468     return NULL;
1469 }
1470
1471
1472 #define DECL_TRIE_TYPE(scan) \
1473     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1474                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1475                  trie_utf8l, trie_flu8 }                                            \
1476                     trie_type = ((scan->flags == EXACT)                             \
1477                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1478                                  : (scan->flags == EXACTL)                          \
1479                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1480                                     : (scan->flags == EXACTFA)                      \
1481                                       ? (utf8_target                                \
1482                                          ? trie_utf8_exactfa_fold                   \
1483                                          : trie_latin_utf8_exactfa_fold)            \
1484                                       : (scan->flags == EXACTFLU8                   \
1485                                          ? trie_flu8                                \
1486                                          : (utf8_target                             \
1487                                            ? trie_utf8_fold                         \
1488                                            :   trie_latin_utf8_fold)))
1489
1490 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1491 STMT_START {                                                                        \
1492     STRLEN skiplen;                                                                 \
1493     U8 flags = FOLD_FLAGS_FULL;                                                     \
1494     switch (trie_type) {                                                            \
1495     case trie_flu8:                                                                 \
1496         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1497         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1498             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1499         }                                                                           \
1500         goto do_trie_utf8_fold;                                                     \
1501     case trie_utf8_exactfa_fold:                                                    \
1502         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1503         /* FALLTHROUGH */                                                           \
1504     case trie_utf8_fold:                                                            \
1505       do_trie_utf8_fold:                                                            \
1506         if ( foldlen>0 ) {                                                          \
1507             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1508             foldlen -= len;                                                         \
1509             uscan += len;                                                           \
1510             len=0;                                                                  \
1511         } else {                                                                    \
1512             len = UTF8SKIP(uc);                                                     \
1513             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen,  \
1514                                                                             flags); \
1515             skiplen = UVCHR_SKIP( uvc );                                            \
1516             foldlen -= skiplen;                                                     \
1517             uscan = foldbuf + skiplen;                                              \
1518         }                                                                           \
1519         break;                                                                      \
1520     case trie_latin_utf8_exactfa_fold:                                              \
1521         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1522         /* FALLTHROUGH */                                                           \
1523     case trie_latin_utf8_fold:                                                      \
1524         if ( foldlen>0 ) {                                                          \
1525             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1526             foldlen -= len;                                                         \
1527             uscan += len;                                                           \
1528             len=0;                                                                  \
1529         } else {                                                                    \
1530             len = 1;                                                                \
1531             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1532             skiplen = UVCHR_SKIP( uvc );                                            \
1533             foldlen -= skiplen;                                                     \
1534             uscan = foldbuf + skiplen;                                              \
1535         }                                                                           \
1536         break;                                                                      \
1537     case trie_utf8l:                                                                \
1538         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1539         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1540             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1541         }                                                                           \
1542         /* FALLTHROUGH */                                                           \
1543     case trie_utf8:                                                                 \
1544         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1545         break;                                                                      \
1546     case trie_plain:                                                                \
1547         uvc = (UV)*uc;                                                              \
1548         len = 1;                                                                    \
1549     }                                                                               \
1550     if (uvc < 256) {                                                                \
1551         charid = trie->charmap[ uvc ];                                              \
1552     }                                                                               \
1553     else {                                                                          \
1554         charid = 0;                                                                 \
1555         if (widecharmap) {                                                          \
1556             SV** const svpp = hv_fetch(widecharmap,                                 \
1557                         (char*)&uvc, sizeof(UV), 0);                                \
1558             if (svpp)                                                               \
1559                 charid = (U16)SvIV(*svpp);                                          \
1560         }                                                                           \
1561     }                                                                               \
1562 } STMT_END
1563
1564 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1565     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1566                 startpos, doutf8, depth)
1567
1568 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1569 STMT_START {                                              \
1570     while (s <= e) {                                      \
1571         if ( (COND)                                       \
1572              && (ln == 1 || folder(s, pat_string, ln))    \
1573              && (reginfo->intuit || regtry(reginfo, &s)) )\
1574             goto got_it;                                  \
1575         s++;                                              \
1576     }                                                     \
1577 } STMT_END
1578
1579 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1580 STMT_START {                                          \
1581     while (s < strend) {                              \
1582         CODE                                          \
1583         s += UTF8SKIP(s);                             \
1584     }                                                 \
1585 } STMT_END
1586
1587 #define REXEC_FBC_SCAN(CODE)                          \
1588 STMT_START {                                          \
1589     while (s < strend) {                              \
1590         CODE                                          \
1591         s++;                                          \
1592     }                                                 \
1593 } STMT_END
1594
1595 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1596 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1597     if (COND) {                                                \
1598         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1599             goto got_it;                                       \
1600         else                                                   \
1601             tmp = doevery;                                     \
1602     }                                                          \
1603     else                                                       \
1604         tmp = 1;                                               \
1605 )
1606
1607 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1608 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1609     if (COND) {                                                \
1610         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1611             goto got_it;                                       \
1612         else                                                   \
1613             tmp = doevery;                                     \
1614     }                                                          \
1615     else                                                       \
1616         tmp = 1;                                               \
1617 )
1618
1619 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1620     if (utf8_target) {                                         \
1621         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1622     }                                                          \
1623     else {                                                     \
1624         REXEC_FBC_CLASS_SCAN(COND);                            \
1625     }
1626
1627 /* The three macros below are slightly different versions of the same logic.
1628  *
1629  * The first is for /a and /aa when the target string is UTF-8.  This can only
1630  * match ascii, but it must advance based on UTF-8.   The other two handle the
1631  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1632  * for the boundary (or non-boundary) between a word and non-word character.
1633  * The utf8 and non-utf8 cases have the same logic, but the details must be
1634  * different.  Find the "wordness" of the character just prior to this one, and
1635  * compare it with the wordness of this one.  If they differ, we have a
1636  * boundary.  At the beginning of the string, pretend that the previous
1637  * character was a new-line.
1638  *
1639  * All these macros uncleanly have side-effects with each other and outside
1640  * variables.  So far it's been too much trouble to clean-up
1641  *
1642  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1643  *               a word character or not.
1644  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1645  *               word/non-word
1646  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1647  *
1648  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1649  * are looking for a boundary or for a non-boundary.  If we are looking for a
1650  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1651  * see if this tentative match actually works, and if so, to quit the loop
1652  * here.  And vice-versa if we are looking for a non-boundary.
1653  *
1654  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1655  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1656  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1657  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1658  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1659  * complement.  But in that branch we complement tmp, meaning that at the
1660  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1661  * which means at the top of the loop in the next iteration, it is
1662  * TEST_NON_UTF8(s-1) */
1663 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1664     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1665     tmp = TEST_NON_UTF8(tmp);                                                  \
1666     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1667         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1668             tmp = !tmp;                                                        \
1669             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1670         }                                                                      \
1671         else {                                                                 \
1672             IF_FAIL;                                                           \
1673         }                                                                      \
1674     );                                                                         \
1675
1676 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1677  * TEST_UTF8 is a macro that for the same input code points returns identically
1678  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1679 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1680     if (s == reginfo->strbeg) {                                                \
1681         tmp = '\n';                                                            \
1682     }                                                                          \
1683     else { /* Back-up to the start of the previous character */                \
1684         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1685         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1686                                                        0, UTF8_ALLOW_DEFAULT); \
1687     }                                                                          \
1688     tmp = TEST_UV(tmp);                                                        \
1689     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1690     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1691         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1692             tmp = !tmp;                                                        \
1693             IF_SUCCESS;                                                        \
1694         }                                                                      \
1695         else {                                                                 \
1696             IF_FAIL;                                                           \
1697         }                                                                      \
1698     );
1699
1700 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1701  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1702  * macros below */
1703 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1704     if (utf8_target) {                                                         \
1705         UTF8_CODE                                                              \
1706     }                                                                          \
1707     else {  /* Not utf8 */                                                     \
1708         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1709         tmp = TEST_NON_UTF8(tmp);                                              \
1710         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1711             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1712                 IF_SUCCESS;                                                    \
1713                 tmp = !tmp;                                                    \
1714             }                                                                  \
1715             else {                                                             \
1716                 IF_FAIL;                                                       \
1717             }                                                                  \
1718         );                                                                     \
1719     }                                                                          \
1720     /* Here, things have been set up by the previous code so that tmp is the   \
1721      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1722      * utf8ness of the target).  We also have to check if this matches against \
1723      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1724      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1725      * string */                                                               \
1726     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1727         IF_SUCCESS;                                                            \
1728     }                                                                          \
1729     else {                                                                     \
1730         IF_FAIL;                                                               \
1731     }
1732
1733 /* This is the macro to use when we want to see if something that looks like it
1734  * could match, actually does, and if so exits the loop */
1735 #define REXEC_FBC_TRYIT                            \
1736     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1737         goto got_it
1738
1739 /* The only difference between the BOUND and NBOUND cases is that
1740  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1741  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1742  * with the other one being empty (PLACEHOLDER is defined as empty).
1743  *
1744  * The TEST_FOO parameters are for operating on different forms of input, but
1745  * all should be ones that return identically for the same underlying code
1746  * points */
1747 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1748     FBC_BOUND_COMMON(                                                          \
1749           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1750           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1751
1752 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1753     FBC_BOUND_COMMON(                                                          \
1754             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1755             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1756
1757 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1758     FBC_BOUND_COMMON(                                                          \
1759           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1760           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1761
1762 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1763     FBC_BOUND_COMMON(                                                          \
1764             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1765             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1766
1767 #ifdef DEBUGGING
1768 static IV
1769 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1770   IV cp_out = Perl__invlist_search(invlist, cp_in);
1771   assert(cp_out >= 0);
1772   return cp_out;
1773 }
1774 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1775         invmap[S_get_break_val_cp_checked(invlist, cp)]
1776 #else
1777 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1778         invmap[_invlist_search(invlist, cp)]
1779 #endif
1780
1781 /* Takes a pointer to an inversion list, a pointer to its corresponding
1782  * inversion map, and a code point, and returns the code point's value
1783  * according to the two arrays.  It assumes that all code points have a value.
1784  * This is used as the base macro for macros for particular properties */
1785 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1786         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1787
1788 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1789  * of a code point, returning the value for the first code point in the string.
1790  * And it takes the particular macro name that finds the desired value given a
1791  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1792 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1793              (__ASSERT_(pos < strend)                                          \
1794                  /* Note assumes is valid UTF-8 */                             \
1795              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1796
1797 /* Returns the GCB value for the input code point */
1798 #define getGCB_VAL_CP(cp)                                                      \
1799           _generic_GET_BREAK_VAL_CP(                                           \
1800                                     PL_GCB_invlist,                            \
1801                                     _Perl_GCB_invmap,                          \
1802                                     (cp))
1803
1804 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1805  * bounded by pos and strend */
1806 #define getGCB_VAL_UTF8(pos, strend)                                           \
1807     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1808
1809 /* Returns the LB value for the input code point */
1810 #define getLB_VAL_CP(cp)                                                       \
1811           _generic_GET_BREAK_VAL_CP(                                           \
1812                                     PL_LB_invlist,                             \
1813                                     _Perl_LB_invmap,                           \
1814                                     (cp))
1815
1816 /* Returns the LB value for the first code point in the UTF-8 encoded string
1817  * bounded by pos and strend */
1818 #define getLB_VAL_UTF8(pos, strend)                                            \
1819     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1820
1821
1822 /* Returns the SB value for the input code point */
1823 #define getSB_VAL_CP(cp)                                                       \
1824           _generic_GET_BREAK_VAL_CP(                                           \
1825                                     PL_SB_invlist,                             \
1826                                     _Perl_SB_invmap,                     \
1827                                     (cp))
1828
1829 /* Returns the SB value for the first code point in the UTF-8 encoded string
1830  * bounded by pos and strend */
1831 #define getSB_VAL_UTF8(pos, strend)                                            \
1832     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1833
1834 /* Returns the WB value for the input code point */
1835 #define getWB_VAL_CP(cp)                                                       \
1836           _generic_GET_BREAK_VAL_CP(                                           \
1837                                     PL_WB_invlist,                             \
1838                                     _Perl_WB_invmap,                         \
1839                                     (cp))
1840
1841 /* Returns the WB value for the first code point in the UTF-8 encoded string
1842  * bounded by pos and strend */
1843 #define getWB_VAL_UTF8(pos, strend)                                            \
1844     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1845
1846 /* We know what class REx starts with.  Try to find this position... */
1847 /* if reginfo->intuit, its a dryrun */
1848 /* annoyingly all the vars in this routine have different names from their counterparts
1849    in regmatch. /grrr */
1850 STATIC char *
1851 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1852     const char *strend, regmatch_info *reginfo)
1853 {
1854     dVAR;
1855     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1856     char *pat_string;   /* The pattern's exactish string */
1857     char *pat_end;          /* ptr to end char of pat_string */
1858     re_fold_t folder;   /* Function for computing non-utf8 folds */
1859     const U8 *fold_array;   /* array for folding ords < 256 */
1860     STRLEN ln;
1861     STRLEN lnc;
1862     U8 c1;
1863     U8 c2;
1864     char *e;
1865     I32 tmp = 1;        /* Scratch variable? */
1866     const bool utf8_target = reginfo->is_utf8_target;
1867     UV utf8_fold_flags = 0;
1868     const bool is_utf8_pat = reginfo->is_utf8_pat;
1869     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1870                                    with a result inverts that result, as 0^1 =
1871                                    1 and 1^1 = 0 */
1872     _char_class_number classnum;
1873
1874     RXi_GET_DECL(prog,progi);
1875
1876     PERL_ARGS_ASSERT_FIND_BYCLASS;
1877
1878     /* We know what class it must start with. */
1879     switch (OP(c)) {
1880     case ANYOFL:
1881         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1882
1883         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
1884             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1885         }
1886
1887         /* FALLTHROUGH */
1888     case ANYOFD:
1889     case ANYOF:
1890         if (utf8_target) {
1891             REXEC_FBC_UTF8_CLASS_SCAN(
1892                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1893         }
1894         else if (ANYOF_FLAGS(c)) {
1895             REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
1896         }
1897         else {
1898             REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
1899         }
1900         break;
1901
1902     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1903         assert(! is_utf8_pat);
1904         /* FALLTHROUGH */
1905     case EXACTFA:
1906         if (is_utf8_pat || utf8_target) {
1907             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1908             goto do_exactf_utf8;
1909         }
1910         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1911         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1912         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1913
1914     case EXACTF:   /* This node only generated for non-utf8 patterns */
1915         assert(! is_utf8_pat);
1916         if (utf8_target) {
1917             utf8_fold_flags = 0;
1918             goto do_exactf_utf8;
1919         }
1920         fold_array = PL_fold;
1921         folder = foldEQ;
1922         goto do_exactf_non_utf8;
1923
1924     case EXACTFL:
1925         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1926         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1927             utf8_fold_flags = FOLDEQ_LOCALE;
1928             goto do_exactf_utf8;
1929         }
1930         fold_array = PL_fold_locale;
1931         folder = foldEQ_locale;
1932         goto do_exactf_non_utf8;
1933
1934     case EXACTFU_SS:
1935         if (is_utf8_pat) {
1936             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1937         }
1938         goto do_exactf_utf8;
1939
1940     case EXACTFLU8:
1941             if (! utf8_target) {    /* All code points in this node require
1942                                        UTF-8 to express.  */
1943                 break;
1944             }
1945             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1946                                              | FOLDEQ_S2_FOLDS_SANE;
1947             goto do_exactf_utf8;
1948
1949     case EXACTFU:
1950         if (is_utf8_pat || utf8_target) {
1951             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1952             goto do_exactf_utf8;
1953         }
1954
1955         /* Any 'ss' in the pattern should have been replaced by regcomp,
1956          * so we don't have to worry here about this single special case
1957          * in the Latin1 range */
1958         fold_array = PL_fold_latin1;
1959         folder = foldEQ_latin1;
1960
1961         /* FALLTHROUGH */
1962
1963       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1964                            are no glitches with fold-length differences
1965                            between the target string and pattern */
1966
1967         /* The idea in the non-utf8 EXACTF* cases is to first find the
1968          * first character of the EXACTF* node and then, if necessary,
1969          * case-insensitively compare the full text of the node.  c1 is the
1970          * first character.  c2 is its fold.  This logic will not work for
1971          * Unicode semantics and the german sharp ss, which hence should
1972          * not be compiled into a node that gets here. */
1973         pat_string = STRING(c);
1974         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1975
1976         /* We know that we have to match at least 'ln' bytes (which is the
1977          * same as characters, since not utf8).  If we have to match 3
1978          * characters, and there are only 2 availabe, we know without
1979          * trying that it will fail; so don't start a match past the
1980          * required minimum number from the far end */
1981         e = HOP3c(strend, -((SSize_t)ln), s);
1982         if (e < s)
1983             break;
1984
1985         c1 = *pat_string;
1986         c2 = fold_array[c1];
1987         if (c1 == c2) { /* If char and fold are the same */
1988             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1989         }
1990         else {
1991             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1992         }
1993         break;
1994
1995       do_exactf_utf8:
1996       {
1997         unsigned expansion;
1998
1999         /* If one of the operands is in utf8, we can't use the simpler folding
2000          * above, due to the fact that many different characters can have the
2001          * same fold, or portion of a fold, or different- length fold */
2002         pat_string = STRING(c);
2003         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2004         pat_end = pat_string + ln;
2005         lnc = is_utf8_pat       /* length to match in characters */
2006                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2007                 : ln;
2008
2009         /* We have 'lnc' characters to match in the pattern, but because of
2010          * multi-character folding, each character in the target can match
2011          * up to 3 characters (Unicode guarantees it will never exceed
2012          * this) if it is utf8-encoded; and up to 2 if not (based on the
2013          * fact that the Latin 1 folds are already determined, and the
2014          * only multi-char fold in that range is the sharp-s folding to
2015          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2016          * string character.  Adjust lnc accordingly, rounding up, so that
2017          * if we need to match at least 4+1/3 chars, that really is 5. */
2018         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2019         lnc = (lnc + expansion - 1) / expansion;
2020
2021         /* As in the non-UTF8 case, if we have to match 3 characters, and
2022          * only 2 are left, it's guaranteed to fail, so don't start a
2023          * match that would require us to go beyond the end of the string
2024          */
2025         e = HOP3c(strend, -((SSize_t)lnc), s);
2026
2027         /* XXX Note that we could recalculate e to stop the loop earlier,
2028          * as the worst case expansion above will rarely be met, and as we
2029          * go along we would usually find that e moves further to the left.
2030          * This would happen only after we reached the point in the loop
2031          * where if there were no expansion we should fail.  Unclear if
2032          * worth the expense */
2033
2034         while (s <= e) {
2035             char *my_strend= (char *)strend;
2036             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2037                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2038                 && (reginfo->intuit || regtry(reginfo, &s)) )
2039             {
2040                 goto got_it;
2041             }
2042             s += (utf8_target) ? UTF8SKIP(s) : 1;
2043         }
2044         break;
2045     }
2046
2047     case BOUNDL:
2048         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2049         if (FLAGS(c) != TRADITIONAL_BOUND) {
2050             if (! IN_UTF8_CTYPE_LOCALE) {
2051                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2052                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2053             }
2054             goto do_boundu;
2055         }
2056
2057         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2058         break;
2059
2060     case NBOUNDL:
2061         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2062         if (FLAGS(c) != TRADITIONAL_BOUND) {
2063             if (! IN_UTF8_CTYPE_LOCALE) {
2064                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2065                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2066             }
2067             goto do_nboundu;
2068         }
2069
2070         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2071         break;
2072
2073     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2074                    meaning */
2075         assert(FLAGS(c) == TRADITIONAL_BOUND);
2076
2077         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2078         break;
2079
2080     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2081                    meaning */
2082         assert(FLAGS(c) == TRADITIONAL_BOUND);
2083
2084         FBC_BOUND_A(isWORDCHAR_A);
2085         break;
2086
2087     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2088                    meaning */
2089         assert(FLAGS(c) == TRADITIONAL_BOUND);
2090
2091         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2092         break;
2093
2094     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2095                    meaning */
2096         assert(FLAGS(c) == TRADITIONAL_BOUND);
2097
2098         FBC_NBOUND_A(isWORDCHAR_A);
2099         break;
2100
2101     case NBOUNDU:
2102         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2103             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2104             break;
2105         }
2106
2107       do_nboundu:
2108
2109         to_complement = 1;
2110         /* FALLTHROUGH */
2111
2112     case BOUNDU:
2113       do_boundu:
2114         switch((bound_type) FLAGS(c)) {
2115             case TRADITIONAL_BOUND:
2116                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2117                 break;
2118             case GCB_BOUND:
2119                 if (s == reginfo->strbeg) {
2120                     if (reginfo->intuit || regtry(reginfo, &s))
2121                     {
2122                         goto got_it;
2123                     }
2124
2125                     /* Didn't match.  Try at the next position (if there is one) */
2126                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2127                     if (UNLIKELY(s >= reginfo->strend)) {
2128                         break;
2129                     }
2130                 }
2131
2132                 if (utf8_target) {
2133                     GCB_enum before = getGCB_VAL_UTF8(
2134                                                reghop3((U8*)s, -1,
2135                                                        (U8*)(reginfo->strbeg)),
2136                                                (U8*) reginfo->strend);
2137                     while (s < strend) {
2138                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2139                                                         (U8*) reginfo->strend);
2140                         if (   (to_complement ^ isGCB(before,
2141                                                       after,
2142                                                       (U8*) reginfo->strbeg,
2143                                                       (U8*) s,
2144                                                       utf8_target))
2145                             && (reginfo->intuit || regtry(reginfo, &s)))
2146                         {
2147                             goto got_it;
2148                         }
2149                         before = after;
2150                         s += UTF8SKIP(s);
2151                     }
2152                 }
2153                 else {  /* Not utf8.  Everything is a GCB except between CR and
2154                            LF */
2155                     while (s < strend) {
2156                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2157                                               || UCHARAT(s) != '\n'))
2158                             && (reginfo->intuit || regtry(reginfo, &s)))
2159                         {
2160                             goto got_it;
2161                         }
2162                         s++;
2163                     }
2164                 }
2165
2166                 /* And, since this is a bound, it can match after the final
2167                  * character in the string */
2168                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2169                     goto got_it;
2170                 }
2171                 break;
2172
2173             case LB_BOUND:
2174                 if (s == reginfo->strbeg) {
2175                     if (reginfo->intuit || regtry(reginfo, &s)) {
2176                         goto got_it;
2177                     }
2178                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2179                     if (UNLIKELY(s >= reginfo->strend)) {
2180                         break;
2181                     }
2182                 }
2183
2184                 if (utf8_target) {
2185                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2186                                                                -1,
2187                                                                (U8*)(reginfo->strbeg)),
2188                                                        (U8*) reginfo->strend);
2189                     while (s < strend) {
2190                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2191                         if (to_complement ^ isLB(before,
2192                                                  after,
2193                                                  (U8*) reginfo->strbeg,
2194                                                  (U8*) s,
2195                                                  (U8*) reginfo->strend,
2196                                                  utf8_target)
2197                             && (reginfo->intuit || regtry(reginfo, &s)))
2198                         {
2199                             goto got_it;
2200                         }
2201                         before = after;
2202                         s += UTF8SKIP(s);
2203                     }
2204                 }
2205                 else {  /* Not utf8. */
2206                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2207                     while (s < strend) {
2208                         LB_enum after = getLB_VAL_CP((U8) *s);
2209                         if (to_complement ^ isLB(before,
2210                                                  after,
2211                                                  (U8*) reginfo->strbeg,
2212                                                  (U8*) s,
2213                                                  (U8*) reginfo->strend,
2214                                                  utf8_target)
2215                             && (reginfo->intuit || regtry(reginfo, &s)))
2216                         {
2217                             goto got_it;
2218                         }
2219                         before = after;
2220                         s++;
2221                     }
2222                 }
2223
2224                 if (reginfo->intuit || regtry(reginfo, &s)) {
2225                     goto got_it;
2226                 }
2227
2228                 break;
2229
2230             case SB_BOUND:
2231                 if (s == reginfo->strbeg) {
2232                     if (reginfo->intuit || regtry(reginfo, &s)) {
2233                         goto got_it;
2234                     }
2235                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2236                     if (UNLIKELY(s >= reginfo->strend)) {
2237                         break;
2238                     }
2239                 }
2240
2241                 if (utf8_target) {
2242                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2243                                                         -1,
2244                                                         (U8*)(reginfo->strbeg)),
2245                                                       (U8*) reginfo->strend);
2246                     while (s < strend) {
2247                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2248                                                          (U8*) reginfo->strend);
2249                         if ((to_complement ^ isSB(before,
2250                                                   after,
2251                                                   (U8*) reginfo->strbeg,
2252                                                   (U8*) s,
2253                                                   (U8*) reginfo->strend,
2254                                                   utf8_target))
2255                             && (reginfo->intuit || regtry(reginfo, &s)))
2256                         {
2257                             goto got_it;
2258                         }
2259                         before = after;
2260                         s += UTF8SKIP(s);
2261                     }
2262                 }
2263                 else {  /* Not utf8. */
2264                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2265                     while (s < strend) {
2266                         SB_enum after = getSB_VAL_CP((U8) *s);
2267                         if ((to_complement ^ isSB(before,
2268                                                   after,
2269                                                   (U8*) reginfo->strbeg,
2270                                                   (U8*) s,
2271                                                   (U8*) reginfo->strend,
2272                                                   utf8_target))
2273                             && (reginfo->intuit || regtry(reginfo, &s)))
2274                         {
2275                             goto got_it;
2276                         }
2277                         before = after;
2278                         s++;
2279                     }
2280                 }
2281
2282                 /* Here are at the final position in the target string.  The SB
2283                  * value is always true here, so matches, depending on other
2284                  * constraints */
2285                 if (reginfo->intuit || regtry(reginfo, &s)) {
2286                     goto got_it;
2287                 }
2288
2289                 break;
2290
2291             case WB_BOUND:
2292                 if (s == reginfo->strbeg) {
2293                     if (reginfo->intuit || regtry(reginfo, &s)) {
2294                         goto got_it;
2295                     }
2296                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2297                     if (UNLIKELY(s >= reginfo->strend)) {
2298                         break;
2299                     }
2300                 }
2301
2302                 if (utf8_target) {
2303                     /* We are at a boundary between char_sub_0 and char_sub_1.
2304                      * We also keep track of the value for char_sub_-1 as we
2305                      * loop through the line.   Context may be needed to make a
2306                      * determination, and if so, this can save having to
2307                      * recalculate it */
2308                     WB_enum previous = WB_UNKNOWN;
2309                     WB_enum before = getWB_VAL_UTF8(
2310                                               reghop3((U8*)s,
2311                                                       -1,
2312                                                       (U8*)(reginfo->strbeg)),
2313                                               (U8*) reginfo->strend);
2314                     while (s < strend) {
2315                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2316                                                         (U8*) reginfo->strend);
2317                         if ((to_complement ^ isWB(previous,
2318                                                   before,
2319                                                   after,
2320                                                   (U8*) reginfo->strbeg,
2321                                                   (U8*) s,
2322                                                   (U8*) reginfo->strend,
2323                                                   utf8_target))
2324                             && (reginfo->intuit || regtry(reginfo, &s)))
2325                         {
2326                             goto got_it;
2327                         }
2328                         previous = before;
2329                         before = after;
2330                         s += UTF8SKIP(s);
2331                     }
2332                 }
2333                 else {  /* Not utf8. */
2334                     WB_enum previous = WB_UNKNOWN;
2335                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2336                     while (s < strend) {
2337                         WB_enum after = getWB_VAL_CP((U8) *s);
2338                         if ((to_complement ^ isWB(previous,
2339                                                   before,
2340                                                   after,
2341                                                   (U8*) reginfo->strbeg,
2342                                                   (U8*) s,
2343                                                   (U8*) reginfo->strend,
2344                                                   utf8_target))
2345                             && (reginfo->intuit || regtry(reginfo, &s)))
2346                         {
2347                             goto got_it;
2348                         }
2349                         previous = before;
2350                         before = after;
2351                         s++;
2352                     }
2353                 }
2354
2355                 if (reginfo->intuit || regtry(reginfo, &s)) {
2356                     goto got_it;
2357                 }
2358         }
2359         break;
2360
2361     case LNBREAK:
2362         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2363                         is_LNBREAK_latin1_safe(s, strend)
2364         );
2365         break;
2366
2367     /* The argument to all the POSIX node types is the class number to pass to
2368      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2369
2370     case NPOSIXL:
2371         to_complement = 1;
2372         /* FALLTHROUGH */
2373
2374     case POSIXL:
2375         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2376         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2377                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2378         break;
2379
2380     case NPOSIXD:
2381         to_complement = 1;
2382         /* FALLTHROUGH */
2383
2384     case POSIXD:
2385         if (utf8_target) {
2386             goto posix_utf8;
2387         }
2388         goto posixa;
2389
2390     case NPOSIXA:
2391         if (utf8_target) {
2392             /* The complement of something that matches only ASCII matches all
2393              * non-ASCII, plus everything in ASCII that isn't in the class. */
2394             REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
2395                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2396             break;
2397         }
2398
2399         to_complement = 1;
2400         /* FALLTHROUGH */
2401
2402     case POSIXA:
2403       posixa:
2404         /* Don't need to worry about utf8, as it can match only a single
2405          * byte invariant character. */
2406         REXEC_FBC_CLASS_SCAN(
2407                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2408         break;
2409
2410     case NPOSIXU:
2411         to_complement = 1;
2412         /* FALLTHROUGH */
2413
2414     case POSIXU:
2415         if (! utf8_target) {
2416             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2417                                                                     FLAGS(c))));
2418         }
2419         else {
2420
2421           posix_utf8:
2422             classnum = (_char_class_number) FLAGS(c);
2423             if (classnum < _FIRST_NON_SWASH_CC) {
2424                 while (s < strend) {
2425
2426                     /* We avoid loading in the swash as long as possible, but
2427                      * should we have to, we jump to a separate loop.  This
2428                      * extra 'if' statement is what keeps this code from being
2429                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2430                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2431                         goto found_above_latin1;
2432                     }
2433                     if ((UTF8_IS_INVARIANT(*s)
2434                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2435                                                                 classnum)))
2436                         || (   UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
2437                             && to_complement ^ cBOOL(
2438                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2439                                                                       *(s + 1)),
2440                                               classnum))))
2441                     {
2442                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2443                             goto got_it;
2444                         else {
2445                             tmp = doevery;
2446                         }
2447                     }
2448                     else {
2449                         tmp = 1;
2450                     }
2451                     s += UTF8SKIP(s);
2452                 }
2453             }
2454             else switch (classnum) {    /* These classes are implemented as
2455                                            macros */
2456                 case _CC_ENUM_SPACE:
2457                     REXEC_FBC_UTF8_CLASS_SCAN(
2458                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2459                     break;
2460
2461                 case _CC_ENUM_BLANK:
2462                     REXEC_FBC_UTF8_CLASS_SCAN(
2463                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2464                     break;
2465
2466                 case _CC_ENUM_XDIGIT:
2467                     REXEC_FBC_UTF8_CLASS_SCAN(
2468                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2469                     break;
2470
2471                 case _CC_ENUM_VERTSPACE:
2472                     REXEC_FBC_UTF8_CLASS_SCAN(
2473                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2474                     break;
2475
2476                 case _CC_ENUM_CNTRL:
2477                     REXEC_FBC_UTF8_CLASS_SCAN(
2478                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2479                     break;
2480
2481                 default:
2482                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2483                     NOT_REACHED; /* NOTREACHED */
2484             }
2485         }
2486         break;
2487
2488       found_above_latin1:   /* Here we have to load a swash to get the result
2489                                for the current code point */
2490         if (! PL_utf8_swash_ptrs[classnum]) {
2491             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2492             PL_utf8_swash_ptrs[classnum] =
2493                     _core_swash_init("utf8",
2494                                      "",
2495                                      &PL_sv_undef, 1, 0,
2496                                      PL_XPosix_ptrs[classnum], &flags);
2497         }
2498
2499         /* This is a copy of the loop above for swash classes, though using the
2500          * FBC macro instead of being expanded out.  Since we've loaded the
2501          * swash, we don't have to check for that each time through the loop */
2502         REXEC_FBC_UTF8_CLASS_SCAN(
2503                 to_complement ^ cBOOL(_generic_utf8_safe(
2504                                       classnum,
2505                                       s,
2506                                       strend,
2507                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2508                                                   (U8 *) s, TRUE))));
2509         break;
2510
2511     case AHOCORASICKC:
2512     case AHOCORASICK:
2513         {
2514             DECL_TRIE_TYPE(c);
2515             /* what trie are we using right now */
2516             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2517             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2518             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2519
2520             const char *last_start = strend - trie->minlen;
2521 #ifdef DEBUGGING
2522             const char *real_start = s;
2523 #endif
2524             STRLEN maxlen = trie->maxlen;
2525             SV *sv_points;
2526             U8 **points; /* map of where we were in the input string
2527                             when reading a given char. For ASCII this
2528                             is unnecessary overhead as the relationship
2529                             is always 1:1, but for Unicode, especially
2530                             case folded Unicode this is not true. */
2531             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2532             U8 *bitmap=NULL;
2533
2534
2535             GET_RE_DEBUG_FLAGS_DECL;
2536
2537             /* We can't just allocate points here. We need to wrap it in
2538              * an SV so it gets freed properly if there is a croak while
2539              * running the match */
2540             ENTER;
2541             SAVETMPS;
2542             sv_points=newSV(maxlen * sizeof(U8 *));
2543             SvCUR_set(sv_points,
2544                 maxlen * sizeof(U8 *));
2545             SvPOK_on(sv_points);
2546             sv_2mortal(sv_points);
2547             points=(U8**)SvPV_nolen(sv_points );
2548             if ( trie_type != trie_utf8_fold
2549                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2550             {
2551                 if (trie->bitmap)
2552                     bitmap=(U8*)trie->bitmap;
2553                 else
2554                     bitmap=(U8*)ANYOF_BITMAP(c);
2555             }
2556             /* this is the Aho-Corasick algorithm modified a touch
2557                to include special handling for long "unknown char" sequences.
2558                The basic idea being that we use AC as long as we are dealing
2559                with a possible matching char, when we encounter an unknown char
2560                (and we have not encountered an accepting state) we scan forward
2561                until we find a legal starting char.
2562                AC matching is basically that of trie matching, except that when
2563                we encounter a failing transition, we fall back to the current
2564                states "fail state", and try the current char again, a process
2565                we repeat until we reach the root state, state 1, or a legal
2566                transition. If we fail on the root state then we can either
2567                terminate if we have reached an accepting state previously, or
2568                restart the entire process from the beginning if we have not.
2569
2570              */
2571             while (s <= last_start) {
2572                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2573                 U8 *uc = (U8*)s;
2574                 U16 charid = 0;
2575                 U32 base = 1;
2576                 U32 state = 1;
2577                 UV uvc = 0;
2578                 STRLEN len = 0;
2579                 STRLEN foldlen = 0;
2580                 U8 *uscan = (U8*)NULL;
2581                 U8 *leftmost = NULL;
2582 #ifdef DEBUGGING
2583                 U32 accepted_word= 0;
2584 #endif
2585                 U32 pointpos = 0;
2586
2587                 while ( state && uc <= (U8*)strend ) {
2588                     int failed=0;
2589                     U32 word = aho->states[ state ].wordnum;
2590
2591                     if( state==1 ) {
2592                         if ( bitmap ) {
2593                             DEBUG_TRIE_EXECUTE_r(
2594                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2595                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2596                                         (char *)uc, utf8_target, 0 );
2597                                     Perl_re_printf( aTHX_
2598                                         " Scanning for legal start char...\n");
2599                                 }
2600                             );
2601                             if (utf8_target) {
2602                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2603                                     uc += UTF8SKIP(uc);
2604                                 }
2605                             } else {
2606                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2607                                     uc++;
2608                                 }
2609                             }
2610                             s= (char *)uc;
2611                         }
2612                         if (uc >(U8*)last_start) break;
2613                     }
2614
2615                     if ( word ) {
2616                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2617                         if (!leftmost || lpos < leftmost) {
2618                             DEBUG_r(accepted_word=word);
2619                             leftmost= lpos;
2620                         }
2621                         if (base==0) break;
2622
2623                     }
2624                     points[pointpos++ % maxlen]= uc;
2625                     if (foldlen || uc < (U8*)strend) {
2626                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2627                                          widecharmap, uc,
2628                                          uscan, len, uvc, charid, foldlen,
2629                                          foldbuf, uniflags);
2630                         DEBUG_TRIE_EXECUTE_r({
2631                             dump_exec_pos( (char *)uc, c, strend,
2632                                         real_start, s, utf8_target, 0);
2633                             Perl_re_printf( aTHX_
2634                                 " Charid:%3u CP:%4" UVxf " ",
2635                                  charid, uvc);
2636                         });
2637                     }
2638                     else {
2639                         len = 0;
2640                         charid = 0;
2641                     }
2642
2643
2644                     do {
2645 #ifdef DEBUGGING
2646                         word = aho->states[ state ].wordnum;
2647 #endif
2648                         base = aho->states[ state ].trans.base;
2649
2650                         DEBUG_TRIE_EXECUTE_r({
2651                             if (failed)
2652                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2653                                     s,   utf8_target, 0 );
2654                             Perl_re_printf( aTHX_
2655                                 "%sState: %4" UVxf ", word=%" UVxf,
2656                                 failed ? " Fail transition to " : "",
2657                                 (UV)state, (UV)word);
2658                         });
2659                         if ( base ) {
2660                             U32 tmp;
2661                             I32 offset;
2662                             if (charid &&
2663                                  ( ((offset = base + charid
2664                                     - 1 - trie->uniquecharcount)) >= 0)
2665                                  && ((U32)offset < trie->lasttrans)
2666                                  && trie->trans[offset].check == state
2667                                  && (tmp=trie->trans[offset].next))
2668                             {
2669                                 DEBUG_TRIE_EXECUTE_r(
2670                                     Perl_re_printf( aTHX_ " - legal\n"));
2671                                 state = tmp;
2672                                 break;
2673                             }
2674                             else {
2675                                 DEBUG_TRIE_EXECUTE_r(
2676                                     Perl_re_printf( aTHX_ " - fail\n"));
2677                                 failed = 1;
2678                                 state = aho->fail[state];
2679                             }
2680                         }
2681                         else {
2682                             /* we must be accepting here */
2683                             DEBUG_TRIE_EXECUTE_r(
2684                                     Perl_re_printf( aTHX_ " - accepting\n"));
2685                             failed = 1;
2686                             break;
2687                         }
2688                     } while(state);
2689                     uc += len;
2690                     if (failed) {
2691                         if (leftmost)
2692                             break;
2693                         if (!state) state = 1;
2694                     }
2695                 }
2696                 if ( aho->states[ state ].wordnum ) {
2697                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2698                     if (!leftmost || lpos < leftmost) {
2699                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2700                         leftmost = lpos;
2701                     }
2702                 }
2703                 if (leftmost) {
2704                     s = (char*)leftmost;
2705                     DEBUG_TRIE_EXECUTE_r({
2706                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
2707                             (UV)accepted_word, (IV)(s - real_start)
2708                         );
2709                     });
2710                     if (reginfo->intuit || regtry(reginfo, &s)) {
2711                         FREETMPS;
2712                         LEAVE;
2713                         goto got_it;
2714                     }
2715                     s = HOPc(s,1);
2716                     DEBUG_TRIE_EXECUTE_r({
2717                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2718                     });
2719                 } else {
2720                     DEBUG_TRIE_EXECUTE_r(
2721                         Perl_re_printf( aTHX_ "No match.\n"));
2722                     break;
2723                 }
2724             }
2725             FREETMPS;
2726             LEAVE;
2727         }
2728         break;
2729     default:
2730         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2731     }
2732     return 0;
2733   got_it:
2734     return s;
2735 }
2736
2737 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2738  * flags have same meanings as with regexec_flags() */
2739
2740 static void
2741 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2742                             char *strbeg,
2743                             char *strend,
2744                             SV *sv,
2745                             U32 flags,
2746                             bool utf8_target)
2747 {
2748     struct regexp *const prog = ReANY(rx);
2749
2750     if (flags & REXEC_COPY_STR) {
2751 #ifdef PERL_ANY_COW
2752         if (SvCANCOW(sv)) {
2753             DEBUG_C(Perl_re_printf( aTHX_
2754                               "Copy on write: regexp capture, type %d\n",
2755                                     (int) SvTYPE(sv)));
2756             /* Create a new COW SV to share the match string and store
2757              * in saved_copy, unless the current COW SV in saved_copy
2758              * is valid and suitable for our purpose */
2759             if ((   prog->saved_copy
2760                  && SvIsCOW(prog->saved_copy)
2761                  && SvPOKp(prog->saved_copy)
2762                  && SvIsCOW(sv)
2763                  && SvPOKp(sv)
2764                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2765             {
2766                 /* just reuse saved_copy SV */
2767                 if (RXp_MATCH_COPIED(prog)) {
2768                     Safefree(prog->subbeg);
2769                     RXp_MATCH_COPIED_off(prog);
2770                 }
2771             }
2772             else {
2773                 /* create new COW SV to share string */
2774                 RXp_MATCH_COPY_FREE(prog);
2775                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2776             }
2777             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2778             assert (SvPOKp(prog->saved_copy));
2779             prog->sublen  = strend - strbeg;
2780             prog->suboffset = 0;
2781             prog->subcoffset = 0;
2782         } else
2783 #endif
2784         {
2785             SSize_t min = 0;
2786             SSize_t max = strend - strbeg;
2787             SSize_t sublen;
2788
2789             if (    (flags & REXEC_COPY_SKIP_POST)
2790                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2791                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2792             ) { /* don't copy $' part of string */
2793                 U32 n = 0;
2794                 max = -1;
2795                 /* calculate the right-most part of the string covered
2796                  * by a capture. Due to lookahead, this may be to
2797                  * the right of $&, so we have to scan all captures */
2798                 while (n <= prog->lastparen) {
2799                     if (prog->offs[n].end > max)
2800                         max = prog->offs[n].end;
2801                     n++;
2802                 }
2803                 if (max == -1)
2804                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2805                             ? prog->offs[0].start
2806                             : 0;
2807                 assert(max >= 0 && max <= strend - strbeg);
2808             }
2809
2810             if (    (flags & REXEC_COPY_SKIP_PRE)
2811                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2812                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2813             ) { /* don't copy $` part of string */
2814                 U32 n = 0;
2815                 min = max;
2816                 /* calculate the left-most part of the string covered
2817                  * by a capture. Due to lookbehind, this may be to
2818                  * the left of $&, so we have to scan all captures */
2819                 while (min && n <= prog->lastparen) {
2820                     if (   prog->offs[n].start != -1
2821                         && prog->offs[n].start < min)
2822                     {
2823                         min = prog->offs[n].start;
2824                     }
2825                     n++;
2826                 }
2827                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2828                     && min >  prog->offs[0].end
2829                 )
2830                     min = prog->offs[0].end;
2831
2832             }
2833
2834             assert(min >= 0 && min <= max && min <= strend - strbeg);
2835             sublen = max - min;
2836
2837             if (RXp_MATCH_COPIED(prog)) {
2838                 if (sublen > prog->sublen)
2839                     prog->subbeg =
2840                             (char*)saferealloc(prog->subbeg, sublen+1);
2841             }
2842             else
2843                 prog->subbeg = (char*)safemalloc(sublen+1);
2844             Copy(strbeg + min, prog->subbeg, sublen, char);
2845             prog->subbeg[sublen] = '\0';
2846             prog->suboffset = min;
2847             prog->sublen = sublen;
2848             RXp_MATCH_COPIED_on(prog);
2849         }
2850         prog->subcoffset = prog->suboffset;
2851         if (prog->suboffset && utf8_target) {
2852             /* Convert byte offset to chars.
2853              * XXX ideally should only compute this if @-/@+
2854              * has been seen, a la PL_sawampersand ??? */
2855
2856             /* If there's a direct correspondence between the
2857              * string which we're matching and the original SV,
2858              * then we can use the utf8 len cache associated with
2859              * the SV. In particular, it means that under //g,
2860              * sv_pos_b2u() will use the previously cached
2861              * position to speed up working out the new length of
2862              * subcoffset, rather than counting from the start of
2863              * the string each time. This stops
2864              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2865              * from going quadratic */
2866             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2867                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2868                                                 SV_GMAGIC|SV_CONST_RETURN);
2869             else
2870                 prog->subcoffset = utf8_length((U8*)strbeg,
2871                                     (U8*)(strbeg+prog->suboffset));
2872         }
2873     }
2874     else {
2875         RXp_MATCH_COPY_FREE(prog);
2876         prog->subbeg = strbeg;
2877         prog->suboffset = 0;
2878         prog->subcoffset = 0;
2879         prog->sublen = strend - strbeg;
2880     }
2881 }
2882
2883
2884
2885
2886 /*
2887  - regexec_flags - match a regexp against a string
2888  */
2889 I32
2890 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2891               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2892 /* stringarg: the point in the string at which to begin matching */
2893 /* strend:    pointer to null at end of string */
2894 /* strbeg:    real beginning of string */
2895 /* minend:    end of match must be >= minend bytes after stringarg. */
2896 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2897  *            itself is accessed via the pointers above */
2898 /* data:      May be used for some additional optimizations.
2899               Currently unused. */
2900 /* flags:     For optimizations. See REXEC_* in regexp.h */
2901
2902 {
2903     struct regexp *const prog = ReANY(rx);
2904     char *s;
2905     regnode *c;
2906     char *startpos;
2907     SSize_t minlen;             /* must match at least this many chars */
2908     SSize_t dontbother = 0;     /* how many characters not to try at end */
2909     const bool utf8_target = cBOOL(DO_UTF8(sv));
2910     I32 multiline;
2911     RXi_GET_DECL(prog,progi);
2912     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2913     regmatch_info *const reginfo = &reginfo_buf;
2914     regexp_paren_pair *swap = NULL;
2915     I32 oldsave;
2916     GET_RE_DEBUG_FLAGS_DECL;
2917
2918     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2919     PERL_UNUSED_ARG(data);
2920
2921     /* Be paranoid... */
2922     if (prog == NULL) {
2923         Perl_croak(aTHX_ "NULL regexp parameter");
2924     }
2925
2926     DEBUG_EXECUTE_r(
2927         debug_start_match(rx, utf8_target, stringarg, strend,
2928         "Matching");
2929     );
2930
2931     startpos = stringarg;
2932
2933     /* set these early as they may be used by the HOP macros below */
2934     reginfo->strbeg = strbeg;
2935     reginfo->strend = strend;
2936     reginfo->is_utf8_target = cBOOL(utf8_target);
2937
2938     if (prog->intflags & PREGf_GPOS_SEEN) {
2939         MAGIC *mg;
2940
2941         /* set reginfo->ganch, the position where \G can match */
2942
2943         reginfo->ganch =
2944             (flags & REXEC_IGNOREPOS)
2945             ? stringarg /* use start pos rather than pos() */
2946             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2947               /* Defined pos(): */
2948             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2949             : strbeg; /* pos() not defined; use start of string */
2950
2951         DEBUG_GPOS_r(Perl_re_printf( aTHX_
2952             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
2953
2954         /* in the presence of \G, we may need to start looking earlier in
2955          * the string than the suggested start point of stringarg:
2956          * if prog->gofs is set, then that's a known, fixed minimum
2957          * offset, such as
2958          * /..\G/:   gofs = 2
2959          * /ab|c\G/: gofs = 1
2960          * or if the minimum offset isn't known, then we have to go back
2961          * to the start of the string, e.g. /w+\G/
2962          */
2963
2964         if (prog->intflags & PREGf_ANCH_GPOS) {
2965             if (prog->gofs) {
2966                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
2967                 if (!startpos ||
2968                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
2969                 {
2970                     DEBUG_r(Perl_re_printf( aTHX_
2971                             "fail: ganch-gofs before earliest possible start\n"));
2972                     return 0;
2973                 }
2974             }
2975             else
2976                 startpos = reginfo->ganch;
2977         }
2978         else if (prog->gofs) {
2979             startpos = HOPBACKc(startpos, prog->gofs);
2980             if (!startpos)
2981                 startpos = strbeg;
2982         }
2983         else if (prog->intflags & PREGf_GPOS_FLOAT)
2984             startpos = strbeg;
2985     }
2986
2987     minlen = prog->minlen;
2988     if ((startpos + minlen) > strend || startpos < strbeg) {
2989         DEBUG_r(Perl_re_printf( aTHX_
2990                     "Regex match can't succeed, so not even tried\n"));
2991         return 0;
2992     }
2993
2994     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2995      * which will call destuctors to reset PL_regmatch_state, free higher
2996      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2997      * regmatch_info_aux_eval */
2998
2999     oldsave = PL_savestack_ix;
3000
3001     s = startpos;
3002
3003     if ((prog->extflags & RXf_USE_INTUIT)
3004         && !(flags & REXEC_CHECKED))
3005     {
3006         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3007                                     flags, NULL);
3008         if (!s)
3009             return 0;
3010
3011         if (prog->extflags & RXf_CHECK_ALL) {
3012             /* we can match based purely on the result of INTUIT.
3013              * Set up captures etc just for $& and $-[0]
3014              * (an intuit-only match wont have $1,$2,..) */
3015             assert(!prog->nparens);
3016
3017             /* s/// doesn't like it if $& is earlier than where we asked it to
3018              * start searching (which can happen on something like /.\G/) */
3019             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3020                     && (s < stringarg))
3021             {
3022                 /* this should only be possible under \G */
3023                 assert(prog->intflags & PREGf_GPOS_SEEN);
3024                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3025                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3026                 goto phooey;
3027             }
3028
3029             /* match via INTUIT shouldn't have any captures.
3030              * Let @-, @+, $^N know */
3031             prog->lastparen = prog->lastcloseparen = 0;
3032             RXp_MATCH_UTF8_set(prog, utf8_target);
3033             prog->offs[0].start = s - strbeg;
3034             prog->offs[0].end = utf8_target
3035                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3036                 : s - strbeg + prog->minlenret;
3037             if ( !(flags & REXEC_NOT_FIRST) )
3038                 S_reg_set_capture_string(aTHX_ rx,
3039                                         strbeg, strend,
3040                                         sv, flags, utf8_target);
3041
3042             return 1;
3043         }
3044     }
3045
3046     multiline = prog->extflags & RXf_PMf_MULTILINE;
3047
3048     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3049         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3050                               "String too short [regexec_flags]...\n"));
3051         goto phooey;
3052     }
3053
3054     /* Check validity of program. */
3055     if (UCHARAT(progi->program) != REG_MAGIC) {
3056         Perl_croak(aTHX_ "corrupted regexp program");
3057     }
3058
3059     RXp_MATCH_TAINTED_off(prog);
3060     RXp_MATCH_UTF8_set(prog, utf8_target);
3061
3062     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3063     reginfo->intuit = 0;
3064     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3065     reginfo->warned = FALSE;
3066     reginfo->sv = sv;
3067     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3068     /* see how far we have to get to not match where we matched before */
3069     reginfo->till = stringarg + minend;
3070
3071     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3072         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3073            S_cleanup_regmatch_info_aux has executed (registered by
3074            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3075            magic belonging to this SV.
3076            Not newSVsv, either, as it does not COW.
3077         */
3078         reginfo->sv = newSV(0);
3079         SvSetSV_nosteal(reginfo->sv, sv);
3080         SAVEFREESV(reginfo->sv);
3081     }
3082
3083     /* reserve next 2 or 3 slots in PL_regmatch_state:
3084      * slot N+0: may currently be in use: skip it
3085      * slot N+1: use for regmatch_info_aux struct
3086      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3087      * slot N+3: ready for use by regmatch()
3088      */
3089
3090     {
3091         regmatch_state *old_regmatch_state;
3092         regmatch_slab  *old_regmatch_slab;
3093         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3094
3095         /* on first ever match, allocate first slab */
3096         if (!PL_regmatch_slab) {
3097             Newx(PL_regmatch_slab, 1, regmatch_slab);
3098             PL_regmatch_slab->prev = NULL;
3099             PL_regmatch_slab->next = NULL;
3100             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3101         }
3102
3103         old_regmatch_state = PL_regmatch_state;
3104         old_regmatch_slab  = PL_regmatch_slab;
3105
3106         for (i=0; i <= max; i++) {
3107             if (i == 1)
3108                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3109             else if (i ==2)
3110                 reginfo->info_aux_eval =
3111                 reginfo->info_aux->info_aux_eval =
3112                             &(PL_regmatch_state->u.info_aux_eval);
3113
3114             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3115                 PL_regmatch_state = S_push_slab(aTHX);
3116         }
3117
3118         /* note initial PL_regmatch_state position; at end of match we'll
3119          * pop back to there and free any higher slabs */
3120
3121         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3122         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3123         reginfo->info_aux->poscache = NULL;
3124
3125         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3126
3127         if ((prog->extflags & RXf_EVAL_SEEN))
3128             S_setup_eval_state(aTHX_ reginfo);
3129         else
3130             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3131     }
3132
3133     /* If there is a "must appear" string, look for it. */
3134
3135     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3136         /* We have to be careful. If the previous successful match
3137            was from this regex we don't want a subsequent partially
3138            successful match to clobber the old results.
3139            So when we detect this possibility we add a swap buffer
3140            to the re, and switch the buffer each match. If we fail,
3141            we switch it back; otherwise we leave it swapped.
3142         */
3143         swap = prog->offs;
3144         /* do we need a save destructor here for eval dies? */
3145         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3146         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3147             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3148             0,
3149             PTR2UV(prog),
3150             PTR2UV(swap),
3151             PTR2UV(prog->offs)
3152         ));
3153     }
3154
3155     if (prog->recurse_locinput)
3156         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3157
3158     /* Simplest case: anchored match need be tried only once, or with
3159      * MBOL, only at the beginning of each line.
3160      *
3161      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3162      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3163      * match at the start of the string then it won't match anywhere else
3164      * either; while with /.*.../, if it doesn't match at the beginning,
3165      * the earliest it could match is at the start of the next line */
3166
3167     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3168         char *end;
3169
3170         if (regtry(reginfo, &s))
3171             goto got_it;
3172
3173         if (!(prog->intflags & PREGf_ANCH_MBOL))
3174             goto phooey;
3175
3176         /* didn't match at start, try at other newline positions */
3177
3178         if (minlen)
3179             dontbother = minlen - 1;
3180         end = HOP3c(strend, -dontbother, strbeg) - 1;
3181
3182         /* skip to next newline */
3183
3184         while (s <= end) { /* note it could be possible to match at the end of the string */
3185             /* NB: newlines are the same in unicode as they are in latin */
3186             if (*s++ != '\n')
3187                 continue;
3188             if (prog->check_substr || prog->check_utf8) {
3189             /* note that with PREGf_IMPLICIT, intuit can only fail
3190              * or return the start position, so it's of limited utility.
3191              * Nevertheless, I made the decision that the potential for
3192              * quick fail was still worth it - DAPM */
3193                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3194                 if (!s)
3195                     goto phooey;
3196             }
3197             if (regtry(reginfo, &s))
3198                 goto got_it;
3199         }
3200         goto phooey;
3201     } /* end anchored search */
3202
3203     if (prog->intflags & PREGf_ANCH_GPOS)
3204     {
3205         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3206         assert(prog->intflags & PREGf_GPOS_SEEN);
3207         /* For anchored \G, the only position it can match from is
3208          * (ganch-gofs); we already set startpos to this above; if intuit
3209          * moved us on from there, we can't possibly succeed */
3210         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3211         if (s == startpos && regtry(reginfo, &s))
3212             goto got_it;
3213         goto phooey;
3214     }
3215
3216     /* Messy cases:  unanchored match. */
3217     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3218         /* we have /x+whatever/ */
3219         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3220         char ch;
3221 #ifdef DEBUGGING
3222         int did_match = 0;
3223 #endif
3224         if (utf8_target) {
3225             if (! prog->anchored_utf8) {
3226                 to_utf8_substr(prog);
3227             }
3228             ch = SvPVX_const(prog->anchored_utf8)[0];
3229             REXEC_FBC_SCAN(
3230                 if (*s == ch) {
3231                     DEBUG_EXECUTE_r( did_match = 1 );
3232                     if (regtry(reginfo, &s)) goto got_it;
3233                     s += UTF8SKIP(s);
3234                     while (s < strend && *s == ch)
3235                         s += UTF8SKIP(s);
3236                 }
3237             );
3238
3239         }
3240         else {
3241             if (! prog->anchored_substr) {
3242                 if (! to_byte_substr(prog)) {
3243                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3244                 }
3245             }
3246             ch = SvPVX_const(prog->anchored_substr)[0];
3247             REXEC_FBC_SCAN(
3248                 if (*s == ch) {
3249                     DEBUG_EXECUTE_r( did_match = 1 );
3250                     if (regtry(reginfo, &s)) goto got_it;
3251                     s++;
3252                     while (s < strend && *s == ch)
3253                         s++;
3254                 }
3255             );
3256         }
3257         DEBUG_EXECUTE_r(if (!did_match)
3258                 Perl_re_printf( aTHX_
3259                                   "Did not find anchored character...\n")
3260                );
3261     }
3262     else if (prog->anchored_substr != NULL
3263               || prog->anchored_utf8 != NULL
3264               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3265                   && prog->float_max_offset < strend - s)) {
3266         SV *must;
3267         SSize_t back_max;
3268         SSize_t back_min;
3269         char *last;
3270         char *last1;            /* Last position checked before */
3271 #ifdef DEBUGGING
3272         int did_match = 0;
3273 #endif
3274         if (prog->anchored_substr || prog->anchored_utf8) {
3275             if (utf8_target) {
3276                 if (! prog->anchored_utf8) {
3277                     to_utf8_substr(prog);
3278                 }
3279                 must = prog->anchored_utf8;
3280             }
3281             else {
3282                 if (! prog->anchored_substr) {
3283                     if (! to_byte_substr(prog)) {
3284                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3285                     }
3286                 }
3287                 must = prog->anchored_substr;
3288             }
3289             back_max = back_min = prog->anchored_offset;
3290         } else {
3291             if (utf8_target) {
3292                 if (! prog->float_utf8) {
3293                     to_utf8_substr(prog);
3294                 }
3295                 must = prog->float_utf8;
3296             }
3297             else {
3298                 if (! prog->float_substr) {
3299                     if (! to_byte_substr(prog)) {
3300                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3301                     }
3302                 }
3303                 must = prog->float_substr;
3304             }
3305             back_max = prog->float_max_offset;
3306             back_min = prog->float_min_offset;
3307         }
3308
3309         if (back_min<0) {
3310             last = strend;
3311         } else {
3312             last = HOP3c(strend,        /* Cannot start after this */
3313                   -(SSize_t)(CHR_SVLEN(must)
3314                          - (SvTAIL(must) != 0) + back_min), strbeg);
3315         }
3316         if (s > reginfo->strbeg)
3317             last1 = HOPc(s, -1);
3318         else
3319             last1 = s - 1;      /* bogus */
3320
3321         /* XXXX check_substr already used to find "s", can optimize if
3322            check_substr==must. */
3323         dontbother = 0;
3324         strend = HOPc(strend, -dontbother);
3325         while ( (s <= last) &&
3326                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3327                                   (unsigned char*)strend, must,
3328                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3329             DEBUG_EXECUTE_r( did_match = 1 );
3330             if (HOPc(s, -back_max) > last1) {
3331                 last1 = HOPc(s, -back_min);
3332                 s = HOPc(s, -back_max);
3333             }
3334             else {
3335                 char * const t = (last1 >= reginfo->strbeg)
3336                                     ? HOPc(last1, 1) : last1 + 1;
3337
3338                 last1 = HOPc(s, -back_min);
3339                 s = t;
3340             }
3341             if (utf8_target) {
3342                 while (s <= last1) {
3343                     if (regtry(reginfo, &s))
3344                         goto got_it;
3345                     if (s >= last1) {
3346                         s++; /* to break out of outer loop */
3347                         break;
3348                     }
3349                     s += UTF8SKIP(s);
3350                 }
3351             }
3352             else {
3353                 while (s <= last1) {
3354                     if (regtry(reginfo, &s))
3355                         goto got_it;
3356                     s++;
3357                 }
3358             }
3359         }
3360         DEBUG_EXECUTE_r(if (!did_match) {
3361             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3362                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3363             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3364                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3365                                ? "anchored" : "floating"),
3366                 quoted, RE_SV_TAIL(must));
3367         });
3368         goto phooey;
3369     }
3370     else if ( (c = progi->regstclass) ) {
3371         if (minlen) {
3372             const OPCODE op = OP(progi->regstclass);
3373             /* don't bother with what can't match */
3374             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3375                 strend = HOPc(strend, -(minlen - 1));
3376         }
3377         DEBUG_EXECUTE_r({
3378             SV * const prop = sv_newmortal();
3379             regprop(prog, prop, c, reginfo, NULL);
3380             {
3381                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3382                     s,strend-s,PL_dump_re_max_len);
3383                 Perl_re_printf( aTHX_
3384                     "Matching stclass %.*s against %s (%d bytes)\n",
3385                     (int)SvCUR(prop), SvPVX_const(prop),
3386                      quoted, (int)(strend - s));
3387             }
3388         });
3389         if (find_byclass(prog, c, s, strend, reginfo))
3390             goto got_it;
3391         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3392     }
3393     else {
3394         dontbother = 0;
3395         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3396             /* Trim the end. */
3397             char *last= NULL;
3398             SV* float_real;
3399             STRLEN len;
3400             const char *little;
3401
3402             if (utf8_target) {
3403                 if (! prog->float_utf8) {
3404                     to_utf8_substr(prog);
3405                 }
3406                 float_real = prog->float_utf8;
3407             }
3408             else {
3409                 if (! prog->float_substr) {
3410                     if (! to_byte_substr(prog)) {
3411                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3412                     }
3413                 }
3414                 float_real = prog->float_substr;
3415             }
3416
3417             little = SvPV_const(float_real, len);
3418             if (SvTAIL(float_real)) {
3419                     /* This means that float_real contains an artificial \n on
3420                      * the end due to the presence of something like this:
3421                      * /foo$/ where we can match both "foo" and "foo\n" at the
3422                      * end of the string.  So we have to compare the end of the
3423                      * string first against the float_real without the \n and
3424                      * then against the full float_real with the string.  We
3425                      * have to watch out for cases where the string might be
3426                      * smaller than the float_real or the float_real without
3427                      * the \n. */
3428                     char *checkpos= strend - len;
3429                     DEBUG_OPTIMISE_r(
3430                         Perl_re_printf( aTHX_
3431                             "%sChecking for float_real.%s\n",
3432                             PL_colors[4], PL_colors[5]));
3433                     if (checkpos + 1 < strbeg) {
3434                         /* can't match, even if we remove the trailing \n
3435                          * string is too short to match */
3436                         DEBUG_EXECUTE_r(
3437                             Perl_re_printf( aTHX_
3438                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3439                                 PL_colors[4], PL_colors[5]));
3440                         goto phooey;
3441                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3442                         /* can match, the end of the string matches without the
3443                          * "\n" */
3444                         last = checkpos + 1;
3445                     } else if (checkpos < strbeg) {
3446                         /* cant match, string is too short when the "\n" is
3447                          * included */
3448                         DEBUG_EXECUTE_r(
3449                             Perl_re_printf( aTHX_
3450                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3451                                 PL_colors[4], PL_colors[5]));
3452                         goto phooey;
3453                     } else if (!multiline) {
3454                         /* non multiline match, so compare with the "\n" at the
3455                          * end of the string */
3456                         if (memEQ(checkpos, little, len)) {
3457                             last= checkpos;
3458                         } else {
3459                             DEBUG_EXECUTE_r(
3460                                 Perl_re_printf( aTHX_
3461                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3462                                     PL_colors[4], PL_colors[5]));
3463                             goto phooey;
3464                         }
3465                     } else {
3466                         /* multiline match, so we have to search for a place
3467                          * where the full string is located */
3468                         goto find_last;
3469                     }
3470             } else {
3471                   find_last:
3472                     if (len)
3473                         last = rninstr(s, strend, little, little + len);
3474                     else
3475                         last = strend;  /* matching "$" */
3476             }
3477             if (!last) {
3478                 /* at one point this block contained a comment which was
3479                  * probably incorrect, which said that this was a "should not
3480                  * happen" case.  Even if it was true when it was written I am
3481                  * pretty sure it is not anymore, so I have removed the comment
3482                  * and replaced it with this one. Yves */
3483                 DEBUG_EXECUTE_r(
3484                     Perl_re_printf( aTHX_
3485                         "%sString does not contain required substring, cannot match.%s\n",
3486                         PL_colors[4], PL_colors[5]
3487                     ));
3488                 goto phooey;
3489             }
3490             dontbother = strend - last + prog->float_min_offset;
3491         }
3492         if (minlen && (dontbother < minlen))
3493             dontbother = minlen - 1;
3494         strend -= dontbother;              /* this one's always in bytes! */
3495         /* We don't know much -- general case. */
3496         if (utf8_target) {
3497             for (;;) {
3498                 if (regtry(reginfo, &s))
3499                     goto got_it;
3500                 if (s >= strend)
3501                     break;
3502                 s += UTF8SKIP(s);
3503             };
3504         }
3505         else {
3506             do {
3507                 if (regtry(reginfo, &s))
3508                     goto got_it;
3509             } while (s++ < strend);
3510         }
3511     }
3512
3513     /* Failure. */
3514     goto phooey;
3515
3516   got_it:
3517     /* s/// doesn't like it if $& is earlier than where we asked it to
3518      * start searching (which can happen on something like /.\G/) */
3519     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3520             && (prog->offs[0].start < stringarg - strbeg))
3521     {
3522         /* this should only be possible under \G */
3523         assert(prog->intflags & PREGf_GPOS_SEEN);
3524         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3525             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3526         goto phooey;
3527     }
3528
3529     DEBUG_BUFFERS_r(
3530         if (swap)
3531             Perl_re_exec_indentf( aTHX_
3532                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3533                 0,
3534                 PTR2UV(prog),
3535                 PTR2UV(swap)
3536             );
3537     );
3538     Safefree(swap);
3539
3540     /* clean up; this will trigger destructors that will free all slabs
3541      * above the current one, and cleanup the regmatch_info_aux
3542      * and regmatch_info_aux_eval sructs */
3543
3544     LEAVE_SCOPE(oldsave);
3545
3546     if (RXp_PAREN_NAMES(prog))
3547         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3548
3549     /* make sure $`, $&, $', and $digit will work later */
3550     if ( !(flags & REXEC_NOT_FIRST) )
3551         S_reg_set_capture_string(aTHX_ rx,
3552                                     strbeg, reginfo->strend,
3553                                     sv, flags, utf8_target);
3554
3555     return 1;
3556
3557   phooey:
3558     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3559                           PL_colors[4], PL_colors[5]));
3560
3561     /* clean up; this will trigger destructors that will free all slabs
3562      * above the current one, and cleanup the regmatch_info_aux
3563      * and regmatch_info_aux_eval sructs */
3564
3565     LEAVE_SCOPE(oldsave);
3566
3567     if (swap) {
3568         /* we failed :-( roll it back */
3569         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3570             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3571             0,
3572             PTR2UV(prog),
3573             PTR2UV(prog->offs),
3574             PTR2UV(swap)
3575         ));
3576         Safefree(prog->offs);
3577         prog->offs = swap;
3578     }
3579     return 0;
3580 }
3581
3582
3583 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3584  * Do inc before dec, in case old and new rex are the same */
3585 #define SET_reg_curpm(Re2)                          \
3586     if (reginfo->info_aux_eval) {                   \
3587         (void)ReREFCNT_inc(Re2);                    \
3588         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3589         PM_SETRE((PL_reg_curpm), (Re2));            \
3590     }
3591
3592
3593 /*
3594  - regtry - try match at specific point
3595  */
3596 STATIC bool                     /* 0 failure, 1 success */
3597 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3598 {
3599     CHECKPOINT lastcp;
3600     REGEXP *const rx = reginfo->prog;
3601     regexp *const prog = ReANY(rx);
3602     SSize_t result;
3603 #ifdef DEBUGGING
3604     U32 depth = 0; /* used by REGCP_SET */
3605 #endif
3606     RXi_GET_DECL(prog,progi);
3607     GET_RE_DEBUG_FLAGS_DECL;
3608
3609     PERL_ARGS_ASSERT_REGTRY;
3610
3611     reginfo->cutpoint=NULL;
3612
3613     prog->offs[0].start = *startposp - reginfo->strbeg;
3614     prog->lastparen = 0;
3615     prog->lastcloseparen = 0;
3616
3617     /* XXXX What this code is doing here?!!!  There should be no need
3618        to do this again and again, prog->lastparen should take care of
3619        this!  --ilya*/
3620
3621     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3622      * Actually, the code in regcppop() (which Ilya may be meaning by
3623      * prog->lastparen), is not needed at all by the test suite
3624      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3625      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3626      * Meanwhile, this code *is* needed for the
3627      * above-mentioned test suite tests to succeed.  The common theme
3628      * on those tests seems to be returning null fields from matches.
3629      * --jhi updated by dapm */
3630
3631     /* After encountering a variant of the issue mentioned above I think
3632      * the point Ilya was making is that if we properly unwind whenever
3633      * we set lastparen to a smaller value then we should not need to do
3634      * this every time, only when needed. So if we have tests that fail if
3635      * we remove this, then it suggests somewhere else we are improperly
3636      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3637      * places it is called, and related regcp() routines. - Yves */
3638 #if 1
3639     if (prog->nparens) {
3640         regexp_paren_pair *pp = prog->offs;
3641         I32 i;
3642         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3643             ++pp;
3644             pp->start = -1;
3645             pp->end = -1;
3646         }
3647     }
3648 #endif
3649     REGCP_SET(lastcp);
3650     result = regmatch(reginfo, *startposp, progi->program + 1);
3651     if (result != -1) {
3652         prog->offs[0].end = result;
3653         return 1;
3654     }
3655     if (reginfo->cutpoint)
3656         *startposp= reginfo->cutpoint;
3657     REGCP_UNWIND(lastcp);
3658     return 0;
3659 }
3660
3661
3662 #define sayYES goto yes
3663 #define sayNO goto no
3664 #define sayNO_SILENT goto no_silent
3665
3666 /* we dont use STMT_START/END here because it leads to
3667    "unreachable code" warnings, which are bogus, but distracting. */
3668 #define CACHEsayNO \
3669     if (ST.cache_mask) \
3670        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3671     sayNO
3672
3673 /* this is used to determine how far from the left messages like
3674    'failed...' are printed in regexec.c. It should be set such that
3675    messages are inline with the regop output that created them.
3676 */
3677 #define REPORT_CODE_OFF 29
3678 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3679 #ifdef DEBUGGING
3680 int
3681 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3682 {
3683     va_list ap;
3684     int result;
3685     PerlIO *f= Perl_debug_log;
3686     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3687     va_start(ap, depth);
3688     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3689     result = PerlIO_vprintf(f, fmt, ap);
3690     va_end(ap);
3691     return result;
3692 }
3693 #endif /* DEBUGGING */
3694
3695
3696 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3697 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3698 #define CHRTEST_NOT_A_CP_1 -999
3699 #define CHRTEST_NOT_A_CP_2 -998
3700
3701 /* grab a new slab and return the first slot in it */
3702
3703 STATIC regmatch_state *
3704 S_push_slab(pTHX)
3705 {
3706     regmatch_slab *s = PL_regmatch_slab->next;
3707     if (!s) {
3708         Newx(s, 1, regmatch_slab);
3709         s->prev = PL_regmatch_slab;
3710         s->next = NULL;
3711         PL_regmatch_slab->next = s;
3712     }
3713     PL_regmatch_slab = s;
3714     return SLAB_FIRST(s);
3715 }
3716
3717
3718 /* push a new state then goto it */
3719
3720 #define PUSH_STATE_GOTO(state, node, input) \
3721     pushinput = input; \
3722     scan = node; \
3723     st->resume_state = state; \
3724     goto push_state;
3725
3726 /* push a new state with success backtracking, then goto it */
3727
3728 #define PUSH_YES_STATE_GOTO(state, node, input) \
3729     pushinput = input; \
3730     scan = node; \
3731     st->resume_state = state; \
3732     goto push_yes_state;
3733
3734
3735
3736
3737 /*
3738
3739 regmatch() - main matching routine
3740
3741 This is basically one big switch statement in a loop. We execute an op,
3742 set 'next' to point the next op, and continue. If we come to a point which
3743 we may need to backtrack to on failure such as (A|B|C), we push a
3744 backtrack state onto the backtrack stack. On failure, we pop the top
3745 state, and re-enter the loop at the state indicated. If there are no more
3746 states to pop, we return failure.
3747
3748 Sometimes we also need to backtrack on success; for example /A+/, where
3749 after successfully matching one A, we need to go back and try to
3750 match another one; similarly for lookahead assertions: if the assertion
3751 completes successfully, we backtrack to the state just before the assertion
3752 and then carry on.  In these cases, the pushed state is marked as
3753 'backtrack on success too'. This marking is in fact done by a chain of
3754 pointers, each pointing to the previous 'yes' state. On success, we pop to
3755 the nearest yes state, discarding any intermediate failure-only states.
3756 Sometimes a yes state is pushed just to force some cleanup code to be
3757 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3758 it to free the inner regex.
3759
3760 Note that failure backtracking rewinds the cursor position, while
3761 success backtracking leaves it alone.
3762
3763 A pattern is complete when the END op is executed, while a subpattern
3764 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3765 ops trigger the "pop to last yes state if any, otherwise return true"
3766 behaviour.
3767
3768 A common convention in this function is to use A and B to refer to the two
3769 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3770 the subpattern to be matched possibly multiple times, while B is the entire
3771 rest of the pattern. Variable and state names reflect this convention.
3772
3773 The states in the main switch are the union of ops and failure/success of
3774 substates associated with with that op.  For example, IFMATCH is the op
3775 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3776 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3777 successfully matched A and IFMATCH_A_fail is a state saying that we have
3778 just failed to match A. Resume states always come in pairs. The backtrack
3779 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3780 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3781 on success or failure.
3782
3783 The struct that holds a backtracking state is actually a big union, with
3784 one variant for each major type of op. The variable st points to the
3785 top-most backtrack struct. To make the code clearer, within each
3786 block of code we #define ST to alias the relevant union.
3787
3788 Here's a concrete example of a (vastly oversimplified) IFMATCH
3789 implementation:
3790
3791     switch (state) {
3792     ....
3793
3794 #define ST st->u.ifmatch
3795
3796     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3797         ST.foo = ...; // some state we wish to save
3798         ...
3799         // push a yes backtrack state with a resume value of
3800         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3801         // first node of A:
3802         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3803         // NOTREACHED
3804
3805     case IFMATCH_A: // we have successfully executed A; now continue with B
3806         next = B;
3807         bar = ST.foo; // do something with the preserved value
3808         break;
3809
3810     case IFMATCH_A_fail: // A failed, so the assertion failed
3811         ...;   // do some housekeeping, then ...
3812         sayNO; // propagate the failure
3813
3814 #undef ST
3815
3816     ...
3817     }
3818
3819 For any old-timers reading this who are familiar with the old recursive
3820 approach, the code above is equivalent to:
3821
3822     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3823     {
3824         int foo = ...
3825         ...
3826         if (regmatch(A)) {
3827             next = B;
3828             bar = foo;
3829             break;
3830         }
3831         ...;   // do some housekeeping, then ...
3832         sayNO; // propagate the failure
3833     }
3834
3835 The topmost backtrack state, pointed to by st, is usually free. If you
3836 want to claim it, populate any ST.foo fields in it with values you wish to
3837 save, then do one of
3838
3839         PUSH_STATE_GOTO(resume_state, node, newinput);
3840         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3841
3842 which sets that backtrack state's resume value to 'resume_state', pushes a
3843 new free entry to the top of the backtrack stack, then goes to 'node'.
3844 On backtracking, the free slot is popped, and the saved state becomes the
3845 new free state. An ST.foo field in this new top state can be temporarily
3846 accessed to retrieve values, but once the main loop is re-entered, it
3847 becomes available for reuse.
3848
3849 Note that the depth of the backtrack stack constantly increases during the
3850 left-to-right execution of the pattern, rather than going up and down with
3851 the pattern nesting. For example the stack is at its maximum at Z at the
3852 end of the pattern, rather than at X in the following:
3853
3854     /(((X)+)+)+....(Y)+....Z/
3855
3856 The only exceptions to this are lookahead/behind assertions and the cut,
3857 (?>A), which pop all the backtrack states associated with A before
3858 continuing.
3859
3860 Backtrack state structs are allocated in slabs of about 4K in size.
3861 PL_regmatch_state and st always point to the currently active state,
3862 and PL_regmatch_slab points to the slab currently containing
3863 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3864 allocated, and is never freed until interpreter destruction. When the slab
3865 is full, a new one is allocated and chained to the end. At exit from
3866 regmatch(), slabs allocated since entry are freed.
3867
3868 */
3869
3870
3871 #define DEBUG_STATE_pp(pp)                                  \
3872     DEBUG_STATE_r({                                         \
3873         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
3874         Perl_re_printf( aTHX_                                           \
3875             "%*s" pp " %s%s%s%s%s\n",                       \
3876             INDENT_CHARS(depth), "",                        \
3877             PL_reg_name[st->resume_state],                  \
3878             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3879             ((st==yes_state) ? "Y" : ""),                   \
3880             ((st==mark_state) ? "M" : ""),                  \
3881             ((st==yes_state||st==mark_state) ? "]" : "")    \
3882         );                                                  \
3883     });
3884
3885
3886 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3887
3888 #ifdef DEBUGGING
3889
3890 STATIC void
3891 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3892     const char *start, const char *end, const char *blurb)
3893 {
3894     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3895
3896     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3897
3898     if (!PL_colorset)
3899             reginitcolors();
3900     {
3901         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3902             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
3903
3904         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3905             start, end - start, PL_dump_re_max_len);
3906
3907         Perl_re_printf( aTHX_
3908             "%s%s REx%s %s against %s\n",
3909                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3910
3911         if (utf8_target||utf8_pat)
3912             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
3913                 utf8_pat ? "pattern" : "",
3914                 utf8_pat && utf8_target ? " and " : "",
3915                 utf8_target ? "string" : ""
3916             );
3917     }
3918 }
3919
3920 STATIC void
3921 S_dump_exec_pos(pTHX_ const char *locinput,
3922                       const regnode *scan,
3923                       const char *loc_regeol,
3924                       const char *loc_bostr,
3925                       const char *loc_reg_starttry,
3926                       const bool utf8_target,
3927                       const U32 depth
3928                 )
3929 {
3930     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3931     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3932     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3933     /* The part of the string before starttry has one color
3934        (pref0_len chars), between starttry and current
3935        position another one (pref_len - pref0_len chars),
3936        after the current position the third one.
3937        We assume that pref0_len <= pref_len, otherwise we
3938        decrease pref0_len.  */
3939     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3940         ? (5 + taill) - l : locinput - loc_bostr;
3941     int pref0_len;
3942
3943     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3944
3945     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3946         pref_len++;
3947     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3948     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3949         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3950               ? (5 + taill) - pref_len : loc_regeol - locinput);
3951     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3952         l--;
3953     if (pref0_len < 0)
3954         pref0_len = 0;
3955     if (pref0_len > pref_len)
3956         pref0_len = pref_len;
3957     {
3958         const int is_uni = utf8_target ? 1 : 0;
3959
3960         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3961             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
3962
3963         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3964                     (locinput - pref_len + pref0_len),
3965                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
3966
3967         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3968                     locinput, loc_regeol - locinput, 10, 0, 1);
3969
3970         const STRLEN tlen=len0+len1+len2;
3971         Perl_re_printf( aTHX_
3972                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
3973                     (IV)(locinput - loc_bostr),
3974                     len0, s0,
3975                     len1, s1,
3976                     (docolor ? "" : "> <"),
3977                     len2, s2,
3978                     (int)(tlen > 19 ? 0 :  19 - tlen),
3979                     "",
3980                     depth);
3981     }
3982 }
3983
3984 #endif
3985
3986 /* reg_check_named_buff_matched()
3987  * Checks to see if a named buffer has matched. The data array of
3988  * buffer numbers corresponding to the buffer is expected to reside
3989  * in the regexp->data->data array in the slot stored in the ARG() of
3990  * node involved. Note that this routine doesn't actually care about the
3991  * name, that information is not preserved from compilation to execution.
3992  * Returns the index of the leftmost defined buffer with the given name
3993  * or 0 if non of the buffers matched.
3994  */
3995 STATIC I32
3996 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3997 {
3998     I32 n;
3999     RXi_GET_DECL(rex,rexi);
4000     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4001     I32 *nums=(I32*)SvPVX(sv_dat);
4002
4003     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4004
4005     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4006         if ((I32)rex->lastparen >= nums[n] &&
4007             rex->offs[nums[n]].end != -1)
4008         {
4009             return nums[n];
4010         }
4011     }
4012     return 0;
4013 }
4014
4015
4016 static bool
4017 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4018         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4019 {
4020     /* This function determines if there are one or two characters that match
4021      * the first character of the passed-in EXACTish node <text_node>, and if
4022      * so, returns them in the passed-in pointers.
4023      *
4024      * If it determines that no possible character in the target string can
4025      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4026      * the first character in <text_node> requires UTF-8 to represent, and the
4027      * target string isn't in UTF-8.)
4028      *
4029      * If there are more than two characters that could match the beginning of
4030      * <text_node>, or if more context is required to determine a match or not,
4031      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4032      *
4033      * The motiviation behind this function is to allow the caller to set up
4034      * tight loops for matching.  If <text_node> is of type EXACT, there is
4035      * only one possible character that can match its first character, and so
4036      * the situation is quite simple.  But things get much more complicated if
4037      * folding is involved.  It may be that the first character of an EXACTFish
4038      * node doesn't participate in any possible fold, e.g., punctuation, so it
4039      * can be matched only by itself.  The vast majority of characters that are
4040      * in folds match just two things, their lower and upper-case equivalents.
4041      * But not all are like that; some have multiple possible matches, or match
4042      * sequences of more than one character.  This function sorts all that out.
4043      *
4044      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4045      * loop of trying to match A*, we know we can't exit where the thing
4046      * following it isn't a B.  And something can't be a B unless it is the
4047      * beginning of B.  By putting a quick test for that beginning in a tight
4048      * loop, we can rule out things that can't possibly be B without having to
4049      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4050      * character, we can make a tight loop matching A*, using the outputs of
4051      * this function.
4052      *
4053      * If the target string to match isn't in UTF-8, and there aren't
4054      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4055      * the one or two possible octets (which are characters in this situation)
4056      * that can match.  In all cases, if there is only one character that can
4057      * match, *<c1p> and *<c2p> will be identical.
4058      *
4059      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4060      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4061      * can match the beginning of <text_node>.  They should be declared with at
4062      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4063      * undefined what these contain.)  If one or both of the buffers are
4064      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4065      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4066      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4067      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4068      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4069
4070     const bool utf8_target = reginfo->is_utf8_target;
4071
4072     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4073     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4074     bool use_chrtest_void = FALSE;
4075     const bool is_utf8_pat = reginfo->is_utf8_pat;
4076
4077     /* Used when we have both utf8 input and utf8 output, to avoid converting
4078      * to/from code points */
4079     bool utf8_has_been_setup = FALSE;
4080
4081     dVAR;
4082
4083     U8 *pat = (U8*)STRING(text_node);
4084     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4085
4086     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4087
4088         /* In an exact node, only one thing can be matched, that first
4089          * character.  If both the pat and the target are UTF-8, we can just
4090          * copy the input to the output, avoiding finding the code point of
4091          * that character */
4092         if (!is_utf8_pat) {
4093             c2 = c1 = *pat;
4094         }
4095         else if (utf8_target) {
4096             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4097             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4098             utf8_has_been_setup = TRUE;
4099         }
4100         else {
4101             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4102         }
4103     }
4104     else { /* an EXACTFish node */
4105         U8 *pat_end = pat + STR_LEN(text_node);
4106
4107         /* An EXACTFL node has at least some characters unfolded, because what
4108          * they match is not known until now.  So, now is the time to fold
4109          * the first few of them, as many as are needed to determine 'c1' and
4110          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4111          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4112          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4113          * need to fold as many characters as a single character can fold to,
4114          * so that later we can check if the first ones are such a multi-char
4115          * fold.  But, in such a pattern only locale-problematic characters
4116          * aren't folded, so we can skip this completely if the first character
4117          * in the node isn't one of the tricky ones */
4118         if (OP(text_node) == EXACTFL) {
4119
4120             if (! is_utf8_pat) {
4121                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4122                 {
4123                     folded[0] = folded[1] = 's';
4124                     pat = folded;
4125                     pat_end = folded + 2;
4126                 }
4127             }
4128             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4129                 U8 *s = pat;
4130                 U8 *d = folded;
4131                 int i;
4132
4133                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4134                     if (isASCII(*s)) {
4135                         *(d++) = (U8) toFOLD_LC(*s);
4136                         s++;
4137                     }
4138                     else {
4139                         STRLEN len;
4140                         _toFOLD_utf8_flags(s,
4141                                            pat_end,
4142                                            d,
4143                                            &len,
4144                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4145                         d += len;
4146                         s += UTF8SKIP(s);
4147                     }
4148                 }
4149
4150                 pat = folded;
4151                 pat_end = d;
4152             }
4153         }
4154
4155         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4156              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4157         {
4158             /* Multi-character folds require more context to sort out.  Also
4159              * PL_utf8_foldclosures used below doesn't handle them, so have to
4160              * be handled outside this routine */
4161             use_chrtest_void = TRUE;
4162         }
4163         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4164             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4165             if (c1 > 255) {
4166                 /* Load the folds hash, if not already done */
4167                 SV** listp;
4168                 if (! PL_utf8_foldclosures) {
4169                     _load_PL_utf8_foldclosures();
4170                 }
4171
4172                 /* The fold closures data structure is a hash with the keys
4173                  * being the UTF-8 of every character that is folded to, like
4174                  * 'k', and the values each an array of all code points that
4175                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4176                  * Multi-character folds are not included */
4177                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4178                                         (char *) pat,
4179                                         UTF8SKIP(pat),
4180                                         FALSE))))
4181                 {
4182                     /* Not found in the hash, therefore there are no folds
4183                     * containing it, so there is only a single character that
4184                     * could match */
4185                     c2 = c1;
4186                 }
4187                 else {  /* Does participate in folds */
4188                     AV* list = (AV*) *listp;
4189                     if (av_tindex_skip_len_mg(list) != 1) {
4190
4191                         /* If there aren't exactly two folds to this, it is
4192                          * outside the scope of this function */
4193                         use_chrtest_void = TRUE;
4194                     }
4195                     else {  /* There are two.  Get them */
4196                         SV** c_p = av_fetch(list, 0, FALSE);
4197                         if (c_p == NULL) {
4198                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4199                         }
4200                         c1 = SvUV(*c_p);
4201
4202                         c_p = av_fetch(list, 1, FALSE);
4203                         if (c_p == NULL) {
4204                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4205                         }
4206                         c2 = SvUV(*c_p);
4207
4208                         /* Folds that cross the 255/256 boundary are forbidden
4209                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4210                          * one is ASCIII.  Since the pattern character is above
4211                          * 255, and its only other match is below 256, the only
4212                          * legal match will be to itself.  We have thrown away
4213                          * the original, so have to compute which is the one
4214                          * above 255. */
4215                         if ((c1 < 256) != (c2 < 256)) {
4216                             if ((OP(text_node) == EXACTFL
4217                                  && ! IN_UTF8_CTYPE_LOCALE)
4218                                 || ((OP(text_node) == EXACTFA
4219                                     || OP(text_node) == EXACTFA_NO_TRIE)
4220                                     && (isASCII(c1) || isASCII(c2))))
4221                             {
4222                                 if (c1 < 256) {
4223                                     c1 = c2;
4224                                 }
4225                                 else {
4226                                     c2 = c1;
4227                                 }
4228                             }
4229                         }
4230                     }
4231                 }
4232             }
4233             else /* Here, c1 is <= 255 */
4234                 if (utf8_target
4235                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4236                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4237                     && ((OP(text_node) != EXACTFA
4238                         && OP(text_node) != EXACTFA_NO_TRIE)
4239                         || ! isASCII(c1)))
4240             {
4241                 /* Here, there could be something above Latin1 in the target
4242                  * which folds to this character in the pattern.  All such
4243                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4244                  * than two characters involved in their folds, so are outside
4245                  * the scope of this function */
4246                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4247                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4248                 }
4249                 else {
4250                     use_chrtest_void = TRUE;
4251                 }
4252             }
4253             else { /* Here nothing above Latin1 can fold to the pattern
4254                       character */
4255                 switch (OP(text_node)) {
4256
4257                     case EXACTFL:   /* /l rules */
4258                         c2 = PL_fold_locale[c1];
4259                         break;
4260
4261                     case EXACTF:   /* This node only generated for non-utf8
4262                                     patterns */
4263                         assert(! is_utf8_pat);
4264                         if (! utf8_target) {    /* /d rules */
4265                             c2 = PL_fold[c1];
4266                             break;
4267                         }
4268                         /* FALLTHROUGH */
4269                         /* /u rules for all these.  This happens to work for
4270                         * EXACTFA as nothing in Latin1 folds to ASCII */
4271                     case EXACTFA_NO_TRIE:   /* This node only generated for
4272                                             non-utf8 patterns */
4273                         assert(! is_utf8_pat);
4274                         /* FALLTHROUGH */
4275                     case EXACTFA:
4276                     case EXACTFU_SS:
4277                     case EXACTFU:
4278                         c2 = PL_fold_latin1[c1];
4279                         break;
4280
4281                     default:
4282                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4283                         NOT_REACHED; /* NOTREACHED */
4284                 }
4285             }
4286         }
4287     }
4288
4289     /* Here have figured things out.  Set up the returns */
4290     if (use_chrtest_void) {
4291         *c2p = *c1p = CHRTEST_VOID;
4292     }
4293     else if (utf8_target) {
4294         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4295             uvchr_to_utf8(c1_utf8, c1);
4296             uvchr_to_utf8(c2_utf8, c2);
4297         }
4298
4299         /* Invariants are stored in both the utf8 and byte outputs; Use
4300          * negative numbers otherwise for the byte ones.  Make sure that the
4301          * byte ones are the same iff the utf8 ones are the same */
4302         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4303         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4304                 ? *c2_utf8
4305                 : (c1 == c2)
4306                   ? CHRTEST_NOT_A_CP_1
4307                   : CHRTEST_NOT_A_CP_2;
4308     }
4309     else if (c1 > 255) {
4310        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4311                            can represent */
4312            return FALSE;
4313        }
4314
4315        *c1p = *c2p = c2;    /* c2 is the only representable value */
4316     }
4317     else {  /* c1 is representable; see about c2 */
4318        *c1p = c1;
4319        *c2p = (c2 < 256) ? c2 : c1;
4320     }
4321
4322     return TRUE;
4323 }
4324
4325 STATIC bool
4326 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4327 {
4328     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4329      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4330
4331     PERL_ARGS_ASSERT_ISGCB;
4332
4333     switch (GCB_table[before][after]) {
4334         case GCB_BREAKABLE:
4335             return TRUE;
4336
4337         case GCB_NOBREAK:
4338             return FALSE;
4339
4340         case GCB_RI_then_RI:
4341             {
4342                 int RI_count = 1;
4343                 U8 * temp_pos = (U8 *) curpos;
4344
4345                 /* Do not break within emoji flag sequences. That is, do not
4346                  * break between regional indicator (RI) symbols if there is an
4347                  * odd number of RI characters before the break point.
4348                  *  GB12   sot (RI RI)* RI × RI
4349                  *  GB13 [^RI] (RI RI)* RI × RI */
4350
4351                 while (backup_one_GCB(strbeg,
4352                                     &temp_pos,
4353                                     utf8_target) == GCB_Regional_Indicator)
4354                 {
4355                     RI_count++;
4356                 }
4357
4358                 return RI_count % 2 != 1;
4359             }
4360
4361         case GCB_EX_then_EM:
4362
4363             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4364             {
4365                 U8 * temp_pos = (U8 *) curpos;
4366                 GCB_enum prev;
4367
4368                 do {
4369                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4370                 }
4371                 while (prev == GCB_Extend);
4372
4373                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4374             }
4375
4376         default:
4377             break;
4378     }
4379
4380 #ifdef DEBUGGING
4381     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4382                                   before, after, GCB_table[before][after]);
4383     assert(0);
4384 #endif
4385     return TRUE;
4386 }
4387
4388 STATIC GCB_enum
4389 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4390 {
4391     GCB_enum gcb;
4392
4393     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4394
4395     if (*curpos < strbeg) {
4396         return GCB_EDGE;
4397     }
4398
4399     if (utf8_target) {
4400         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4401         U8 * prev_prev_char_pos;
4402
4403         if (! prev_char_pos) {
4404             return GCB_EDGE;
4405         }
4406
4407         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4408             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4409             *curpos = prev_char_pos;
4410             prev_char_pos = prev_prev_char_pos;
4411         }
4412         else {
4413             *curpos = (U8 *) strbeg;
4414             return GCB_EDGE;
4415         }
4416     }
4417     else {
4418         if (*curpos - 2 < strbeg) {
4419             *curpos = (U8 *) strbeg;
4420             return GCB_EDGE;
4421         }
4422         (*curpos)--;
4423         gcb = getGCB_VAL_CP(*(*curpos - 1));
4424     }
4425
4426     return gcb;
4427 }
4428
4429 /* Combining marks attach to most classes that precede them, but this defines
4430  * the exceptions (from TR14) */
4431 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4432                                      || prev == LB_Mandatory_Break      \
4433                                      || prev == LB_Carriage_Return      \
4434                                      || prev == LB_Line_Feed            \
4435                                      || prev == LB_Next_Line            \
4436                                      || prev == LB_Space                \
4437                                      || prev == LB_ZWSpace))
4438
4439 STATIC bool
4440 S_isLB(pTHX_ LB_enum before,
4441              LB_enum after,
4442              const U8 * const strbeg,
4443              const U8 * const curpos,
4444              const U8 * const strend,
4445              const bool utf8_target)
4446 {
4447     U8 * temp_pos = (U8 *) curpos;
4448     LB_enum prev = before;
4449
4450     /* Is the boundary between 'before' and 'after' line-breakable?
4451      * Most of this is just a table lookup of a generated table from Unicode
4452      * rules.  But some rules require context to decide, and so have to be
4453      * implemented in code */
4454
4455     PERL_ARGS_ASSERT_ISLB;
4456
4457     /* Rule numbers in the comments below are as of Unicode 9.0 */
4458
4459   redo:
4460     before = prev;
4461     switch (LB_table[before][after]) {
4462         case LB_BREAKABLE:
4463             return TRUE;
4464
4465         case LB_NOBREAK:
4466         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4467             return FALSE;
4468
4469         case LB_SP_foo + LB_BREAKABLE:
4470         case LB_SP_foo + LB_NOBREAK:
4471         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4472
4473             /* When we have something following a SP, we have to look at the
4474              * context in order to know what to do.
4475              *
4476              * SP SP should not reach here because LB7: Do not break before
4477              * spaces.  (For two spaces in a row there is nothing that
4478              * overrides that) */
4479             assert(after != LB_Space);
4480
4481             /* Here we have a space followed by a non-space.  Mostly this is a
4482              * case of LB18: "Break after spaces".  But there are complications
4483              * as the handling of spaces is somewhat tricky.  They are in a
4484              * number of rules, which have to be applied in priority order, but
4485              * something earlier in the string can cause a rule to be skipped
4486              * and a lower priority rule invoked.  A prime example is LB7 which
4487              * says don't break before a space.  But rule LB8 (lower priority)
4488              * says that the first break opportunity after a ZW is after any
4489              * span of spaces immediately after it.  If a ZW comes before a SP
4490              * in the input, rule LB8 applies, and not LB7.  Other such rules
4491              * involve combining marks which are rules 9 and 10, but they may
4492              * override higher priority rules if they come earlier in the
4493              * string.  Since we're doing random access into the middle of the
4494              * string, we have to look for rules that should get applied based
4495              * on both string position and priority.  Combining marks do not
4496              * attach to either ZW nor SP, so we don't have to consider them
4497              * until later.
4498              *
4499              * To check for LB8, we have to find the first non-space character
4500              * before this span of spaces */
4501             do {
4502                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4503             }
4504             while (prev == LB_Space);
4505
4506             /* LB8 Break before any character following a zero-width space,
4507              * even if one or more spaces intervene.
4508              *      ZW SP* ÷
4509              * So if we have a ZW just before this span, and to get here this
4510              * is the final space in the span. */
4511             if (prev == LB_ZWSpace) {
4512                 return TRUE;
4513             }
4514
4515             /* Here, not ZW SP+.  There are several rules that have higher
4516              * priority than LB18 and can be resolved now, as they don't depend
4517              * on anything earlier in the string (except ZW, which we have
4518              * already handled).  One of these rules is LB11 Do not break
4519              * before Word joiner, but we have specially encoded that in the
4520              * lookup table so it is caught by the single test below which
4521              * catches the other ones. */
4522             if (LB_table[LB_Space][after] - LB_SP_foo
4523                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4524             {
4525                 return FALSE;
4526             }
4527
4528             /* If we get here, we have to XXX consider combining marks. */
4529             if (prev == LB_Combining_Mark) {
4530
4531                 /* What happens with these depends on the character they
4532                  * follow.  */
4533                 do {
4534                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4535                 }
4536                 while (prev == LB_Combining_Mark);
4537
4538                 /* Most times these attach to and inherit the characteristics
4539                  * of that character, but not always, and when not, they are to
4540                  * be treated as AL by rule LB10. */
4541                 if (! LB_CM_ATTACHES_TO(prev)) {
4542                     prev = LB_Alphabetic;
4543                 }
4544             }
4545
4546             /* Here, we have the character preceding the span of spaces all set
4547              * up.  We follow LB18: "Break after spaces" unless the table shows
4548              * that is overriden */
4549             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4550
4551         case LB_CM_ZWJ_foo:
4552
4553             /* We don't know how to treat the CM except by looking at the first
4554              * non-CM character preceding it.  ZWJ is treated as CM */
4555             do {
4556                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4557             }
4558             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4559
4560             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4561              * attatches to it, then it inherits the behavior of 'prev'.  If it
4562              * doesn't attach, it is to be treated as an AL */
4563             if (! LB_CM_ATTACHES_TO(prev)) {
4564                 prev = LB_Alphabetic;
4565             }
4566
4567             goto redo;
4568
4569         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4570         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4571
4572             /* LB21a Don't break after Hebrew + Hyphen.
4573              * HL (HY | BA) × */
4574
4575             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4576                                                           == LB_Hebrew_Letter)
4577             {
4578                 return FALSE;
4579             }
4580
4581             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4582
4583         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4584         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4585
4586             /* LB25a (PR | PO) × ( OP | HY )? NU */
4587             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4588                 return FALSE;
4589             }
4590
4591             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4592                                                                 == LB_BREAKABLE;
4593
4594         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4595         case LB_SY_or_IS_then_various + LB_NOBREAK:
4596         {
4597             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4598
4599             LB_enum temp = prev;
4600             do {
4601                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4602             }
4603             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4604             if (temp == LB_Numeric) {
4605                 return FALSE;
4606             }
4607
4608             return LB_table[prev][after] - LB_SY_or_IS_then_various
4609                                                                == LB_BREAKABLE;
4610         }
4611
4612         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4613         case LB_various_then_PO_or_PR + LB_NOBREAK:
4614         {
4615             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4616
4617             LB_enum temp = prev;
4618             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4619             {
4620                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4621             }
4622             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4623                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4624             }
4625             if (temp == LB_Numeric) {
4626                 return FALSE;
4627             }
4628             return LB_various_then_PO_or_PR;
4629         }
4630
4631         case LB_RI_then_RI + LB_NOBREAK:
4632         case LB_RI_then_RI + LB_BREAKABLE:
4633             {
4634                 int RI_count = 1;
4635
4636                 /* LB30a Break between two regional indicator symbols if and
4637                  * only if there are an even number of regional indicators
4638                  * preceding the position of the break.
4639                  *
4640                  *    sot (RI RI)* RI × RI
4641                  *  [^RI] (RI RI)* RI × RI */
4642
4643                 while (backup_one_LB(strbeg,
4644                                      &temp_pos,
4645                                      utf8_target) == LB_Regional_Indicator)
4646                 {
4647                     RI_count++;
4648                 }
4649
4650                 return RI_count % 2 == 0;
4651             }
4652
4653         default:
4654             break;
4655     }
4656
4657 #ifdef DEBUGGING
4658     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4659                                   before, after, LB_table[before][after]);
4660     assert(0);
4661 #endif
4662     return TRUE;
4663 }
4664
4665 STATIC LB_enum
4666 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4667 {
4668     LB_enum lb;
4669
4670     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4671
4672     if (*curpos >= strend) {
4673         return LB_EDGE;
4674     }
4675
4676     if (utf8_target) {
4677         *curpos += UTF8SKIP(*curpos);
4678         if (*curpos >= strend) {
4679             return LB_EDGE;
4680         }
4681         lb = getLB_VAL_UTF8(*curpos, strend);
4682     }
4683     else {
4684         (*curpos)++;
4685         if (*curpos >= strend) {
4686             return LB_EDGE;
4687         }
4688         lb = getLB_VAL_CP(**curpos);
4689     }
4690
4691     return lb;
4692 }
4693
4694 STATIC LB_enum
4695 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4696 {
4697     LB_enum lb;
4698
4699     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4700
4701     if (*curpos < strbeg) {
4702         return LB_EDGE;
4703     }
4704
4705     if (utf8_target) {
4706         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4707         U8 * prev_prev_char_pos;
4708
4709         if (! prev_char_pos) {
4710             return LB_EDGE;
4711         }
4712
4713         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4714             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4715             *curpos = prev_char_pos;
4716             prev_char_pos = prev_prev_char_pos;
4717         }
4718         else {
4719             *curpos = (U8 *) strbeg;
4720             return LB_EDGE;
4721         }
4722     }
4723     else {
4724         if (*curpos - 2 < strbeg) {
4725             *curpos = (U8 *) strbeg;
4726             return LB_EDGE;
4727         }
4728         (*curpos)--;
4729         lb = getLB_VAL_CP(*(*curpos - 1));
4730     }
4731
4732     return lb;
4733 }
4734
4735 STATIC bool
4736 S_isSB(pTHX_ SB_enum before,
4737              SB_enum after,
4738              const U8 * const strbeg,
4739              const U8 * const curpos,
4740              const U8 * const strend,
4741              const bool utf8_target)
4742 {
4743     /* returns a boolean indicating if there is a Sentence Boundary Break
4744      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4745
4746     U8 * lpos = (U8 *) curpos;
4747     bool has_para_sep = FALSE;
4748     bool has_sp = FALSE;
4749
4750     PERL_ARGS_ASSERT_ISSB;
4751
4752     /* Break at the start and end of text.
4753         SB1.  sot  ÷
4754         SB2.  ÷  eot
4755       But unstated in Unicode is don't break if the text is empty */
4756     if (before == SB_EDGE || after == SB_EDGE) {
4757         return before != after;
4758     }
4759
4760     /* SB 3: Do not break within CRLF. */
4761     if (before == SB_CR && after == SB_LF) {
4762         return FALSE;
4763     }
4764
4765     /* Break after paragraph separators.  CR and LF are considered
4766      * so because Unicode views text as like word processing text where there
4767      * are no newlines except between paragraphs, and the word processor takes
4768      * care of wrapping without there being hard line-breaks in the text *./
4769        SB4.  Sep | CR | LF  ÷ */
4770     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4771         return TRUE;
4772     }
4773
4774     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4775      * (See Section 6.2, Replacing Ignore Rules.)
4776         SB5.  X (Extend | Format)*  →  X */
4777     if (after == SB_Extend || after == SB_Format) {
4778
4779         /* Implied is that the these characters attach to everything
4780          * immediately prior to them except for those separator-type
4781          * characters.  And the rules earlier have already handled the case
4782          * when one of those immediately precedes the extend char */
4783         return FALSE;
4784     }
4785
4786     if (before == SB_Extend || before == SB_Format) {
4787         U8 * temp_pos = lpos;
4788         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4789         if (   backup != SB_EDGE
4790             && backup != SB_Sep
4791             && backup != SB_CR
4792             && backup != SB_LF)
4793         {
4794             before = backup;
4795             lpos = temp_pos;
4796         }
4797
4798         /* Here, both 'before' and 'backup' are these types; implied is that we
4799          * don't break between them */
4800         if (backup == SB_Extend || backup == SB_Format) {
4801             return FALSE;
4802         }
4803     }
4804
4805     /* Do not break after ambiguous terminators like period, if they are
4806      * immediately followed by a number or lowercase letter, if they are
4807      * between uppercase letters, if the first following letter (optionally
4808      * after certain punctuation) is lowercase, or if they are followed by
4809      * "continuation" punctuation such as comma, colon, or semicolon. For
4810      * example, a period may be an abbreviation or numeric period, and thus may
4811      * not mark the end of a sentence.
4812
4813      * SB6. ATerm  ×  Numeric */
4814     if (before == SB_ATerm && after == SB_Numeric) {
4815         return FALSE;
4816     }
4817
4818     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4819     if (before == SB_ATerm && after == SB_Upper) {
4820         U8 * temp_pos = lpos;
4821         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4822         if (backup == SB_Upper || backup == SB_Lower) {
4823             return FALSE;
4824         }
4825     }
4826
4827     /* The remaining rules that aren't the final one, all require an STerm or
4828      * an ATerm after having backed up over some Close* Sp*, and in one case an
4829      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4830      * So do that backup now, setting flags if either Sp or a paragraph
4831      * separator are found */
4832
4833     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4834         has_para_sep = TRUE;
4835         before = backup_one_SB(strbeg, &lpos, utf8_target);
4836     }
4837
4838     if (before == SB_Sp) {
4839         has_sp = TRUE;
4840         do {
4841             before = backup_one_SB(strbeg, &lpos, utf8_target);
4842         }
4843         while (before == SB_Sp);
4844     }
4845
4846     while (before == SB_Close) {
4847         before = backup_one_SB(strbeg, &lpos, utf8_target);
4848     }
4849
4850     /* The next few rules apply only when the backed-up-to is an ATerm, and in
4851      * most cases an STerm */
4852     if (before == SB_STerm || before == SB_ATerm) {
4853
4854         /* So, here the lhs matches
4855          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
4856          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
4857          * The rules that apply here are:
4858          *
4859          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
4860                                            | LF | STerm | ATerm) )* Lower
4861            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4862            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
4863            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
4864            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
4865          */
4866
4867         /* And all but SB11 forbid having seen a paragraph separator */
4868         if (! has_para_sep) {
4869             if (before == SB_ATerm) {          /* SB8 */
4870                 U8 * rpos = (U8 *) curpos;
4871                 SB_enum later = after;
4872
4873                 while (    later != SB_OLetter
4874                         && later != SB_Upper
4875                         && later != SB_Lower
4876                         && later != SB_Sep
4877                         && later != SB_CR
4878                         && later != SB_LF
4879                         && later != SB_STerm
4880                         && later != SB_ATerm
4881                         && later != SB_EDGE)
4882                 {
4883                     later = advance_one_SB(&rpos, strend, utf8_target);
4884                 }
4885                 if (later == SB_Lower) {
4886                     return FALSE;
4887                 }
4888             }
4889
4890             if (   after == SB_SContinue    /* SB8a */
4891                 || after == SB_STerm
4892                 || after == SB_ATerm)
4893             {
4894                 return FALSE;
4895             }
4896
4897             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
4898                 if (   after == SB_Close
4899                     || after == SB_Sp
4900                     || after == SB_Sep
4901                     || after == SB_CR
4902                     || after == SB_LF)
4903                 {
4904                     return FALSE;
4905                 }
4906             }
4907
4908             /* SB10.  This and SB9 could probably be combined some way, but khw
4909              * has decided to follow the Unicode rule book precisely for
4910              * simplified maintenance */
4911             if (   after == SB_Sp
4912                 || after == SB_Sep
4913                 || after == SB_CR
4914                 || after == SB_LF)
4915             {
4916                 return FALSE;
4917             }
4918         }
4919
4920         /* SB11.  */
4921         return TRUE;
4922     }
4923
4924     /* Otherwise, do not break.
4925     SB12.  Any  ×  Any */
4926
4927     return FALSE;
4928 }
4929
4930 STATIC SB_enum
4931 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4932 {
4933     SB_enum sb;
4934
4935     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4936
4937     if (*curpos >= strend) {
4938         return SB_EDGE;
4939     }
4940
4941     if (utf8_target) {
4942         do {
4943             *curpos += UTF8SKIP(*curpos);
4944             if (*curpos >= strend) {
4945                 return SB_EDGE;
4946             }
4947             sb = getSB_VAL_UTF8(*curpos, strend);
4948         } while (sb == SB_Extend || sb == SB_Format);
4949     }
4950     else {
4951         do {
4952             (*curpos)++;
4953             if (*curpos >= strend) {
4954                 return SB_EDGE;
4955             }
4956             sb = getSB_VAL_CP(**curpos);
4957         } while (sb == SB_Extend || sb == SB_Format);
4958     }
4959
4960     return sb;
4961 }
4962
4963 STATIC SB_enum
4964 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4965 {
4966     SB_enum sb;
4967
4968     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4969
4970     if (*curpos < strbeg) {
4971         return SB_EDGE;
4972     }
4973
4974     if (utf8_target) {
4975         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4976         if (! prev_char_pos) {
4977             return SB_EDGE;
4978         }
4979
4980         /* Back up over Extend and Format.  curpos is always just to the right
4981          * of the characater whose value we are getting */
4982         do {
4983             U8 * prev_prev_char_pos;
4984             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4985                                                                       strbeg)))
4986             {
4987                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4988                 *curpos = prev_char_pos;
4989                 prev_char_pos = prev_prev_char_pos;
4990             }
4991             else {
4992                 *curpos = (U8 *) strbeg;
4993                 return SB_EDGE;
4994             }
4995         } while (sb == SB_Extend || sb == SB_Format);
4996     }
4997     else {
4998         do {
4999             if (*curpos - 2 < strbeg) {
5000                 *curpos = (U8 *) strbeg;
5001                 return SB_EDGE;
5002             }
5003             (*curpos)--;
5004             sb = getSB_VAL_CP(*(*curpos - 1));
5005         } while (sb == SB_Extend || sb == SB_Format);
5006     }
5007
5008     return sb;
5009 }
5010
5011 STATIC bool
5012 S_isWB(pTHX_ WB_enum previous,
5013              WB_enum before,
5014              WB_enum after,
5015              const U8 * const strbeg,
5016              const U8 * const curpos,
5017              const U8 * const strend,
5018              const bool utf8_target)
5019 {
5020     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5021      *  a Unicode word break, using their published algorithm, but tailored for
5022      *  Perl by treating spans of white space as one unit.  Context may be
5023      *  needed to make this determination.  If the value for the character
5024      *  before 'before' is known, it is passed as 'previous'; otherwise that
5025      *  should be set to WB_UNKNOWN.  The other input parameters give the
5026      *  boundaries and current position in the matching of the string.  That
5027      *  is, 'curpos' marks the position where the character whose wb value is
5028      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5029
5030     U8 * before_pos = (U8 *) curpos;
5031     U8 * after_pos = (U8 *) curpos;
5032     WB_enum prev = before;
5033     WB_enum next;
5034
5035     PERL_ARGS_ASSERT_ISWB;
5036
5037     /* Rule numbers in the comments below are as of Unicode 9.0 */
5038
5039   redo:
5040     before = prev;
5041     switch (WB_table[before][after]) {
5042         case WB_BREAKABLE:
5043             return TRUE;
5044
5045         case WB_NOBREAK:
5046             return FALSE;
5047
5048         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5049             next = advance_one_WB(&after_pos, strend, utf8_target,
5050                                  FALSE /* Don't skip Extend nor Format */ );
5051             /* A space immediately preceeding an Extend or Format is attached
5052              * to by them, and hence gets separated from previous spaces.
5053              * Otherwise don't break between horizontal white space */
5054             return next == WB_Extend || next == WB_Format;
5055
5056         /* WB4 Ignore Format and Extend characters, except when they appear at
5057          * the beginning of a region of text.  This code currently isn't
5058          * general purpose, but it works as the rules are currently and likely
5059          * to be laid out.  The reason it works is that when 'they appear at
5060          * the beginning of a region of text', the rule is to break before
5061          * them, just like any other character.  Therefore, the default rule
5062          * applies and we don't have to look in more depth.  Should this ever
5063          * change, we would have to have 2 'case' statements, like in the rules
5064          * below, and backup a single character (not spacing over the extend
5065          * ones) and then see if that is one of the region-end characters and
5066          * go from there */
5067         case WB_Ex_or_FO_or_ZWJ_then_foo:
5068             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5069             goto redo;
5070
5071         case WB_DQ_then_HL + WB_BREAKABLE:
5072         case WB_DQ_then_HL + WB_NOBREAK:
5073
5074             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5075
5076             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5077                                                             == WB_Hebrew_Letter)
5078             {
5079                 return FALSE;
5080             }
5081
5082              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5083
5084         case WB_HL_then_DQ + WB_BREAKABLE:
5085         case WB_HL_then_DQ + WB_NOBREAK:
5086
5087             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5088
5089             if (advance_one_WB(&after_pos, strend, utf8_target,
5090                                        TRUE /* Do skip Extend and Format */ )
5091                                                             == WB_Hebrew_Letter)
5092             {
5093                 return FALSE;
5094             }
5095
5096             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5097
5098         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5099         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5100
5101             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5102              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5103
5104             next = advance_one_WB(&after_pos, strend, utf8_target,
5105                                        TRUE /* Do skip Extend and Format */ );
5106
5107             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5108             {
5109                 return FALSE;
5110             }
5111
5112             return WB_table[before][after]
5113                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5114
5115         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5116         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5117
5118             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5119              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5120
5121             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5122             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5123             {
5124                 return FALSE;
5125             }
5126
5127             return WB_table[before][after]
5128                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5129
5130         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5131         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5132
5133             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5134              * */
5135
5136             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5137                                                             == WB_Numeric)
5138             {
5139                 return FALSE;
5140             }
5141
5142             return WB_table[before][after]
5143                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5144
5145         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5146         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5147
5148             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5149
5150             if (advance_one_WB(&after_pos, strend, utf8_target,
5151                                        TRUE /* Do skip Extend and Format */ )
5152                                                             == WB_Numeric)
5153             {
5154                 return FALSE;
5155             }
5156
5157             return WB_table[before][after]
5158                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5159
5160         case WB_RI_then_RI + WB_NOBREAK:
5161         case WB_RI_then_RI + WB_BREAKABLE:
5162             {
5163                 int RI_count = 1;
5164
5165                 /* Do not break within emoji flag sequences. That is, do not
5166                  * break between regional indicator (RI) symbols if there is an
5167                  * odd number of RI characters before the potential break
5168                  * point.
5169                  *
5170                  * WB15   sot (RI RI)* RI × RI
5171                  * WB16 [^RI] (RI RI)* RI × RI */
5172
5173                 while (backup_one_WB(&previous,
5174                                      strbeg,
5175                                      &before_pos,
5176                                      utf8_target) == WB_Regional_Indicator)
5177                 {
5178                     RI_count++;
5179                 }
5180
5181                 return RI_count % 2 != 1;
5182             }
5183
5184         default:
5185             break;
5186     }
5187
5188 #ifdef DEBUGGING
5189     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5190                                   before, after, WB_table[before][after]);
5191     assert(0);
5192 #endif
5193     return TRUE;
5194 }
5195
5196 STATIC WB_enum
5197 S_advance_one_WB(pTHX_ U8 ** curpos,
5198                        const U8 * const strend,
5199                        const bool utf8_target,
5200                        const bool skip_Extend_Format)
5201 {
5202     WB_enum wb;
5203
5204     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5205
5206     if (*curpos >= strend) {
5207         return WB_EDGE;
5208     }
5209
5210     if (utf8_target) {
5211
5212         /* Advance over Extend and Format */
5213         do {
5214             *curpos += UTF8SKIP(*curpos);
5215             if (*curpos >= strend) {
5216                 return WB_EDGE;
5217             }
5218             wb = getWB_VAL_UTF8(*curpos, strend);
5219         } while (    skip_Extend_Format
5220                  && (wb == WB_Extend || wb == WB_Format));
5221     }
5222     else {
5223         do {
5224             (*curpos)++;
5225             if (*curpos >= strend) {
5226                 return WB_EDGE;
5227             }
5228             wb = getWB_VAL_CP(**curpos);
5229         } while (    skip_Extend_Format
5230                  && (wb == WB_Extend || wb == WB_Format));
5231     }
5232
5233     return wb;
5234 }
5235
5236 STATIC WB_enum
5237 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5238 {
5239     WB_enum wb;
5240
5241     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5242
5243     /* If we know what the previous character's break value is, don't have
5244         * to look it up */
5245     if (*previous != WB_UNKNOWN) {
5246         wb = *previous;
5247
5248         /* But we need to move backwards by one */
5249         if (utf8_target) {
5250             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5251             if (! *curpos) {
5252                 *previous = WB_EDGE;
5253                 *curpos = (U8 *) strbeg;
5254             }
5255             else {
5256                 *previous = WB_UNKNOWN;
5257             }
5258         }
5259         else {
5260             (*curpos)--;
5261             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5262         }
5263
5264         /* And we always back up over these three types */
5265         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5266             return wb;
5267         }
5268     }
5269
5270     if (*curpos < strbeg) {
5271         return WB_EDGE;
5272     }
5273
5274     if (utf8_target) {
5275         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5276         if (! prev_char_pos) {
5277             return WB_EDGE;
5278         }
5279
5280         /* Back up over Extend and Format.  curpos is always just to the right
5281          * of the characater whose value we are getting */
5282         do {
5283             U8 * prev_prev_char_pos;
5284             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5285                                                    -1,
5286                                                    strbeg)))
5287             {
5288                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5289                 *curpos = prev_char_pos;
5290                 prev_char_pos = prev_prev_char_pos;
5291             }
5292             else {
5293                 *curpos = (U8 *) strbeg;
5294                 return WB_EDGE;
5295             }
5296         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5297     }
5298     else {
5299         do {
5300             if (*curpos - 2 < strbeg) {
5301                 *curpos = (U8 *) strbeg;
5302                 return WB_EDGE;
5303             }
5304             (*curpos)--;
5305             wb = getWB_VAL_CP(*(*curpos - 1));
5306         } while (wb == WB_Extend || wb == WB_Format);
5307     }
5308
5309     return wb;
5310 }
5311
5312 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5313 (                                                           \
5314     (   ( st )                                         ) && \
5315     (   ( st )->u.eval.close_paren                     ) && \
5316     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5317 )
5318
5319 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5320 (                                                           \
5321     (   ( st )                                         ) && \
5322     (   ( st )->u.eval.close_paren                     ) && \
5323     (   ( expr )                                       ) && \
5324     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5325 )
5326
5327
5328 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5329     (st)->u.eval.close_paren = ( (expr) + 1 )
5330
5331 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5332     (st)->u.eval.close_paren = 0
5333
5334 /* returns -1 on failure, $+[0] on success */
5335 STATIC SSize_t
5336 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5337 {
5338     dVAR;
5339     const bool utf8_target = reginfo->is_utf8_target;
5340     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5341     REGEXP *rex_sv = reginfo->prog;
5342     regexp *rex = ReANY(rex_sv);
5343     RXi_GET_DECL(rex,rexi);
5344     /* the current state. This is a cached copy of PL_regmatch_state */
5345     regmatch_state *st;
5346     /* cache heavy used fields of st in registers */
5347     regnode *scan;
5348     regnode *next;
5349     U32 n = 0;  /* general value; init to avoid compiler warning */
5350     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5351     SSize_t endref = 0; /* offset of end of backref when ln is start */
5352     char *locinput = startpos;
5353     char *pushinput; /* where to continue after a PUSH */
5354     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5355
5356     bool result = 0;        /* return value of S_regmatch */
5357     U32 depth = 0;            /* depth of backtrack stack */
5358     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5359     const U32 max_nochange_depth =
5360         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5361         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5362     regmatch_state *yes_state = NULL; /* state to pop to on success of
5363                                                             subpattern */
5364     /* mark_state piggy backs on the yes_state logic so that when we unwind
5365        the stack on success we can update the mark_state as we go */
5366     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5367     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5368     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5369     U32 state_num;
5370     bool no_final = 0;      /* prevent failure from backtracking? */
5371     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5372     char *startpoint = locinput;
5373     SV *popmark = NULL;     /* are we looking for a mark? */
5374     SV *sv_commit = NULL;   /* last mark name seen in failure */
5375     SV *sv_yes_mark = NULL; /* last mark name we have seen
5376                                during a successful match */
5377     U32 lastopen = 0;       /* last open we saw */
5378     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5379     SV* const oreplsv = GvSVn(PL_replgv);
5380     /* these three flags are set by various ops to signal information to
5381      * the very next op. They have a useful lifetime of exactly one loop
5382      * iteration, and are not preserved or restored by state pushes/pops
5383      */
5384     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5385     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5386     int logical = 0;        /* the following EVAL is:
5387                                 0: (?{...})
5388                                 1: (?(?{...})X|Y)
5389                                 2: (??{...})
5390                                or the following IFMATCH/UNLESSM is:
5391                                 false: plain (?=foo)
5392                                 true:  used as a condition: (?(?=foo))
5393                             */
5394     PAD* last_pad = NULL;
5395     dMULTICALL;
5396     U8 gimme = G_SCALAR;
5397     CV *caller_cv = NULL;       /* who called us */
5398     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5399     U32 maxopenparen = 0;       /* max '(' index seen so far */
5400     int to_complement;  /* Invert the result? */
5401     _char_class_number classnum;
5402     bool is_utf8_pat = reginfo->is_utf8_pat;
5403     bool match = FALSE;
5404     I32 orig_savestack_ix = PL_savestack_ix;
5405
5406 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5407 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5408 #  define SOLARIS_BAD_OPTIMIZER
5409     const U32 *pl_charclass_dup = PL_charclass;
5410 #  define PL_charclass pl_charclass_dup
5411 #endif
5412
5413 #ifdef DEBUGGING
5414     GET_RE_DEBUG_FLAGS_DECL;
5415 #endif
5416
5417     /* protect against undef(*^R) */
5418     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5419
5420     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5421     multicall_oldcatch = 0;
5422     PERL_UNUSED_VAR(multicall_cop);
5423
5424     PERL_ARGS_ASSERT_REGMATCH;
5425
5426     st = PL_regmatch_state;
5427
5428     /* Note that nextchr is a byte even in UTF */
5429     SET_nextchr;
5430     scan = prog;
5431
5432     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5433             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5434             Perl_re_printf( aTHX_ "regmatch start\n" );
5435     }));
5436
5437     while (scan != NULL) {
5438
5439
5440         next = scan + NEXT_OFF(scan);
5441         if (next == scan)
5442             next = NULL;
5443         state_num = OP(scan);
5444
5445       reenter_switch:
5446         DEBUG_EXECUTE_r(
5447             if (state_num <= REGNODE_MAX) {
5448                 SV * const prop = sv_newmortal();
5449                 regnode *rnext = regnext(scan);
5450
5451                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5452                 regprop(rex, prop, scan, reginfo, NULL);
5453                 Perl_re_printf( aTHX_
5454                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5455                     INDENT_CHARS(depth), "",
5456                     (IV)(scan - rexi->program),
5457                     SvPVX_const(prop),
5458                     (PL_regkind[OP(scan)] == END || !rnext) ?
5459                         0 : (IV)(rnext - rexi->program));
5460             }
5461         );
5462
5463         to_complement = 0;
5464
5465         SET_nextchr;
5466         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5467
5468         switch (state_num) {
5469         case SBOL: /*  /^../ and /\A../  */
5470             if (locinput == reginfo->strbeg)
5471                 break;
5472             sayNO;
5473
5474         case MBOL: /*  /^../m  */
5475             if (locinput == reginfo->strbeg ||
5476                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5477             {
5478                 break;
5479             }
5480             sayNO;
5481
5482         case GPOS: /*  \G  */
5483             if (locinput == reginfo->ganch)
5484                 break;
5485             sayNO;
5486
5487         case KEEPS: /*   \K  */
5488             /* update the startpoint */
5489             st->u.keeper.val = rex->offs[0].start;
5490             rex->offs[0].start = locinput - reginfo->strbeg;
5491             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5492             NOT_REACHED; /* NOTREACHED */
5493
5494         case KEEPS_next_fail:
5495             /* rollback the start point change */
5496             rex->offs[0].start = st->u.keeper.val;
5497             sayNO_SILENT;
5498             NOT_REACHED; /* NOTREACHED */
5499
5500         case MEOL: /* /..$/m  */
5501             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5502                 sayNO;
5503             break;
5504
5505         case SEOL: /* /..$/  */
5506             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5507                 sayNO;
5508             if (reginfo->strend - locinput > 1)
5509                 sayNO;
5510             break;
5511
5512         case EOS: /*  \z  */
5513             if (!NEXTCHR_IS_EOS)
5514                 sayNO;
5515             break;
5516
5517         case SANY: /*  /./s  */
5518             if (NEXTCHR_IS_EOS)
5519                 sayNO;
5520             goto increment_locinput;
5521
5522         case REG_ANY: /*  /./  */
5523             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5524                 sayNO;
5525             goto increment_locinput;
5526
5527
5528 #undef  ST
5529 #define ST st->u.trie
5530         case TRIEC: /* (ab|cd) with known charclass */
5531             /* In this case the charclass data is available inline so
5532                we can fail fast without a lot of extra overhead.
5533              */
5534             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5535                 DEBUG_EXECUTE_r(
5536                     Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5537                               depth, PL_colors[4], PL_colors[5])
5538                 );
5539                 sayNO_SILENT;
5540                 NOT_REACHED; /* NOTREACHED */
5541             }
5542             /* FALLTHROUGH */
5543         case TRIE:  /* (ab|cd)  */
5544             /* the basic plan of execution of the trie is:
5545              * At the beginning, run though all the states, and
5546              * find the longest-matching word. Also remember the position
5547              * of the shortest matching word. For example, this pattern:
5548              *    1  2 3 4    5
5549              *    ab|a|x|abcd|abc
5550              * when matched against the string "abcde", will generate
5551              * accept states for all words except 3, with the longest
5552              * matching word being 4, and the shortest being 2 (with
5553              * the position being after char 1 of the string).
5554              *
5555              * Then for each matching word, in word order (i.e. 1,2,4,5),
5556              * we run the remainder of the pattern; on each try setting
5557              * the current position to the character following the word,
5558              * returning to try the next word on failure.
5559              *
5560              * We avoid having to build a list of words at runtime by
5561              * using a compile-time structure, wordinfo[].prev, which
5562              * gives, for each word, the previous accepting word (if any).
5563              * In the case above it would contain the mappings 1->2, 2->0,
5564              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5565              * the longest word (4 above), a list of all words, by
5566              * following the list of prev pointers; this gives us the
5567              * unordered list 4,5,1,2. Then given the current word we have
5568              * just tried, we can go through the list and find the
5569              * next-biggest word to try (so if we just failed on word 2,
5570              * the next in the list is 4).
5571              *
5572              * Since at runtime we don't record the matching position in
5573              * the string for each word, we have to work that out for
5574              * each word we're about to process. The wordinfo table holds
5575              * the character length of each word; given that we recorded
5576              * at the start: the position of the shortest word and its
5577              * length in chars, we just need to move the pointer the
5578              * difference between the two char lengths. Depending on
5579              * Unicode status and folding, that's cheap or expensive.
5580              *
5581              * This algorithm is optimised for the case where are only a
5582              * small number of accept states, i.e. 0,1, or maybe 2.
5583              * With lots of accepts states, and having to try all of them,
5584              * it becomes quadratic on number of accept states to find all
5585              * the next words.
5586              */
5587
5588             {
5589                 /* what type of TRIE am I? (utf8 makes this contextual) */
5590                 DECL_TRIE_TYPE(scan);
5591
5592                 /* what trie are we using right now */
5593                 reg_trie_data * const trie
5594                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5595                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5596                 U32 state = trie->startstate;
5597
5598                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5599                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5600                     if (utf8_target
5601                         && nextchr >= 0 /* guard against negative EOS value in nextchr */
5602                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5603                         && scan->flags == EXACTL)
5604                     {
5605                         /* We only output for EXACTL, as we let the folder
5606                          * output this message for EXACTFLU8 to avoid
5607                          * duplication */
5608                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5609                                                                reginfo->strend);
5610                     }
5611                 }
5612                 if (   trie->bitmap
5613                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5614                 {
5615                     if (trie->states[ state ].wordnum) {
5616                          DEBUG_EXECUTE_r(
5617                             Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
5618                                           depth, PL_colors[4], PL_colors[5])
5619                         );
5620                         if (!trie->jump)
5621                             break;
5622                     } else {
5623                         DEBUG_EXECUTE_r(
5624                             Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5625                                           depth, PL_colors[4], PL_colors[5])
5626                         );
5627                         sayNO_SILENT;
5628                    }
5629                 }
5630
5631             {
5632                 U8 *uc = ( U8* )locinput;
5633
5634                 STRLEN len = 0;
5635                 STRLEN foldlen = 0;
5636                 U8 *uscan = (U8*)NULL;
5637                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5638                 U32 charcount = 0; /* how many input chars we have matched */
5639                 U32 accepted = 0; /* have we seen any accepting states? */
5640
5641                 ST.jump = trie->jump;
5642                 ST.me = scan;
5643                 ST.firstpos = NULL;
5644                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5645                 ST.nextword = 0;
5646
5647                 /* fully traverse the TRIE; note the position of the
5648                    shortest accept state and the wordnum of the longest
5649                    accept state */
5650
5651                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5652                     U32 base = trie->states[ state ].trans.base;
5653                     UV uvc = 0;
5654                     U16 charid = 0;
5655                     U16 wordnum;
5656                     wordnum = trie->states[ state ].wordnum;
5657
5658                     if (wordnum) { /* it's an accept state */
5659                         if (!accepted) {
5660                             accepted = 1;
5661                             /* record first match position */
5662                             if (ST.longfold) {
5663                                 ST.firstpos = (U8*)locinput;
5664                                 ST.firstchars = 0;
5665                             }
5666                             else {
5667                                 ST.firstpos = uc;
5668                                 ST.firstchars = charcount;
5669                             }
5670                         }
5671                         if (!ST.nextword || wordnum < ST.nextword)
5672                             ST.nextword = wordnum;
5673                         ST.topword = wordnum;
5674                     }
5675
5676                     DEBUG_TRIE_EXECUTE_r({
5677                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5678                                 /* HERE */
5679                                 PerlIO_printf( Perl_debug_log,
5680                                     "%*s%sState: %4" UVxf " Accepted: %c ",
5681                                     INDENT_CHARS(depth), "", PL_colors[4],
5682                                     (UV)state, (accepted ? 'Y' : 'N'));
5683                     });
5684
5685                     /* read a char and goto next state */
5686                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5687                         I32 offset;
5688                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5689                                              uscan, len, uvc, charid, foldlen,
5690                                              foldbuf, uniflags);
5691                         charcount++;
5692                         if (foldlen>0)
5693                             ST.longfold = TRUE;
5694                         if (charid &&
5695                              ( ((offset =
5696                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5697
5698                              && ((U32)offset < trie->lasttrans)
5699                              && trie->trans[offset].check == state)
5700                         {
5701                             state = trie->trans[offset].next;
5702                         }
5703                         else {
5704                             state = 0;
5705                         }
5706                         uc += len;
5707
5708                     }
5709                     else {
5710                         state = 0;
5711                     }
5712                     DEBUG_TRIE_EXECUTE_r(
5713                         Perl_re_printf( aTHX_
5714                             "Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
5715                             charid, uvc, (UV)state, PL_colors[5] );
5716                     );
5717                 }
5718                 if (!accepted)
5719                    sayNO;
5720
5721                 /* calculate total number of accept states */
5722                 {
5723                     U16 w = ST.topword;
5724                     accepted = 0;
5725                     while (w) {
5726                         w = trie->wordinfo[w].prev;
5727                         accepted++;
5728                     }
5729                     ST.accepted = accepted;
5730                 }
5731
5732                 DEBUG_EXECUTE_r(
5733                     Perl_re_exec_indentf( aTHX_  "%sgot %" IVdf " possible matches%s\n",
5734                         depth,
5735                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5736                 );
5737                 goto trie_first_try; /* jump into the fail handler */
5738             }}
5739             NOT_REACHED; /* NOTREACHED */
5740
5741         case TRIE_next_fail: /* we failed - try next alternative */
5742         {
5743             U8 *uc;
5744             if ( ST.jump ) {
5745                 /* undo any captures done in the tail part of a branch,
5746                  * e.g.
5747                  *    /(?:X(.)(.)|Y(.)).../
5748                  * where the trie just matches X then calls out to do the
5749                  * rest of the branch */
5750                 REGCP_UNWIND(ST.cp);
5751                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5752             }
5753             if (!--ST.accepted) {
5754                 DEBUG_EXECUTE_r({
5755                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
5756                         depth,
5757                         PL_colors[4],
5758                         PL_colors[5] );
5759                 });
5760                 sayNO_SILENT;
5761             }
5762             {
5763                 /* Find next-highest word to process.  Note that this code
5764                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5765                 U16 min = 0;
5766                 U16 word;
5767                 U16 const nextword = ST.nextword;
5768                 reg_trie_wordinfo * const wordinfo
5769                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5770                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5771                     if (word > nextword && (!min || word < min))
5772                         min = word;
5773                 }
5774                 ST.nextword = min;
5775             }
5776
5777           trie_first_try:
5778             if (do_cutgroup) {
5779                 do_cutgroup = 0;
5780                 no_final = 0;
5781             }
5782
5783             if ( ST.jump ) {
5784                 ST.lastparen = rex->lastparen;
5785                 ST.lastcloseparen = rex->lastcloseparen;
5786                 REGCP_SET(ST.cp);
5787             }
5788
5789             /* find start char of end of current word */
5790             {
5791                 U32 chars; /* how many chars to skip */
5792                 reg_trie_data * const trie
5793                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5794
5795                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5796                             >=  ST.firstchars);
5797                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5798                             - ST.firstchars;
5799                 uc = ST.firstpos;
5800
5801                 if (ST.longfold) {
5802                     /* the hard option - fold each char in turn and find
5803                      * its folded length (which may be different */
5804                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5805                     STRLEN foldlen;
5806                     STRLEN len;
5807                     UV uvc;
5808                     U8 *uscan;
5809
5810                     while (chars) {
5811                         if (utf8_target) {
5812                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5813                                                     uniflags);
5814                             uc += len;
5815                         }
5816                         else {
5817                             uvc = *uc;
5818                             uc++;
5819                         }
5820                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5821                         uscan = foldbuf;
5822                         while (foldlen) {
5823                             if (!--chars)
5824                                 break;
5825                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5826                                             uniflags);
5827                             uscan += len;
5828                             foldlen -= len;
5829                         }
5830                     }
5831                 }
5832                 else {
5833                     if (utf8_target)
5834                         while (chars--)
5835                             uc += UTF8SKIP(uc);
5836                     else
5837                         uc += chars;
5838                 }
5839             }
5840
5841             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5842                             ? ST.jump[ST.nextword]
5843                             : NEXT_OFF(ST.me));
5844
5845             DEBUG_EXECUTE_r({
5846                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
5847                     depth,
5848                     PL_colors[4],
5849                     ST.nextword,
5850                     PL_colors[5]
5851                     );
5852             });
5853
5854             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
5855                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5856                 NOT_REACHED; /* NOTREACHED */
5857             }
5858             /* only one choice left - just continue */
5859             DEBUG_EXECUTE_r({
5860                 AV *const trie_words
5861                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5862                 SV ** const tmp = trie_words
5863                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5864                 SV *sv= tmp ? sv_newmortal() : NULL;
5865
5866                 Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
5867                     depth, PL_colors[4],
5868                     ST.nextword,
5869                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5870                             PL_colors[0], PL_colors[1],
5871                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5872                         )
5873                     : "not compiled under -Dr",
5874                     PL_colors[5] );
5875             });
5876
5877             locinput = (char*)uc;
5878             continue; /* execute rest of RE */
5879             /* NOTREACHED */
5880         }
5881 #undef  ST
5882
5883         case EXACTL:             /*  /abc/l       */
5884             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5885
5886             /* Complete checking would involve going through every character
5887              * matched by the string to see if any is above latin1.  But the
5888              * comparision otherwise might very well be a fast assembly
5889              * language routine, and I (khw) don't think slowing things down
5890              * just to check for this warning is worth it.  So this just checks
5891              * the first character */
5892             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5893                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5894             }
5895             /* FALLTHROUGH */
5896         case EXACT: {            /*  /abc/        */
5897             char *s = STRING(scan);
5898             ln = STR_LEN(scan);
5899             if (utf8_target != is_utf8_pat) {
5900                 /* The target and the pattern have differing utf8ness. */
5901                 char *l = locinput;
5902                 const char * const e = s + ln;
5903
5904                 if (utf8_target) {
5905                     /* The target is utf8, the pattern is not utf8.
5906                      * Above-Latin1 code points can't match the pattern;
5907                      * invariants match exactly, and the other Latin1 ones need
5908                      * to be downgraded to a single byte in order to do the
5909                      * comparison.  (If we could be confident that the target
5910                      * is not malformed, this could be refactored to have fewer
5911                      * tests by just assuming that if the first bytes match, it
5912                      * is an invariant, but there are tests in the test suite
5913                      * dealing with (??{...}) which violate this) */
5914                     while (s < e) {
5915                         if (l >= reginfo->strend
5916                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5917                         {
5918                             sayNO;
5919                         }
5920                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
5921                             if (*l != *s) {
5922                                 sayNO;
5923                             }
5924                             l++;
5925                         }
5926                         else {
5927                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5928                             {
5929                                 sayNO;
5930                             }
5931                             l += 2;
5932                         }
5933                         s++;
5934                     }
5935                 }
5936                 else {
5937                     /* The target is not utf8, the pattern is utf8. */
5938                     while (s < e) {
5939                         if (l >= reginfo->strend
5940                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5941                         {
5942                             sayNO;
5943                         }
5944                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
5945                             if (*s != *l) {
5946                                 sayNO;
5947                             }
5948                             s++;
5949                         }
5950                         else {
5951                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5952                             {
5953                                 sayNO;
5954                             }
5955                             s += 2;
5956                         }
5957                         l++;
5958                     }
5959                 }
5960                 locinput = l;
5961             }
5962             else {
5963                 /* The target and the pattern have the same utf8ness. */
5964                 /* Inline the first character, for speed. */
5965                 if (reginfo->strend - locinput < ln
5966                     || UCHARAT(s) != nextchr
5967                     || (ln > 1 && memNE(s, locinput, ln)))
5968                 {
5969                     sayNO;
5970                 }
5971                 locinput += ln;
5972             }
5973             break;
5974             }
5975
5976         case EXACTFL: {          /*  /abc/il      */
5977             re_fold_t folder;
5978             const U8 * fold_array;
5979             const char * s;
5980             U32 fold_utf8_flags;
5981
5982             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5983             folder = foldEQ_locale;
5984             fold_array = PL_fold_locale;
5985             fold_utf8_flags = FOLDEQ_LOCALE;
5986             goto do_exactf;
5987
5988         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5989                                       is effectively /u; hence to match, target
5990                                       must be UTF-8. */
5991             if (! utf8_target) {
5992                 sayNO;
5993             }
5994             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5995                                              | FOLDEQ_S1_FOLDS_SANE;
5996             folder = foldEQ_latin1;
5997             fold_array = PL_fold_latin1;
5998             goto do_exactf;
5999
6000         case EXACTFU_SS:         /*  /\x{df}/iu   */
6001         case EXACTFU:            /*  /abc/iu      */
6002             folder = foldEQ_latin1;
6003             fold_array = PL_fold_latin1;
6004             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6005             goto do_exactf;
6006
6007         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
6008                                    patterns */
6009             assert(! is_utf8_pat);
6010             /* FALLTHROUGH */
6011         case EXACTFA:            /*  /abc/iaa     */
6012             folder = foldEQ_latin1;
6013             fold_array = PL_fold_latin1;
6014             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6015             goto do_exactf;
6016
6017         case EXACTF:             /*  /abc/i    This node only generated for
6018                                                non-utf8 patterns */
6019             assert(! is_utf8_pat);
6020             folder = foldEQ;
6021             fold_array = PL_fold;
6022             fold_utf8_flags = 0;
6023
6024           do_exactf:
6025             s = STRING(scan);
6026             ln = STR_LEN(scan);
6027
6028             if (utf8_target
6029                 || is_utf8_pat
6030                 || state_num == EXACTFU_SS
6031                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6032             {
6033               /* Either target or the pattern are utf8, or has the issue where
6034                * the fold lengths may differ. */
6035                 const char * const l = locinput;
6036                 char *e = reginfo->strend;
6037
6038                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6039                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6040                 {
6041                     sayNO;
6042                 }
6043                 locinput = e;
6044                 break;
6045             }
6046
6047             /* Neither the target nor the pattern are utf8 */
6048             if (UCHARAT(s) != nextchr
6049                 && !NEXTCHR_IS_EOS
6050                 && UCHARAT(s) != fold_array[nextchr])
6051             {
6052                 sayNO;
6053             }
6054             if (reginfo->strend - locinput < ln)
6055                 sayNO;
6056             if (ln > 1 && ! folder(s, locinput, ln))
6057                 sayNO;
6058             locinput += ln;
6059             break;
6060         }
6061
6062         case NBOUNDL: /*  /\B/l  */
6063             to_complement = 1;
6064             /* FALLTHROUGH */
6065
6066         case BOUNDL:  /*  /\b/l  */
6067         {
6068             bool b1, b2;
6069             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6070
6071             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6072                 if (! IN_UTF8_CTYPE_LOCALE) {
6073                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6074                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6075                 }
6076                 goto boundu;
6077             }
6078
6079             if (utf8_target) {
6080                 if (locinput == reginfo->strbeg)
6081                     b1 = isWORDCHAR_LC('\n');
6082                 else {
6083                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6084                                                         (U8*)(reginfo->strbeg)),
6085                                                  (U8*)(reginfo->strend));
6086                 }
6087                 b2 = (NEXTCHR_IS_EOS)
6088                     ? isWORDCHAR_LC('\n')
6089                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6090                                               (U8*) reginfo->strend);
6091             }
6092             else { /* Here the string isn't utf8 */
6093                 b1 = (locinput == reginfo->strbeg)
6094                      ? isWORDCHAR_LC('\n')
6095                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6096                 b2 = (NEXTCHR_IS_EOS)
6097                     ? isWORDCHAR_LC('\n')
6098                     : isWORDCHAR_LC(nextchr);
6099             }
6100             if (to_complement ^ (b1 == b2)) {
6101                 sayNO;
6102             }
6103             break;
6104         }
6105
6106         case NBOUND:  /*  /\B/   */
6107             to_complement = 1;
6108             /* FALLTHROUGH */
6109
6110         case BOUND:   /*  /\b/   */
6111             if (utf8_target) {
6112                 goto bound_utf8;
6113             }
6114             goto bound_ascii_match_only;
6115
6116         case NBOUNDA: /*  /\B/a  */
6117             to_complement = 1;
6118             /* FALLTHROUGH */
6119
6120         case BOUNDA:  /*  /\b/a  */
6121         {
6122             bool b1, b2;
6123
6124           bound_ascii_match_only:
6125             /* Here the string isn't utf8, or is utf8 and only ascii characters
6126              * are to match \w.  In the latter case looking at the byte just
6127              * prior to the current one may be just the final byte of a
6128              * multi-byte character.  This is ok.  There are two cases:
6129              * 1) it is a single byte character, and then the test is doing
6130              *    just what it's supposed to.
6131              * 2) it is a multi-byte character, in which case the final byte is
6132              *    never mistakable for ASCII, and so the test will say it is
6133              *    not a word character, which is the correct answer. */
6134             b1 = (locinput == reginfo->strbeg)
6135                  ? isWORDCHAR_A('\n')
6136                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6137             b2 = (NEXTCHR_IS_EOS)
6138                 ? isWORDCHAR_A('\n')
6139                 : isWORDCHAR_A(nextchr);
6140             if (to_complement ^ (b1 == b2)) {
6141                 sayNO;
6142             }
6143             break;
6144         }
6145
6146         case NBOUNDU: /*  /\B/u  */
6147             to_complement = 1;
6148             /* FALLTHROUGH */
6149
6150         case BOUNDU:  /*  /\b/u  */
6151
6152           boundu:
6153             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6154                 match = FALSE;
6155             }
6156             else if (utf8_target) {
6157               bound_utf8:
6158                 switch((bound_type) FLAGS(scan)) {
6159                     case TRADITIONAL_BOUND:
6160                     {
6161                         bool b1, b2;
6162                         b1 = (locinput == reginfo->strbeg)
6163                              ? 0 /* isWORDCHAR_L1('\n') */
6164                              : isWORDCHAR_utf8_safe(
6165                                                reghop3((U8*)locinput,
6166                                                        -1,
6167                                                        (U8*)(reginfo->strbeg)),
6168                                                     (U8*) reginfo->strend);
6169                         b2 = (NEXTCHR_IS_EOS)
6170                             ? 0 /* isWORDCHAR_L1('\n') */
6171                             : isWORDCHAR_utf8_safe((U8*)locinput,
6172                                                    (U8*) reginfo->strend);
6173                         match = cBOOL(b1 != b2);
6174                         break;
6175                     }
6176                     case GCB_BOUND:
6177                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6178                             match = TRUE; /* GCB always matches at begin and
6179                                              end */
6180                         }
6181                         else {
6182                             /* Find the gcb values of previous and current
6183                              * chars, then see if is a break point */
6184                             match = isGCB(getGCB_VAL_UTF8(
6185                                                 reghop3((U8*)locinput,
6186                                                         -1,
6187                                                         (U8*)(reginfo->strbeg)),
6188                                                 (U8*) reginfo->strend),
6189                                           getGCB_VAL_UTF8((U8*) locinput,
6190                                                         (U8*) reginfo->strend),
6191                                           (U8*) reginfo->strbeg,
6192                                           (U8*) locinput,
6193                                           utf8_target);
6194                         }
6195                         break;
6196
6197                     case LB_BOUND:
6198                         if (locinput == reginfo->strbeg) {
6199                             match = FALSE;
6200                         }
6201                         else if (NEXTCHR_IS_EOS) {
6202                             match = TRUE;
6203                         }
6204                         else {
6205                             match = isLB(getLB_VAL_UTF8(
6206                                                 reghop3((U8*)locinput,
6207                                                         -1,
6208                                                         (U8*)(reginfo->strbeg)),
6209                                                 (U8*) reginfo->strend),
6210                                           getLB_VAL_UTF8((U8*) locinput,
6211                                                         (U8*) reginfo->strend),
6212                                           (U8*) reginfo->strbeg,
6213                                           (U8*) locinput,
6214                                           (U8*) reginfo->strend,
6215                                           utf8_target);
6216                         }
6217                         break;
6218
6219                     case SB_BOUND: /* Always matches at begin and end */
6220                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6221                             match = TRUE;
6222                         }
6223                         else {
6224                             match = isSB(getSB_VAL_UTF8(
6225                                                 reghop3((U8*)locinput,
6226                                                         -1,
6227                                                         (U8*)(reginfo->strbeg)),
6228                                                 (U8*) reginfo->strend),
6229                                           getSB_VAL_UTF8((U8*) locinput,
6230                                                         (U8*) reginfo->strend),
6231                                           (U8*) reginfo->strbeg,
6232                                           (U8*) locinput,
6233                                           (U8*) reginfo->strend,
6234                                           utf8_target);
6235                         }
6236                         break;
6237
6238                     case WB_BOUND:
6239                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6240                             match = TRUE;
6241                         }
6242                         else {
6243                             match = isWB(WB_UNKNOWN,
6244                                          getWB_VAL_UTF8(
6245                                                 reghop3((U8*)locinput,
6246                                                         -1,
6247                                                         (U8*)(reginfo->strbeg)),
6248                                                 (U8*) reginfo->strend),
6249                                           getWB_VAL_UTF8((U8*) locinput,
6250                                                         (U8*) reginfo->strend),
6251                                           (U8*) reginfo->strbeg,
6252                                           (U8*) locinput,
6253                                           (U8*) reginfo->strend,
6254                                           utf8_target);
6255                         }
6256                         break;
6257                 }
6258             }
6259             else {  /* Not utf8 target */
6260                 switch((bound_type) FLAGS(scan)) {
6261                     case TRADITIONAL_BOUND:
6262                     {
6263                         bool b1, b2;
6264                         b1 = (locinput == reginfo->strbeg)
6265                             ? 0 /* isWORDCHAR_L1('\n') */
6266                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6267                         b2 = (NEXTCHR_IS_EOS)
6268                             ? 0 /* isWORDCHAR_L1('\n') */
6269                             : isWORDCHAR_L1(nextchr);
6270                         match = cBOOL(b1 != b2);
6271                         break;
6272                     }
6273
6274                     case GCB_BOUND:
6275                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6276                             match = TRUE; /* GCB always matches at begin and
6277                                              end */
6278                         }
6279                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6280                                    range */
6281                             match =    UCHARAT(locinput - 1) != '\r'
6282                                     || UCHARAT(locinput) != '\n';
6283                         }
6284                         break;
6285
6286                     case LB_BOUND:
6287                         if (locinput == reginfo->strbeg) {
6288                             match = FALSE;
6289                         }
6290                         else if (NEXTCHR_IS_EOS) {
6291                             match = TRUE;
6292                         }
6293                         else {
6294                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6295                                          getLB_VAL_CP(UCHARAT(locinput)),
6296                                          (U8*) reginfo->strbeg,
6297                                          (U8*) locinput,
6298                                          (U8*) reginfo->strend,
6299                                          utf8_target);
6300                         }
6301                         break;
6302
6303                     case SB_BOUND: /* Always matches at begin and end */
6304                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6305                             match = TRUE;
6306                         }
6307                         else {
6308                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6309                                          getSB_VAL_CP(UCHARAT(locinput)),
6310                                          (U8*) reginfo->strbeg,
6311                                          (U8*) locinput,
6312                                          (U8*) reginfo->strend,
6313                                          utf8_target);
6314                         }
6315                         break;
6316
6317                     case WB_BOUND:
6318                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6319                             match = TRUE;
6320                         }
6321                         else {
6322                             match = isWB(WB_UNKNOWN,
6323                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6324                                          getWB_VAL_CP(UCHARAT(locinput)),
6325                                          (U8*) reginfo->strbeg,
6326                                          (U8*) locinput,
6327                                          (U8*) reginfo->strend,
6328                                          utf8_target);
6329                         }
6330                         break;
6331                 }
6332             }
6333
6334             if (to_complement ^ ! match) {
6335                 sayNO;
6336             }
6337             break;
6338
6339         case ANYOFL:  /*  /[abc]/l      */
6340             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6341
6342             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6343             {
6344               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6345             }
6346             /* FALLTHROUGH */
6347         case ANYOFD:  /*   /[abc]/d       */
6348         case ANYOF:  /*   /[abc]/       */
6349             if (NEXTCHR_IS_EOS)
6350                 sayNO;
6351             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6352                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6353                                                                    utf8_target))
6354                     sayNO;
6355                 locinput += UTF8SKIP(locinput);
6356             }
6357             else {
6358                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6359                     sayNO;
6360                 locinput++;
6361             }
6362             break;
6363
6364         /* The argument (FLAGS) to all the POSIX node types is the class number
6365          * */
6366
6367         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6368             to_complement = 1;
6369             /* FALLTHROUGH */
6370
6371         case POSIXL:    /* \w or [:punct:] etc. under /l */
6372             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6373             if (NEXTCHR_IS_EOS)
6374                 sayNO;
6375
6376             /* Use isFOO_lc() for characters within Latin1.  (Note that
6377              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6378              * wouldn't be invariant) */
6379             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6380                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6381                     sayNO;
6382                 }
6383
6384                 locinput++;
6385                 break;
6386             }
6387
6388             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6389                 /* An above Latin-1 code point, or malformed */
6390                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6391                                                        reginfo->strend);
6392                 goto utf8_posix_above_latin1;
6393             }
6394
6395             /* Here is a UTF-8 variant code point below 256 and the target is
6396              * UTF-8 */
6397             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6398                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6399                                             *(locinput + 1))))))
6400             {
6401                 sayNO;
6402             }
6403
6404             goto increment_locinput;
6405
6406         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6407             to_complement = 1;
6408             /* FALLTHROUGH */
6409
6410         case POSIXD:    /* \w or [:punct:] etc. under /d */
6411             if (utf8_target) {
6412                 goto utf8_posix;
6413             }
6414             goto posixa;
6415
6416         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6417
6418             if (NEXTCHR_IS_EOS) {
6419                 sayNO;
6420             }
6421
6422             /* All UTF-8 variants match */
6423             if (! UTF8_IS_INVARIANT(nextchr)) {
6424                 goto increment_locinput;
6425             }
6426
6427             to_complement = 1;
6428             goto join_nposixa;
6429
6430         case POSIXA:    /* \w or [:punct:] etc. under /a */
6431
6432           posixa:
6433             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6434              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6435              * character is a single byte */
6436
6437             if (NEXTCHR_IS_EOS) {
6438                 sayNO;
6439             }
6440
6441           join_nposixa:
6442
6443             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6444                                                                 FLAGS(scan)))))
6445             {
6446                 sayNO;
6447             }
6448
6449             /* Here we are either not in utf8, or we matched a utf8-invariant,
6450              * so the next char is the next byte */
6451             locinput++;
6452             break;
6453
6454         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6455             to_complement = 1;
6456             /* FALLTHROUGH */
6457
6458         case POSIXU:    /* \w or [:punct:] etc. under /u */
6459           utf8_posix:
6460             if (NEXTCHR_IS_EOS) {
6461                 sayNO;
6462             }
6463
6464             /* Use _generic_isCC() for characters within Latin1.  (Note that
6465              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6466              * wouldn't be invariant) */
6467             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6468                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6469                                                            FLAGS(scan)))))
6470                 {
6471                     sayNO;
6472                 }
6473                 locinput++;
6474             }
6475             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6476                 if (! (to_complement
6477                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6478                                                                *(locinput + 1)),
6479                                              FLAGS(scan)))))
6480                 {
6481                     sayNO;
6482                 }
6483                 locinput += 2;
6484             }
6485             else {  /* Handle above Latin-1 code points */
6486               utf8_posix_above_latin1:
6487                 classnum = (_char_class_number) FLAGS(scan);
6488                 if (classnum < _FIRST_NON_SWASH_CC) {
6489
6490                     /* Here, uses a swash to find such code points.  Load if if
6491                      * not done already */
6492                     if (! PL_utf8_swash_ptrs[classnum]) {
6493                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6494                         PL_utf8_swash_ptrs[classnum]
6495                                 = _core_swash_init("utf8",
6496                                         "",
6497                                         &PL_sv_undef, 1, 0,
6498                                         PL_XPosix_ptrs[classnum], &flags);
6499                     }
6500                     if (! (to_complement
6501                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6502                                                (U8 *) locinput, TRUE))))
6503                     {
6504                         sayNO;
6505                     }
6506                 }
6507                 else {  /* Here, uses macros to find above Latin-1 code points */
6508                     switch (classnum) {
6509                         case _CC_ENUM_SPACE:
6510                             if (! (to_complement
6511                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
6512                             {
6513                                 sayNO;
6514                             }
6515                             break;
6516                         case _CC_ENUM_BLANK:
6517                             if (! (to_complement
6518                                             ^ cBOOL(is_HORIZWS_high(locinput))))
6519                             {
6520                                 sayNO;
6521                             }
6522                             break;
6523                         case _CC_ENUM_XDIGIT:
6524                             if (! (to_complement
6525                                             ^ cBOOL(is_XDIGIT_high(locinput))))
6526                             {
6527                                 sayNO;
6528                             }
6529                             break;
6530                         case _CC_ENUM_VERTSPACE:
6531                             if (! (to_complement
6532                                             ^ cBOOL(is_VERTWS_high(locinput))))
6533                             {
6534                                 sayNO;
6535                             }
6536                             break;
6537                         default:    /* The rest, e.g. [:cntrl:], can't match
6538                                        above Latin1 */
6539                             if (! to_complement) {
6540                                 sayNO;
6541                             }
6542                             break;
6543                     }
6544                 }
6545                 locinput += UTF8SKIP(locinput);
6546             }
6547             break;
6548
6549         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6550                        a Unicode extended Grapheme Cluster */
6551             if (NEXTCHR_IS_EOS)
6552                 sayNO;
6553             if  (! utf8_target) {
6554
6555                 /* Match either CR LF  or '.', as all the other possibilities
6556                  * require utf8 */
6557                 locinput++;         /* Match the . or CR */
6558                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6559                                        match the LF */
6560                     && locinput < reginfo->strend
6561                     && UCHARAT(locinput) == '\n')
6562                 {
6563                     locinput++;
6564                 }
6565             }
6566             else {
6567
6568                 /* Get the gcb type for the current character */
6569                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6570                                                        (U8*) reginfo->strend);
6571
6572                 /* Then scan through the input until we get to the first
6573                  * character whose type is supposed to be a gcb with the
6574                  * current character.  (There is always a break at the
6575                  * end-of-input) */
6576                 locinput += UTF8SKIP(locinput);
6577                 while (locinput < reginfo->strend) {
6578                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6579                                                          (U8*) reginfo->strend);
6580                     if (isGCB(prev_gcb, cur_gcb,
6581                               (U8*) reginfo->strbeg, (U8*) locinput,
6582                               utf8_target))
6583                     {
6584                         break;
6585                     }
6586
6587                     prev_gcb = cur_gcb;
6588                     locinput += UTF8SKIP(locinput);
6589                 }
6590
6591
6592             }
6593             break;
6594
6595         case NREFFL:  /*  /\g{name}/il  */
6596         {   /* The capture buffer cases.  The ones beginning with N for the
6597                named buffers just convert to the equivalent numbered and
6598                pretend they were called as the corresponding numbered buffer
6599                op.  */
6600             /* don't initialize these in the declaration, it makes C++
6601                unhappy */
6602             const char *s;
6603             char type;
6604             re_fold_t folder;
6605             const U8 *fold_array;
6606             UV utf8_fold_flags;
6607
6608             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6609             folder = foldEQ_locale;
6610             fold_array = PL_fold_locale;
6611             type = REFFL;
6612             utf8_fold_flags = FOLDEQ_LOCALE;
6613             goto do_nref;
6614
6615         case NREFFA:  /*  /\g{name}/iaa  */
6616             folder = foldEQ_latin1;
6617             fold_array = PL_fold_latin1;
6618             type = REFFA;
6619             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6620             goto do_nref;
6621
6622         case NREFFU:  /*  /\g{name}/iu  */
6623             folder = foldEQ_latin1;
6624             fold_array = PL_fold_latin1;
6625             type = REFFU;
6626             utf8_fold_flags = 0;
6627             goto do_nref;
6628
6629         case NREFF:  /*  /\g{name}/i  */
6630             folder = foldEQ;
6631             fold_array = PL_fold;
6632             type = REFF;
6633             utf8_fold_flags = 0;
6634             goto do_nref;
6635
6636         case NREF:  /*  /\g{name}/   */
6637             type = REF;
6638             folder = NULL;
6639             fold_array = NULL;
6640             utf8_fold_flags = 0;
6641           do_nref:
6642
6643             /* For the named back references, find the corresponding buffer
6644              * number */
6645             n = reg_check_named_buff_matched(rex,scan);
6646
6647             if ( ! n ) {
6648                 sayNO;
6649             }
6650             goto do_nref_ref_common;
6651
6652         case REFFL:  /*  /\1/il  */
6653             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6654             folder = foldEQ_locale;
6655             fold_array = PL_fold_locale;
6656             utf8_fold_flags = FOLDEQ_LOCALE;
6657             goto do_ref;
6658
6659         case REFFA:  /*  /\1/iaa  */
6660             folder = foldEQ_latin1;
6661             fold_array = PL_fold_latin1;
6662             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6663             goto do_ref;
6664
6665         case REFFU:  /*  /\1/iu  */
6666             folder = foldEQ_latin1;
6667             fold_array = PL_fold_latin1;
6668             utf8_fold_flags = 0;
6669             goto do_ref;
6670
6671         case REFF:  /*  /\1/i  */
6672             folder = foldEQ;
6673             fold_array = PL_fold;
6674             utf8_fold_flags = 0;
6675             goto do_ref;
6676
6677         case REF:  /*  /\1/    */
6678             folder = NULL;
6679             fold_array = NULL;
6680             utf8_fold_flags = 0;
6681
6682           do_ref:
6683             type = OP(scan);
6684             n = ARG(scan);  /* which paren pair */
6685
6686           do_nref_ref_common:
6687             ln = rex->offs[n].start;
6688             endref = rex->offs[n].end;
6689             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6690             if (rex->lastparen < n || ln == -1 || endref == -1)
6691                 sayNO;                  /* Do not match unless seen CLOSEn. */
6692             if (ln == endref)
6693                 break;
6694
6695             s = reginfo->strbeg + ln;
6696             if (type != REF     /* REF can do byte comparison */
6697                 && (utf8_target || type == REFFU || type == REFFL))
6698             {
6699                 char * limit = reginfo->strend;
6700
6701                 /* This call case insensitively compares the entire buffer
6702                     * at s, with the current input starting at locinput, but
6703                     * not going off the end given by reginfo->strend, and
6704                     * returns in <limit> upon success, how much of the
6705                     * current input was matched */
6706                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
6707                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6708                 {
6709                     sayNO;
6710                 }
6711                 locinput = limit;
6712                 break;
6713             }
6714
6715             /* Not utf8:  Inline the first character, for speed. */
6716             if (!NEXTCHR_IS_EOS &&
6717                 UCHARAT(s) != nextchr &&
6718                 (type == REF ||
6719                  UCHARAT(s) != fold_array[nextchr]))
6720                 sayNO;
6721             ln = endref - ln;
6722             if (locinput + ln > reginfo->strend)
6723                 sayNO;
6724             if (ln > 1 && (type == REF
6725                            ? memNE(s, locinput, ln)
6726                            : ! folder(s, locinput, ln)))
6727                 sayNO;
6728             locinput += ln;
6729             break;
6730         }
6731
6732         case NOTHING: /* null op; e.g. the 'nothing' following
6733                        * the '*' in m{(a+|b)*}' */
6734             break;
6735         case TAIL: /* placeholder while compiling (A|B|C) */
6736             break;
6737
6738 #undef  ST
6739 #define ST st->u.eval
6740 #define CUR_EVAL cur_eval->u.eval
6741
6742         {
6743             SV *ret;
6744             REGEXP *re_sv;
6745             regexp *re;
6746             regexp_internal *rei;
6747             regnode *startpoint;
6748             U32 arg;
6749
6750         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6751             arg= (U32)ARG(scan);
6752             if (cur_eval && cur_eval->locinput == locinput) {
6753                 if ( ++nochange_depth > max_nochange_depth )
6754                     Perl_croak(aTHX_
6755                         "Pattern subroutine nesting without pos change"
6756                         " exceeded limit in regex");
6757             } else {
6758                 nochange_depth = 0;
6759             }
6760             re_sv = rex_sv;
6761             re = rex;
6762             rei = rexi;
6763             startpoint = scan + ARG2L(scan);
6764             EVAL_CLOSE_PAREN_SET( st, arg );
6765             /* Detect infinite recursion
6766              *
6767              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6768              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6769              * So we track the position in the string we are at each time
6770              * we recurse and if we try to enter the same routine twice from
6771              * the same position we throw an error.
6772              */
6773             if ( rex->recurse_locinput[arg] == locinput ) {
6774                 /* FIXME: we should show the regop that is failing as part
6775                  * of the error message. */
6776                 Perl_croak(aTHX_ "Infinite recursion in regex");
6777             } else {
6778                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6779                 rex->recurse_locinput[arg]= locinput;
6780
6781                 DEBUG_r({
6782                     GET_RE_DEBUG_FLAGS_DECL;
6783                     DEBUG_STACK_r({
6784                         Perl_re_exec_indentf( aTHX_
6785                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6786                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6787                         );
6788                     });
6789                 });
6790             }
6791
6792             /* Save all the positions seen so far. */
6793             ST.cp = regcppush(rex, 0, maxopenparen);
6794             REGCP_SET(ST.lastcp);
6795
6796             /* and then jump to the code we share with EVAL */
6797             goto eval_recurse_doit;
6798             /* NOTREACHED */
6799
6800         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
6801             if (cur_eval && cur_eval->locinput==locinput) {
6802                 if ( ++nochange_depth > max_nochange_depth )
6803                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6804             } else {
6805                 nochange_depth = 0;
6806             }
6807             {
6808                 /* execute the code in the {...} */
6809
6810                 dSP;
6811                 IV before;
6812                 OP * const oop = PL_op;
6813                 COP * const ocurcop = PL_curcop;
6814                 OP *nop;
6815                 CV *newcv;
6816
6817                 /* save *all* paren positions */
6818                 regcppush(rex, 0, maxopenparen);
6819                 REGCP_SET(ST.lastcp);
6820
6821                 if (!caller_cv)
6822                     caller_cv = find_runcv(NULL);
6823
6824                 n = ARG(scan);
6825
6826                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6827                     newcv = (ReANY(
6828                                     (REGEXP*)(rexi->data->data[n])
6829                             ))->qr_anoncv;
6830                     nop = (OP*)rexi->data->data[n+1];
6831                 }
6832                 else if (rexi->data->what[n] == 'l') { /* literal code */
6833                     newcv = caller_cv;
6834                     nop = (OP*)rexi->data->data[n];
6835                     assert(CvDEPTH(newcv));
6836                 }
6837                 else {
6838                     /* literal with own CV */
6839                     assert(rexi->data->what[n] == 'L');
6840                     newcv = rex->qr_anoncv;
6841                     nop = (OP*)rexi->data->data[n];
6842                 }
6843
6844                 /* Some notes about MULTICALL and the context and save stacks.
6845                  *
6846                  * In something like
6847                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
6848                  * since codeblocks don't introduce a new scope (so that
6849                  * local() etc accumulate), at the end of a successful
6850                  * match there will be a SAVEt_CLEARSV on the savestack
6851                  * for each of $x, $y, $z. If the three code blocks above
6852                  * happen to have come from different CVs (e.g. via
6853                  * embedded qr//s), then we must ensure that during any
6854                  * savestack unwinding, PL_comppad always points to the
6855                  * right pad at each moment. We achieve this by
6856                  * interleaving SAVEt_COMPPAD's on the savestack whenever
6857                  * there is a change of pad.
6858                  * In theory whenever we call a code block, we should
6859                  * push a CXt_SUB context, then pop it on return from
6860                  * that code block. This causes a bit of an issue in that
6861                  * normally popping a context also clears the savestack
6862                  * back to cx->blk_oldsaveix, but here we specifically
6863                  * don't want to clear the save stack on exit from the
6864                  * code block.
6865                  * Also for efficiency we don't want to keep pushing and
6866                  * popping the single SUB context as we backtrack etc.
6867                  * So instead, we push a single context the first time
6868                  * we need, it, then hang onto it until the end of this
6869                  * function. Whenever we encounter a new code block, we
6870                  * update the CV etc if that's changed. During the times
6871                  * in this function where we're not executing a code
6872                  * block, having the SUB context still there is a bit
6873                  * naughty - but we hope that no-one notices.
6874                  * When the SUB context is initially pushed, we fake up
6875                  * cx->blk_oldsaveix to be as if we'd pushed this context
6876                  * on first entry to S_regmatch rather than at some random
6877                  * point during the regexe execution. That way if we
6878                  * croak, popping the context stack will ensure that
6879                  * *everything* SAVEd by this function is undone and then
6880                  * the context popped, rather than e.g., popping the
6881                  * context (and restoring the original PL_comppad) then
6882                  * popping more of the savestack and restoring a bad
6883                  * PL_comppad.
6884                  */
6885
6886                 /* If this is the first EVAL, push a MULTICALL. On
6887                  * subsequent calls, if we're executing a different CV, or
6888                  * if PL_comppad has got messed up from backtracking
6889                  * through SAVECOMPPADs, then refresh the context.
6890                  */
6891                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6892                 {
6893                     U8 flags = (CXp_SUB_RE |
6894                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6895                     SAVECOMPPAD();
6896                     if (last_pushed_cv) {
6897                         CHANGE_MULTICALL_FLAGS(newcv, flags);
6898                     }
6899                     else {
6900                         PUSH_MULTICALL_FLAGS(newcv, flags);
6901                     }
6902                     /* see notes above */
6903                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
6904
6905                     last_pushed_cv = newcv;
6906                 }
6907                 else {
6908                     /* these assignments are just to silence compiler
6909                      * warnings */
6910                     multicall_cop = NULL;
6911                 }
6912                 last_pad = PL_comppad;
6913
6914                 /* the initial nextstate you would normally execute
6915                  * at the start of an eval (which would cause error
6916                  * messages to come from the eval), may be optimised
6917                  * away from the execution path in the regex code blocks;
6918                  * so manually set PL_curcop to it initially */
6919                 {
6920                     OP *o = cUNOPx(nop)->op_first;
6921                     assert(o->op_type == OP_NULL);
6922                     if (o->op_targ == OP_SCOPE) {
6923                         o = cUNOPo->op_first;
6924                     }
6925                     else {
6926                         assert(o->op_targ == OP_LEAVE);
6927                         o = cUNOPo->op_first;
6928                         assert(o->op_type == OP_ENTER);
6929                         o = OpSIBLING(o);
6930                     }
6931
6932                     if (o->op_type != OP_STUB) {
6933                         assert(    o->op_type == OP_NEXTSTATE
6934                                 || o->op_type == OP_DBSTATE
6935                                 || (o->op_type == OP_NULL
6936                                     &&  (  o->op_targ == OP_NEXTSTATE
6937                                         || o->op_targ == OP_DBSTATE
6938                                         )
6939                                     )
6940                         );
6941                         PL_curcop = (COP*)o;
6942                     }
6943                 }
6944                 nop = nop->op_next;
6945
6946                 DEBUG_STATE_r( Perl_re_printf( aTHX_
6947                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
6948
6949                 rex->offs[0].end = locinput - reginfo->strbeg;
6950                 if (reginfo->info_aux_eval->pos_magic)
6951                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6952                                   reginfo->sv, reginfo->strbeg,
6953                                   locinput - reginfo->strbeg);
6954
6955                 if (sv_yes_mark) {
6956                     SV *sv_mrk = get_sv("REGMARK", 1);
6957                     sv_setsv(sv_mrk, sv_yes_mark);
6958                 }
6959
6960                 /* we don't use MULTICALL here as we want to call the
6961                  * first op of the block of interest, rather than the
6962                  * first op of the sub. Also, we don't want to free
6963                  * the savestack frame */
6964                 before = (IV)(SP-PL_stack_base);
6965                 PL_op = nop;
6966                 CALLRUNOPS(aTHX);                       /* Scalar context. */
6967                 SPAGAIN;
6968                 if ((IV)(SP-PL_stack_base) == before)
6969                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6970                 else {
6971                     ret = POPs;
6972                     PUTBACK;
6973                 }
6974
6975                 /* before restoring everything, evaluate the returned
6976                  * value, so that 'uninit' warnings don't use the wrong
6977                  * PL_op or pad. Also need to process any magic vars
6978                  * (e.g. $1) *before* parentheses are restored */
6979
6980                 PL_op = NULL;
6981
6982                 re_sv = NULL;
6983                 if (logical == 0)        /*   (?{})/   */
6984                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6985                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6986                     sw = cBOOL(SvTRUE_NN(ret));
6987                     logical = 0;
6988                 }
6989                 else {                   /*  /(??{})  */
6990                     /*  if its overloaded, let the regex compiler handle
6991                      *  it; otherwise extract regex, or stringify  */
6992                     if (SvGMAGICAL(ret))
6993                         ret = sv_mortalcopy(ret);
6994                     if (!SvAMAGIC(ret)) {
6995                         SV *sv = ret;
6996                         if (SvROK(sv))
6997                             sv = SvRV(sv);
6998                         if (SvTYPE(sv) == SVt_REGEXP)
6999                             re_sv = (REGEXP*) sv;
7000                         else if (SvSMAGICAL(ret)) {
7001                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7002                             if (mg)
7003                                 re_sv = (REGEXP *) mg->mg_obj;
7004                         }
7005
7006                         /* force any undef warnings here */
7007                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7008                             ret = sv_mortalcopy(ret);
7009                             (void) SvPV_force_nolen(ret);
7010                         }
7011                     }
7012
7013                 }
7014
7015                 /* *** Note that at this point we don't restore
7016                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7017                  * be used again soon. This is safe as long as nothing
7018                  * in the regexp code uses the pad ! */
7019                 PL_op = oop;
7020                 PL_curcop = ocurcop;
7021                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7022                 PL_curpm_under = PL_curpm;
7023                 PL_curpm = PL_reg_curpm;
7024
7025                 if (logical != 2) {
7026                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7027                     /* NOTREACHED */
7028                 }
7029             }
7030
7031                 /* only /(??{})/  from now on */
7032                 logical = 0;
7033                 {
7034                     /* extract RE object from returned value; compiling if
7035                      * necessary */
7036
7037                     if (re_sv) {
7038                         re_sv = reg_temp_copy(NULL, re_sv);
7039                     }
7040                     else {
7041                         U32 pm_flags = 0;
7042
7043                         if (SvUTF8(ret) && IN_BYTES) {
7044                             /* In use 'bytes': make a copy of the octet
7045                              * sequence, but without the flag on */
7046                             STRLEN len;
7047                             const char *const p = SvPV(ret, len);
7048                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7049                         }
7050                         if (rex->intflags & PREGf_USE_RE_EVAL)
7051                             pm_flags |= PMf_USE_RE_EVAL;
7052
7053                         /* if we got here, it should be an engine which
7054                          * supports compiling code blocks and stuff */
7055                         assert(rex->engine && rex->engine->op_comp);
7056                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7057                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7058                                     rex->engine, NULL, NULL,
7059                                     /* copy /msixn etc to inner pattern */
7060                                     ARG2L(scan),
7061                                     pm_flags);
7062
7063                         if (!(SvFLAGS(ret)
7064                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7065                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7066                             /* This isn't a first class regexp. Instead, it's
7067                                caching a regexp onto an existing, Perl visible
7068                                scalar.  */
7069                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7070                         }
7071                     }
7072                     SAVEFREESV(re_sv);
7073                     re = ReANY(re_sv);
7074                 }
7075                 RXp_MATCH_COPIED_off(re);
7076                 re->subbeg = rex->subbeg;
7077                 re->sublen = rex->sublen;
7078                 re->suboffset = rex->suboffset;
7079                 re->subcoffset = rex->subcoffset;
7080                 re->lastparen = 0;
7081                 re->lastcloseparen = 0;
7082                 rei = RXi_GET(re);
7083                 DEBUG_EXECUTE_r(
7084                     debug_start_match(re_sv, utf8_target, locinput,
7085                                     reginfo->strend, "Matching embedded");
7086                 );
7087                 startpoint = rei->program + 1;
7088                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7089                                              * close_paren only for GOSUB */
7090                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7091                 /* Save all the seen positions so far. */
7092                 ST.cp = regcppush(rex, 0, maxopenparen);
7093                 REGCP_SET(ST.lastcp);
7094                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7095                 maxopenparen = 0;
7096                 /* run the pattern returned from (??{...}) */
7097
7098               eval_recurse_doit: /* Share code with GOSUB below this line
7099                             * At this point we expect the stack context to be
7100                             * set up correctly */
7101
7102                 /* invalidate the S-L poscache. We're now executing a
7103                  * different set of WHILEM ops (and their associated
7104                  * indexes) against the same string, so the bits in the
7105                  * cache are meaningless. Setting maxiter to zero forces
7106                  * the cache to be invalidated and zeroed before reuse.
7107                  * XXX This is too dramatic a measure. Ideally we should
7108                  * save the old cache and restore when running the outer
7109                  * pattern again */
7110                 reginfo->poscache_maxiter = 0;
7111
7112                 /* the new regexp might have a different is_utf8_pat than we do */
7113                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7114
7115                 ST.prev_rex = rex_sv;
7116                 ST.prev_curlyx = cur_curlyx;
7117                 rex_sv = re_sv;
7118                 SET_reg_curpm(rex_sv);
7119                 rex = re;
7120                 rexi = rei;
7121                 cur_curlyx = NULL;
7122                 ST.B = next;
7123                 ST.prev_eval = cur_eval;
7124                 cur_eval = st;
7125                 /* now continue from first node in postoned RE */
7126                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7127                 NOT_REACHED; /* NOTREACHED */
7128         }
7129
7130         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7131             /* note: this is called twice; first after popping B, then A */
7132             DEBUG_STACK_r({
7133                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7134                     depth, cur_eval, ST.prev_eval);
7135             });
7136
7137 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7138             if ( cur_eval && CUR_EVAL.close_paren ) {\
7139                 DEBUG_STACK_r({ \
7140                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7141                         depth,    \
7142                         CUR_EVAL.close_paren - 1,\
7143                         cur_eval, \
7144                         VAL);     \
7145                 });               \
7146                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7147             }
7148
7149             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7150
7151             rex_sv = ST.prev_rex;
7152             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7153             SET_reg_curpm(rex_sv);
7154             rex = ReANY(rex_sv);
7155             rexi = RXi_GET(rex);
7156             {
7157                 /* preserve $^R across LEAVE's. See Bug 121070. */
7158                 SV *save_sv= GvSV(PL_replgv);
7159                 SvREFCNT_inc(save_sv);
7160                 regcpblow(ST.cp); /* LEAVE in disguise */
7161                 sv_setsv(GvSV(PL_replgv), save_sv);
7162                 SvREFCNT_dec(save_sv);
7163             }
7164             cur_eval = ST.prev_eval;
7165             cur_curlyx = ST.prev_curlyx;
7166
7167             /* Invalidate cache. See "invalidate" comment above. */
7168             reginfo->poscache_maxiter = 0;
7169             if ( nochange_depth )
7170                 nochange_depth--;
7171
7172             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7173             sayYES;
7174
7175
7176         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7177             REGCP_UNWIND(ST.lastcp);
7178             sayNO;
7179
7180         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7181             /* note: this is called twice; first after popping B, then A */
7182             DEBUG_STACK_r({
7183                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7184                     depth, cur_eval, ST.prev_eval);
7185             });
7186
7187             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7188
7189             rex_sv = ST.prev_rex;
7190             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7191             SET_reg_curpm(rex_sv);
7192             rex = ReANY(rex_sv);
7193             rexi = RXi_GET(rex);
7194
7195             REGCP_UNWIND(ST.lastcp);
7196             regcppop(rex, &maxopenparen);
7197             cur_eval = ST.prev_eval;
7198             cur_curlyx = ST.prev_curlyx;
7199
7200             /* Invalidate cache. See "invalidate" comment above. */
7201             reginfo->poscache_maxiter = 0;
7202             if ( nochange_depth )
7203                 nochange_depth--;
7204
7205             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7206             sayNO_SILENT;
7207 #undef ST
7208
7209         case OPEN: /*  (  */
7210             n = ARG(scan);  /* which paren pair */
7211             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7212             if (n > maxopenparen)
7213                 maxopenparen = n;
7214             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7215                 "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7216                 depth,
7217                 PTR2UV(rex),
7218                 PTR2UV(rex->offs),
7219                 (UV)n,
7220                 (IV)rex->offs[n].start_tmp,
7221                 (UV)maxopenparen
7222             ));
7223             lastopen = n;
7224             break;
7225
7226 /* XXX really need to log other places start/end are set too */
7227 #define CLOSE_CAPTURE                                                      \
7228     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7229     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7230     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
7231         "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
7232         depth,                                                             \
7233         PTR2UV(rex),                                                       \
7234         PTR2UV(rex->offs),                                                 \
7235         (UV)n,                                                             \
7236         (IV)rex->offs[n].start,                                            \
7237         (IV)rex->offs[n].end                                               \
7238     ))
7239
7240         case CLOSE:  /*  )  */
7241             n = ARG(scan);  /* which paren pair */
7242             CLOSE_CAPTURE;
7243             if (n > rex->lastparen)
7244                 rex->lastparen = n;
7245             rex->lastcloseparen = n;
7246             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7247                 goto fake_end;
7248
7249             break;
7250
7251         case ACCEPT:  /*  (*ACCEPT)  */
7252             if (scan->flags)
7253                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7254             if (ARG2L(scan)){
7255                 regnode *cursor;
7256                 for (cursor=scan;
7257                      cursor && OP(cursor)!=END;
7258                      cursor=regnext(cursor))
7259                 {
7260                     if ( OP(cursor)==CLOSE ){
7261                         n = ARG(cursor);
7262                         if ( n <= lastopen ) {
7263                             CLOSE_CAPTURE;
7264                             if (n > rex->lastparen)
7265                                 rex->lastparen = n;
7266                             rex->lastcloseparen = n;
7267                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7268                                 break;
7269                         }
7270                     }
7271                 }
7272             }
7273             goto fake_end;
7274             /* NOTREACHED */
7275
7276         case GROUPP:  /*  (?(1))  */
7277             n = ARG(scan);  /* which paren pair */
7278             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7279             break;
7280
7281         case NGROUPP:  /*  (?(<name>))  */
7282             /* reg_check_named_buff_matched returns 0 for no match */
7283             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7284             break;
7285
7286         case INSUBP:   /*  (?(R))  */
7287             n = ARG(scan);
7288             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7289              * of SCAN is already set up as matches a eval.close_paren */
7290             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7291             break;
7292
7293         case DEFINEP:  /*  (?(DEFINE))  */
7294             sw = 0;
7295             break;
7296
7297         case IFTHEN:   /*  (?(cond)A|B)  */
7298             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7299             if (sw)
7300                 next = NEXTOPER(NEXTOPER(scan));
7301             else {
7302                 next = scan + ARG(scan);
7303                 if (OP(next) == IFTHEN) /* Fake one. */
7304                     next = NEXTOPER(NEXTOPER(next));
7305             }
7306             break;
7307
7308         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7309             logical = scan->flags;
7310             break;
7311
7312 /*******************************************************************
7313
7314 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7315 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7316 STAR/PLUS/CURLY/CURLYN are used instead.)
7317
7318 A*B is compiled as <CURLYX><A><WHILEM><B>
7319
7320 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7321 state, which contains the current count, initialised to -1. It also sets
7322 cur_curlyx to point to this state, with any previous value saved in the
7323 state block.
7324
7325 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7326 since the pattern may possibly match zero times (i.e. it's a while {} loop
7327 rather than a do {} while loop).
7328
7329 Each entry to WHILEM represents a successful match of A. The count in the
7330 CURLYX block is incremented, another WHILEM state is pushed, and execution
7331 passes to A or B depending on greediness and the current count.
7332
7333 For example, if matching against the string a1a2a3b (where the aN are
7334 substrings that match /A/), then the match progresses as follows: (the
7335 pushed states are interspersed with the bits of strings matched so far):
7336
7337     <CURLYX cnt=-1>
7338     <CURLYX cnt=0><WHILEM>
7339     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7340     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7341     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7342     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7343
7344 (Contrast this with something like CURLYM, which maintains only a single
7345 backtrack state:
7346
7347     <CURLYM cnt=0> a1
7348     a1 <CURLYM cnt=1> a2
7349     a1 a2 <CURLYM cnt=2> a3
7350     a1 a2 a3 <CURLYM cnt=3> b
7351 )
7352
7353 Each WHILEM state block marks a point to backtrack to upon partial failure
7354 of A or B, and also contains some minor state data related to that
7355 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7356 overall state, such as the count, and pointers to the A and B ops.
7357
7358 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7359 must always point to the *current* CURLYX block, the rules are:
7360
7361 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7362 and set cur_curlyx to point the new block.
7363
7364 When popping the CURLYX block after a successful or unsuccessful match,
7365 restore the previous cur_curlyx.
7366
7367 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7368 to the outer one saved in the CURLYX block.
7369
7370 When popping the WHILEM block after a successful or unsuccessful B match,
7371 restore the previous cur_curlyx.
7372
7373 Here's an example for the pattern (AI* BI)*BO
7374 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7375
7376 cur_
7377 curlyx backtrack stack
7378 ------ ---------------
7379 NULL
7380 CO     <CO prev=NULL> <WO>
7381 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7382 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7383 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7384
7385 At this point the pattern succeeds, and we work back down the stack to
7386 clean up, restoring as we go:
7387
7388 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7389 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7390 CO     <CO prev=NULL> <WO>
7391 NULL
7392
7393 *******************************************************************/
7394
7395 #define ST st->u.curlyx
7396
7397         case CURLYX:    /* start of /A*B/  (for complex A) */
7398         {
7399             /* No need to save/restore up to this paren */
7400             I32 parenfloor = scan->flags;
7401
7402             assert(next); /* keep Coverity happy */
7403             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7404                 next += ARG(next);
7405
7406             /* XXXX Probably it is better to teach regpush to support
7407                parenfloor > maxopenparen ... */
7408             if (parenfloor > (I32)rex->lastparen)
7409                 parenfloor = rex->lastparen; /* Pessimization... */
7410
7411             ST.prev_curlyx= cur_curlyx;
7412             cur_curlyx = st;
7413             ST.cp = PL_savestack_ix;
7414
7415             /* these fields contain the state of the current curly.
7416              * they are accessed by subsequent WHILEMs */
7417             ST.parenfloor = parenfloor;
7418             ST.me = scan;
7419             ST.B = next;
7420             ST.minmod = minmod;
7421             minmod = 0;
7422             ST.count = -1;      /* this will be updated by WHILEM */
7423             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7424
7425             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7426             NOT_REACHED; /* NOTREACHED */
7427         }
7428
7429         case CURLYX_end: /* just finished matching all of A*B */
7430             cur_curlyx = ST.prev_curlyx;
7431             sayYES;
7432             NOT_REACHED; /* NOTREACHED */
7433
7434         case CURLYX_end_fail: /* just failed to match all of A*B */
7435             regcpblow(ST.cp);
7436             cur_curlyx = ST.prev_curlyx;
7437             sayNO;
7438             NOT_REACHED; /* NOTREACHED */
7439
7440
7441 #undef ST
7442 #define ST st->u.whilem
7443
7444         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7445         {
7446             /* see the discussion above about CURLYX/WHILEM */
7447             I32 n;
7448             int min, max;
7449             regnode *A;
7450
7451             assert(cur_curlyx); /* keep Coverity happy */
7452
7453             min = ARG1(cur_curlyx->u.curlyx.me);
7454             max = ARG2(cur_curlyx->u.curlyx.me);
7455             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7456             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7457             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7458             ST.cache_offset = 0;
7459             ST.cache_mask = 0;
7460
7461
7462             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
7463                   depth, (long)n, min, max)
7464             );
7465
7466             /* First just match a string of min A's. */
7467
7468             if (n < min) {
7469                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7470                 cur_curlyx->u.curlyx.lastloc = locinput;
7471                 REGCP_SET(ST.lastcp);
7472
7473                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7474                 NOT_REACHED; /* NOTREACHED */
7475             }
7476
7477             /* If degenerate A matches "", assume A done. */
7478
7479             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7480                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
7481                    depth)
7482                 );
7483                 goto do_whilem_B_max;
7484             }
7485
7486             /* super-linear cache processing.
7487              *
7488              * The idea here is that for certain types of CURLYX/WHILEM -
7489              * principally those whose upper bound is infinity (and
7490              * excluding regexes that have things like \1 and other very
7491              * non-regular expresssiony things), then if a pattern like
7492              * /....A*.../ fails and we backtrack to the WHILEM, then we
7493              * make a note that this particular WHILEM op was at string
7494              * position 47 (say) when the rest of pattern failed. Then, if
7495              * we ever find ourselves back at that WHILEM, and at string
7496              * position 47 again, we can just fail immediately rather than
7497              * running the rest of the pattern again.
7498              *
7499              * This is very handy when patterns start to go
7500              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7501              * with a combinatorial explosion of backtracking.
7502              *
7503              * The cache is implemented as a bit array, with one bit per
7504              * string byte position per WHILEM op (up to 16) - so its
7505              * between 0.25 and 2x the string size.
7506              *
7507              * To avoid allocating a poscache buffer every time, we do an
7508              * initially countdown; only after we have  executed a WHILEM
7509              * op (string-length x #WHILEMs) times do we allocate the
7510              * cache.
7511              *
7512              * The top 4 bits of scan->flags byte say how many different
7513              * relevant CURLLYX/WHILEM op pairs there are, while the
7514              * bottom 4-bits is the identifying index number of this
7515              * WHILEM.
7516              */
7517
7518             if (scan->flags) {
7519
7520                 if (!reginfo->poscache_maxiter) {
7521                     /* start the countdown: Postpone detection until we
7522                      * know the match is not *that* much linear. */
7523                     reginfo->poscache_maxiter
7524                         =    (reginfo->strend - reginfo->strbeg + 1)
7525                            * (scan->flags>>4);
7526                     /* possible overflow for long strings and many CURLYX's */
7527                     if (reginfo->poscache_maxiter < 0)
7528                         reginfo->poscache_maxiter = I32_MAX;
7529                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7530                 }
7531
7532                 if (reginfo->poscache_iter-- == 0) {
7533                     /* initialise cache */
7534                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7535                     regmatch_info_aux *const aux = reginfo->info_aux;
7536                     if (aux->poscache) {
7537                         if ((SSize_t)reginfo->poscache_size < size) {
7538                             Renew(aux->poscache, size, char);
7539                             reginfo->poscache_size = size;
7540                         }
7541                         Zero(aux->poscache, size, char);
7542                     }
7543                     else {
7544                         reginfo->poscache_size = size;
7545                         Newxz(aux->poscache, size, char);
7546                     }
7547                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7548       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7549                               PL_colors[4], PL_colors[5])
7550                     );
7551                 }
7552
7553                 if (reginfo->poscache_iter < 0) {
7554                     /* have we already failed at this position? */
7555                     SSize_t offset, mask;
7556
7557                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7558                     offset  = (scan->flags & 0xf) - 1
7559                                 +   (locinput - reginfo->strbeg)
7560                                   * (scan->flags>>4);
7561                     mask    = 1 << (offset % 8);
7562                     offset /= 8;
7563                     if (reginfo->info_aux->poscache[offset] & mask) {
7564                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
7565                             depth)
7566                         );
7567                         cur_curlyx->u.curlyx.count--;
7568                         sayNO; /* cache records failure */
7569                     }
7570                     ST.cache_offset = offset;
7571                     ST.cache_mask   = mask;
7572                 }
7573             }
7574
7575             /* Prefer B over A for minimal matching. */
7576
7577             if (cur_curlyx->u.curlyx.minmod) {
7578                 ST.save_curlyx = cur_curlyx;
7579                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7580                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7581                                     locinput);
7582                 NOT_REACHED; /* NOTREACHED */
7583             }
7584
7585             /* Prefer A over B for maximal matching. */
7586
7587             if (n < max) { /* More greed allowed? */
7588                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7589                             maxopenparen);
7590                 cur_curlyx->u.curlyx.lastloc = locinput;
7591                 REGCP_SET(ST.lastcp);
7592                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7593                 NOT_REACHED; /* NOTREACHED */
7594             }
7595             goto do_whilem_B_max;
7596         }
7597         NOT_REACHED; /* NOTREACHED */
7598
7599         case WHILEM_B_min: /* just matched B in a minimal match */
7600         case WHILEM_B_max: /* just matched B in a maximal match */
7601             cur_curlyx = ST.save_curlyx;
7602             sayYES;
7603             NOT_REACHED; /* NOTREACHED */
7604
7605         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7606             cur_curlyx = ST.save_curlyx;
7607             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7608             cur_curlyx->u.curlyx.count--;
7609             CACHEsayNO;
7610             NOT_REACHED; /* NOTREACHED */
7611
7612         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7613             REGCP_UNWIND(ST.lastcp);
7614             regcppop(rex, &maxopenparen);
7615             /* FALLTHROUGH */
7616         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7617             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7618             cur_curlyx->u.curlyx.count--;
7619             CACHEsayNO;
7620             NOT_REACHED; /* NOTREACHED */
7621
7622         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7623             REGCP_UNWIND(ST.lastcp);
7624             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7625             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
7626                 depth)
7627             );
7628           do_whilem_B_max:
7629             if (cur_curlyx->u.curlyx.count >= REG_INFTY
7630                 && ckWARN(WARN_REGEXP)
7631                 && !reginfo->warned)
7632             {
7633                 reginfo->warned = TRUE;
7634                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7635                      "Complex regular subexpression recursion limit (%d) "
7636                      "exceeded",
7637                      REG_INFTY - 1);
7638             }
7639
7640             /* now try B */
7641             ST.save_curlyx = cur_curlyx;
7642             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7643             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7644                                 locinput);
7645             NOT_REACHED; /* NOTREACHED */
7646
7647         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7648             cur_curlyx = ST.save_curlyx;
7649
7650             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7651                 /* Maximum greed exceeded */
7652                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7653                     && ckWARN(WARN_REGEXP)
7654                     && !reginfo->warned)
7655                 {
7656                     reginfo->warned     = TRUE;
7657                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7658                         "Complex regular subexpression recursion "
7659                         "limit (%d) exceeded",
7660                         REG_INFTY - 1);
7661                 }
7662                 cur_curlyx->u.curlyx.count--;
7663                 CACHEsayNO;
7664             }
7665
7666             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
7667             );
7668             /* Try grabbing another A and see if it helps. */
7669             cur_curlyx->u.curlyx.lastloc = locinput;
7670             PUSH_STATE_GOTO(WHILEM_A_min,
7671                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7672                 locinput);
7673             NOT_REACHED; /* NOTREACHED */
7674
7675 #undef  ST
7676 #define ST st->u.branch
7677
7678         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
7679             next = scan + ARG(scan);
7680             if (next == scan)
7681                 next = NULL;
7682             scan = NEXTOPER(scan);
7683             /* FALLTHROUGH */
7684
7685         case BRANCH:        /*  /(...|A|...)/ */
7686             scan = NEXTOPER(scan); /* scan now points to inner node */
7687             ST.lastparen = rex->lastparen;
7688             ST.lastcloseparen = rex->lastcloseparen;
7689             ST.next_branch = next;
7690             REGCP_SET(ST.cp);
7691
7692             /* Now go into the branch */
7693             if (has_cutgroup) {
7694                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7695             } else {
7696                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7697             }
7698             NOT_REACHED; /* NOTREACHED */
7699
7700         case CUTGROUP:  /*  /(*THEN)/  */
7701             sv_yes_mark = st->u.mark.mark_name = scan->flags
7702                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7703                 : NULL;
7704             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7705             NOT_REACHED; /* NOTREACHED */
7706
7707         case CUTGROUP_next_fail:
7708             do_cutgroup = 1;
7709             no_final = 1;
7710             if (st->u.mark.mark_name)
7711                 sv_commit = st->u.mark.mark_name;
7712             sayNO;
7713             NOT_REACHED; /* NOTREACHED */
7714
7715         case BRANCH_next:
7716             sayYES;
7717             NOT_REACHED; /* NOTREACHED */
7718
7719         case BRANCH_next_fail: /* that branch failed; try the next, if any */
7720             if (do_cutgroup) {
7721                 do_cutgroup = 0;
7722                 no_final = 0;
7723             }
7724             REGCP_UNWIND(ST.cp);
7725             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7726             scan = ST.next_branch;
7727             /* no more branches? */
7728             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7729                 DEBUG_EXECUTE_r({
7730                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
7731                         depth,
7732                         PL_colors[4],
7733                         PL_colors[5] );
7734                 });
7735                 sayNO_SILENT;
7736             }
7737             continue; /* execute next BRANCH[J] op */
7738             /* NOTREACHED */
7739
7740         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7741             minmod = 1;
7742             break;
7743
7744 #undef  ST
7745 #define ST st->u.curlym
7746
7747         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7748
7749             /* This is an optimisation of CURLYX that enables us to push
7750              * only a single backtracking state, no matter how many matches
7751              * there are in {m,n}. It relies on the pattern being constant
7752              * length, with no parens to influence future backrefs
7753              */
7754
7755             ST.me = scan;
7756             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7757
7758             ST.lastparen      = rex->lastparen;
7759             ST.lastcloseparen = rex->lastcloseparen;
7760
7761             /* if paren positive, emulate an OPEN/CLOSE around A */
7762             if (ST.me->flags) {
7763                 U32 paren = ST.me->flags;
7764                 if (paren > maxopenparen)
7765                     maxopenparen = paren;
7766                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7767             }
7768             ST.A = scan;
7769             ST.B = next;
7770             ST.alen = 0;
7771             ST.count = 0;
7772             ST.minmod = minmod;
7773             minmod = 0;
7774             ST.c1 = CHRTEST_UNINIT;
7775             REGCP_SET(ST.cp);
7776
7777             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7778                 goto curlym_do_B;
7779
7780           curlym_do_A: /* execute the A in /A{m,n}B/  */
7781             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7782             NOT_REACHED; /* NOTREACHED */
7783
7784         case CURLYM_A: /* we've just matched an A */
7785             ST.count++;
7786             /* after first match, determine A's length: u.curlym.alen */
7787             if (ST.count == 1) {
7788                 if (reginfo->is_utf8_target) {
7789                     char *s = st->locinput;
7790                     while (s < locinput) {
7791                         ST.alen++;
7792                         s += UTF8SKIP(s);
7793                     }
7794                 }
7795                 else {
7796                     ST.alen = locinput - st->locinput;
7797                 }
7798                 if (ST.alen == 0)
7799                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7800             }
7801             DEBUG_EXECUTE_r(
7802                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
7803                           depth, (IV) ST.count, (IV)ST.alen)
7804             );
7805
7806             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7807                 goto fake_end;
7808
7809             {
7810                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7811                 if ( max == REG_INFTY || ST.count < max )
7812                     goto curlym_do_A; /* try to match another A */
7813             }
7814             goto curlym_do_B; /* try to match B */
7815
7816         case CURLYM_A_fail: /* just failed to match an A */
7817             REGCP_UNWIND(ST.cp);
7818
7819
7820             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7821                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7822                 sayNO;
7823
7824           curlym_do_B: /* execute the B in /A{m,n}B/  */
7825             if (ST.c1 == CHRTEST_UNINIT) {
7826                 /* calculate c1 and c2 for possible match of 1st char
7827                  * following curly */
7828                 ST.c1 = ST.c2 = CHRTEST_VOID;
7829                 assert(ST.B);
7830                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7831                     regnode *text_node = ST.B;
7832                     if (! HAS_TEXT(text_node))
7833                         FIND_NEXT_IMPT(text_node);
7834                     /* this used to be
7835
7836                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7837
7838                         But the former is redundant in light of the latter.
7839
7840                         if this changes back then the macro for
7841                         IS_TEXT and friends need to change.
7842                      */
7843                     if (PL_regkind[OP(text_node)] == EXACT) {
7844                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7845                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7846                            reginfo))
7847                         {
7848                             sayNO;
7849                         }
7850                     }
7851                 }
7852             }
7853
7854             DEBUG_EXECUTE_r(
7855                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
7856                     depth, (IV)ST.count)
7857                 );
7858             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7859                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7860                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7861                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7862                     {
7863                         /* simulate B failing */
7864                         DEBUG_OPTIMISE_r(
7865                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
7866                                 depth,
7867                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7868                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7869                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7870                         );
7871                         state_num = CURLYM_B_fail;
7872                         goto reenter_switch;
7873                     }
7874                 }
7875                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7876                     /* simulate B failing */
7877                     DEBUG_OPTIMISE_r(
7878                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7879                             depth,
7880                             (int) nextchr, ST.c1, ST.c2)
7881                     );
7882                     state_num = CURLYM_B_fail;
7883                     goto reenter_switch;
7884                 }
7885             }
7886
7887             if (ST.me->flags) {
7888                 /* emulate CLOSE: mark current A as captured */
7889                 I32 paren = ST.me->flags;
7890                 if (ST.count) {
7891                     rex->offs[paren].start
7892                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7893                     rex->offs[paren].end = locinput - reginfo->strbeg;
7894                     if ((U32)paren > rex->lastparen)
7895                         rex->lastparen = paren;
7896                     rex->lastcloseparen = paren;
7897                 }
7898                 else
7899                     rex->offs[paren].end = -1;
7900
7901                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7902                 {
7903                     if (ST.count)
7904                         goto fake_end;
7905                     else
7906                         sayNO;
7907                 }
7908             }
7909
7910             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7911             NOT_REACHED; /* NOTREACHED */
7912
7913         case CURLYM_B_fail: /* just failed to match a B */
7914             REGCP_UNWIND(ST.cp);
7915             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7916             if (ST.minmod) {
7917                 I32 max = ARG2(ST.me);
7918                 if (max != REG_INFTY && ST.count == max)
7919                     sayNO;
7920                 goto curlym_do_A; /* try to match a further A */
7921             }
7922             /* backtrack one A */
7923             if (ST.count == ARG1(ST.me) /* min */)
7924                 sayNO;
7925             ST.count--;
7926             SET_locinput(HOPc(locinput, -ST.alen));
7927             goto curlym_do_B; /* try to match B */
7928
7929 #undef ST
7930 #define ST st->u.curly
7931
7932 #define CURLY_SETPAREN(paren, success) \
7933     if (paren) { \
7934         if (success) { \
7935             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7936             rex->offs[paren].end = locinput - reginfo->strbeg; \
7937             if (paren > rex->lastparen) \
7938                 rex->lastparen = paren; \
7939             rex->lastcloseparen = paren; \
7940         } \
7941         else { \
7942             rex->offs[paren].end = -1; \
7943             rex->lastparen      = ST.lastparen; \
7944             rex->lastcloseparen = ST.lastcloseparen; \
7945         } \
7946     }
7947
7948         case STAR:              /*  /A*B/ where A is width 1 char */
7949             ST.paren = 0;
7950             ST.min = 0;
7951             ST.max = REG_INFTY;
7952             scan = NEXTOPER(scan);
7953             goto repeat;
7954
7955         case PLUS:              /*  /A+B/ where A is width 1 char */
7956             ST.paren = 0;
7957             ST.min = 1;
7958             ST.max = REG_INFTY;
7959             scan = NEXTOPER(scan);
7960             goto repeat;
7961
7962         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
7963             ST.paren = scan->flags;     /* Which paren to set */
7964             ST.lastparen      = rex->lastparen;
7965             ST.lastcloseparen = rex->lastcloseparen;
7966             if (ST.paren > maxopenparen)
7967                 maxopenparen = ST.paren;
7968             ST.min = ARG1(scan);  /* min to match */
7969             ST.max = ARG2(scan);  /* max to match */
7970             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
7971             {
7972                 ST.min=1;
7973                 ST.max=1;
7974             }
7975             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7976             goto repeat;
7977
7978         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
7979             ST.paren = 0;
7980             ST.min = ARG1(scan);  /* min to match */
7981             ST.max = ARG2(scan);  /* max to match */
7982             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7983           repeat:
7984             /*
7985             * Lookahead to avoid useless match attempts
7986             * when we know what character comes next.
7987             *
7988             * Used to only do .*x and .*?x, but now it allows
7989             * for )'s, ('s and (?{ ... })'s to be in the way
7990             * of the quantifier and the EXACT-like node.  -- japhy
7991             */
7992
7993             assert(ST.min <= ST.max);
7994             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7995                 ST.c1 = ST.c2 = CHRTEST_VOID;
7996             }
7997             else {
7998                 regnode *text_node = next;
7999
8000                 if (! HAS_TEXT(text_node))
8001                     FIND_NEXT_IMPT(text_node);
8002
8003                 if (! HAS_TEXT(text_node))
8004                     ST.c1 = ST.c2 = CHRTEST_VOID;
8005                 else {
8006                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8007                         ST.c1 = ST.c2 = CHRTEST_VOID;
8008                     }
8009                     else {
8010
8011                     /*  Currently we only get here when
8012
8013                         PL_rekind[OP(text_node)] == EXACT
8014
8015                         if this changes back then the macro for IS_TEXT and
8016                         friends need to change. */
8017                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8018                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8019                            reginfo))
8020                         {
8021                             sayNO;
8022                         }
8023                     }
8024                 }
8025             }
8026
8027             ST.A = scan;
8028             ST.B = next;
8029             if (minmod) {
8030                 char *li = locinput;
8031                 minmod = 0;
8032                 if (ST.min &&
8033                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8034                             < ST.min)
8035                     sayNO;
8036                 SET_locinput(li);
8037                 ST.count = ST.min;
8038                 REGCP_SET(ST.cp);
8039                 if (ST.c1 == CHRTEST_VOID)
8040                     goto curly_try_B_min;
8041
8042                 ST.oldloc = locinput;
8043
8044                 /* set ST.maxpos to the furthest point along the
8045                  * string that could possibly match */
8046                 if  (ST.max == REG_INFTY) {
8047                     ST.maxpos = reginfo->strend - 1;
8048                     if (utf8_target)
8049                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8050                             ST.maxpos--;
8051                 }
8052                 else if (utf8_target) {
8053                     int m = ST.max - ST.min;
8054                     for (ST.maxpos = locinput;
8055                          m >0 && ST.maxpos < reginfo->strend; m--)
8056                         ST.maxpos += UTF8SKIP(ST.maxpos);
8057                 }
8058                 else {
8059                     ST.maxpos = locinput + ST.max - ST.min;
8060                     if (ST.maxpos >= reginfo->strend)
8061                         ST.maxpos = reginfo->strend - 1;
8062                 }
8063                 goto curly_try_B_min_known;
8064
8065             }
8066             else {
8067                 /* avoid taking address of locinput, so it can remain
8068                  * a register var */
8069                 char *li = locinput;
8070                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8071                 if (ST.count < ST.min)
8072                     sayNO;
8073                 SET_locinput(li);
8074                 if ((ST.count > ST.min)
8075                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8076                 {
8077                     /* A{m,n} must come at the end of the string, there's
8078                      * no point in backing off ... */
8079                     ST.min = ST.count;
8080                     /* ...except that $ and \Z can match before *and* after
8081                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8082                        We may back off by one in this case. */
8083                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8084                         ST.min--;
8085                 }
8086                 REGCP_SET(ST.cp);
8087                 goto curly_try_B_max;
8088             }
8089             NOT_REACHED; /* NOTREACHED */
8090
8091         case CURLY_B_min_known_fail:
8092             /* failed to find B in a non-greedy match where c1,c2 valid */
8093
8094             REGCP_UNWIND(ST.cp);
8095             if (ST.paren) {
8096                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8097             }
8098             /* Couldn't or didn't -- move forward. */
8099             ST.oldloc = locinput;
8100             if (utf8_target)
8101                 locinput += UTF8SKIP(locinput);
8102             else
8103                 locinput++;
8104             ST.count++;
8105           curly_try_B_min_known:
8106              /* find the next place where 'B' could work, then call B */
8107             {
8108                 int n;
8109                 if (utf8_target) {
8110                     n = (ST.oldloc == locinput) ? 0 : 1;
8111                     if (ST.c1 == ST.c2) {
8112                         /* set n to utf8_distance(oldloc, locinput) */
8113                         while (locinput <= ST.maxpos
8114                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8115                         {
8116                             locinput += UTF8SKIP(locinput);
8117                             n++;
8118                         }
8119                     }
8120                     else {
8121                         /* set n to utf8_distance(oldloc, locinput) */
8122                         while (locinput <= ST.maxpos
8123                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8124                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8125                         {
8126                             locinput += UTF8SKIP(locinput);
8127                             n++;
8128                         }
8129                     }
8130                 }
8131                 else {  /* Not utf8_target */
8132                     if (ST.c1 == ST.c2) {
8133                         while (locinput <= ST.maxpos &&
8134                                UCHARAT(locinput) != ST.c1)
8135                             locinput++;
8136                     }
8137                     else {
8138                         while (locinput <= ST.maxpos
8139                                && UCHARAT(locinput) != ST.c1
8140                                && UCHARAT(locinput) != ST.c2)
8141                             locinput++;
8142                     }
8143                     n = locinput - ST.oldloc;
8144                 }
8145                 if (locinput > ST.maxpos)
8146                     sayNO;
8147                 if (n) {
8148                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8149                      * at b; check that everything between oldloc and
8150                      * locinput matches */
8151                     char *li = ST.oldloc;
8152                     ST.count += n;
8153                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8154                         sayNO;
8155                     assert(n == REG_INFTY || locinput == li);
8156                 }
8157                 CURLY_SETPAREN(ST.paren, ST.count);
8158                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8159                     goto fake_end;
8160                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8161             }
8162             NOT_REACHED; /* NOTREACHED */
8163
8164         case CURLY_B_min_fail:
8165             /* failed to find B in a non-greedy match where c1,c2 invalid */
8166
8167             REGCP_UNWIND(ST.cp);
8168             if (ST.paren) {
8169                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8170             }
8171             /* failed -- move forward one */
8172             {
8173                 char *li = locinput;
8174                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8175                     sayNO;
8176                 }
8177                 locinput = li;
8178             }
8179             {
8180                 ST.count++;
8181                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8182                         ST.count > 0)) /* count overflow ? */
8183                 {
8184                   curly_try_B_min:
8185                     CURLY_SETPAREN(ST.paren, ST.count);
8186                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8187                         goto fake_end;
8188                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8189                 }
8190             }
8191             sayNO;
8192             NOT_REACHED; /* NOTREACHED */
8193
8194           curly_try_B_max:
8195             /* a successful greedy match: now try to match B */
8196             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8197                 goto fake_end;
8198             {
8199                 bool could_match = locinput < reginfo->strend;
8200
8201                 /* If it could work, try it. */
8202                 if (ST.c1 != CHRTEST_VOID && could_match) {
8203                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8204                     {
8205                         could_match = memEQ(locinput,
8206                                             ST.c1_utf8,
8207                                             UTF8SKIP(locinput))
8208                                     || memEQ(locinput,
8209                                              ST.c2_utf8,
8210                                              UTF8SKIP(locinput));
8211                     }
8212                     else {
8213                         could_match = UCHARAT(locinput) == ST.c1
8214                                       || UCHARAT(locinput) == ST.c2;
8215                     }
8216                 }
8217                 if (ST.c1 == CHRTEST_VOID || could_match) {
8218                     CURLY_SETPAREN(ST.paren, ST.count);
8219                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8220                     NOT_REACHED; /* NOTREACHED */
8221                 }
8222             }
8223             /* FALLTHROUGH */
8224
8225         case CURLY_B_max_fail:
8226             /* failed to find B in a greedy match */
8227
8228             REGCP_UNWIND(ST.cp);
8229             if (ST.paren) {
8230                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8231             }
8232             /*  back up. */
8233             if (--ST.count < ST.min)
8234                 sayNO;
8235             locinput = HOPc(locinput, -1);
8236             goto curly_try_B_max;
8237
8238 #undef ST
8239
8240         case END: /*  last op of main pattern  */
8241           fake_end:
8242             if (cur_eval) {
8243                 /* we've just finished A in /(??{A})B/; now continue with B */
8244                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8245                 st->u.eval.prev_rex = rex_sv;           /* inner */
8246
8247                 /* Save *all* the positions. */
8248                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8249                 rex_sv = CUR_EVAL.prev_rex;
8250                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8251                 SET_reg_curpm(rex_sv);
8252                 rex = ReANY(rex_sv);
8253                 rexi = RXi_GET(rex);
8254
8255                 st->u.eval.prev_curlyx = cur_curlyx;
8256                 cur_curlyx = CUR_EVAL.prev_curlyx;
8257
8258                 REGCP_SET(st->u.eval.lastcp);
8259
8260                 /* Restore parens of the outer rex without popping the
8261                  * savestack */
8262                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8263
8264                 st->u.eval.prev_eval = cur_eval;
8265                 cur_eval = CUR_EVAL.prev_eval;
8266                 DEBUG_EXECUTE_r(
8267                     Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
8268                                       depth, cur_eval););
8269                 if ( nochange_depth )
8270                     nochange_depth--;
8271
8272                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8273
8274                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8275                                     locinput); /* match B */
8276             }
8277
8278             if (locinput < reginfo->till) {
8279                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8280                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8281                                       PL_colors[4],
8282                                       (long)(locinput - startpos),
8283                                       (long)(reginfo->till - startpos),
8284                                       PL_colors[5]));
8285
8286                 sayNO_SILENT;           /* Cannot match: too short. */
8287             }
8288             sayYES;                     /* Success! */
8289
8290         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8291             DEBUG_EXECUTE_r(
8292             Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
8293                 depth, PL_colors[4], PL_colors[5]));
8294             sayYES;                     /* Success! */
8295
8296 #undef  ST
8297 #define ST st->u.ifmatch
8298
8299         {
8300             char *newstart;
8301
8302         case SUSPEND:   /* (?>A) */
8303             ST.wanted = 1;
8304             newstart = locinput;
8305             goto do_ifmatch;
8306
8307         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8308             ST.wanted = 0;
8309             goto ifmatch_trivial_fail_test;
8310
8311         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8312             ST.wanted = 1;
8313           ifmatch_trivial_fail_test:
8314             if (scan->flags) {
8315                 char * const s = HOPBACKc(locinput, scan->flags);
8316                 if (!s) {
8317                     /* trivial fail */
8318                     if (logical) {
8319                         logical = 0;
8320                         sw = 1 - cBOOL(ST.wanted);
8321                     }
8322                     else if (ST.wanted)
8323                         sayNO;
8324                     next = scan + ARG(scan);
8325                     if (next == scan)
8326                         next = NULL;
8327                     break;
8328                 }
8329                 newstart = s;
8330             }
8331             else
8332                 newstart = locinput;
8333
8334           do_ifmatch:
8335             ST.me = scan;
8336             ST.logical = logical;
8337             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8338
8339             /* execute body of (?...A) */
8340             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8341             NOT_REACHED; /* NOTREACHED */
8342         }
8343
8344         case IFMATCH_A_fail: /* body of (?...A) failed */
8345             ST.wanted = !ST.wanted;
8346             /* FALLTHROUGH */
8347
8348         case IFMATCH_A: /* body of (?...A) succeeded */
8349             if (ST.logical) {
8350                 sw = cBOOL(ST.wanted);
8351             }
8352             else if (!ST.wanted)
8353                 sayNO;
8354
8355             if (OP(ST.me) != SUSPEND) {
8356                 /* restore old position except for (?>...) */
8357                 locinput = st->locinput;
8358             }
8359             scan = ST.me + ARG(ST.me);
8360             if (scan == ST.me)
8361                 scan = NULL;
8362             continue; /* execute B */
8363
8364 #undef ST
8365
8366         case LONGJMP: /*  alternative with many branches compiles to
8367                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8368             next = scan + ARG(scan);
8369             if (next == scan)
8370                 next = NULL;
8371             break;
8372
8373         case COMMIT:  /*  (*COMMIT)  */
8374             reginfo->cutpoint = reginfo->strend;
8375             /* FALLTHROUGH */
8376
8377         case PRUNE:   /*  (*PRUNE)   */
8378             if (scan->flags)
8379                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8380             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8381             NOT_REACHED; /* NOTREACHED */
8382
8383         case COMMIT_next_fail:
8384             no_final = 1;
8385             /* FALLTHROUGH */
8386             sayNO;
8387             NOT_REACHED; /* NOTREACHED */
8388
8389         case OPFAIL:   /* (*FAIL)  */
8390             if (scan->flags)
8391                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8392             if (logical) {
8393                 /* deal with (?(?!)X|Y) properly,
8394                  * make sure we trigger the no branch
8395                  * of the trailing IFTHEN structure*/
8396                 sw= 0;
8397                 break;
8398             } else {
8399                 sayNO;
8400             }
8401             NOT_REACHED; /* NOTREACHED */
8402
8403 #define ST st->u.mark
8404         case MARKPOINT: /*  (*MARK:foo)  */
8405             ST.prev_mark = mark_state;
8406             ST.mark_name = sv_commit = sv_yes_mark
8407                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8408             mark_state = st;
8409             ST.mark_loc = locinput;
8410             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8411             NOT_REACHED; /* NOTREACHED */
8412
8413         case MARKPOINT_next:
8414             mark_state = ST.prev_mark;
8415             sayYES;
8416             NOT_REACHED; /* NOTREACHED */
8417
8418         case MARKPOINT_next_fail:
8419             if (popmark && sv_eq(ST.mark_name,popmark))
8420             {
8421                 if (ST.mark_loc > startpoint)
8422                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8423                 popmark = NULL; /* we found our mark */
8424                 sv_commit = ST.mark_name;
8425
8426                 DEBUG_EXECUTE_r({
8427                         Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%" SVf "...%s\n",
8428                             depth,
8429                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8430                 });
8431             }
8432             mark_state = ST.prev_mark;
8433             sv_yes_mark = mark_state ?
8434                 mark_state->u.mark.mark_name : NULL;
8435             sayNO;
8436             NOT_REACHED; /* NOTREACHED */
8437
8438         case SKIP:  /*  (*SKIP)  */
8439             if (!scan->flags) {
8440                 /* (*SKIP) : if we fail we cut here*/
8441                 ST.mark_name = NULL;
8442                 ST.mark_loc = locinput;
8443                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8444             } else {
8445                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8446                    otherwise do nothing.  Meaning we need to scan
8447                  */
8448                 regmatch_state *cur = mark_state;
8449                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8450
8451                 while (cur) {
8452                     if ( sv_eq( cur->u.mark.mark_name,
8453                                 find ) )
8454                     {
8455                         ST.mark_name = find;
8456                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8457                     }
8458                     cur = cur->u.mark.prev_mark;
8459                 }
8460             }
8461             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8462             break;
8463
8464         case SKIP_next_fail:
8465             if (ST.mark_name) {
8466                 /* (*CUT:NAME) - Set up to search for the name as we
8467                    collapse the stack*/
8468                 popmark = ST.mark_name;
8469             } else {
8470                 /* (*CUT) - No name, we cut here.*/
8471                 if (ST.mark_loc > startpoint)
8472                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8473                 /* but we set sv_commit to latest mark_name if there
8474                    is one so they can test to see how things lead to this
8475                    cut */
8476                 if (mark_state)
8477                     sv_commit=mark_state->u.mark.mark_name;
8478             }
8479             no_final = 1;
8480             sayNO;
8481             NOT_REACHED; /* NOTREACHED */
8482 #undef ST
8483
8484         case LNBREAK: /* \R */
8485             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8486                 locinput += n;
8487             } else
8488                 sayNO;
8489             break;
8490
8491         default:
8492             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8493                           PTR2UV(scan), OP(scan));
8494             Perl_croak(aTHX_ "regexp memory corruption");
8495
8496         /* this is a point to jump to in order to increment
8497          * locinput by one character */
8498           increment_locinput:
8499             assert(!NEXTCHR_IS_EOS);
8500             if (utf8_target) {
8501                 locinput += PL_utf8skip[nextchr];
8502                 /* locinput is allowed to go 1 char off the end (signifying
8503                  * EOS), but not 2+ */
8504                 if (locinput > reginfo->strend)
8505                     sayNO;
8506             }
8507             else
8508                 locinput++;
8509             break;
8510
8511         } /* end switch */
8512
8513         /* switch break jumps here */
8514         scan = next; /* prepare to execute the next op and ... */
8515         continue;    /* ... jump back to the top, reusing st */
8516         /* NOTREACHED */
8517
8518       push_yes_state:
8519         /* push a state that backtracks on success */
8520         st->u.yes.prev_yes_state = yes_state;
8521         yes_state = st;
8522         /* FALLTHROUGH */
8523       push_state:
8524         /* push a new regex state, then continue at scan  */
8525         {
8526             regmatch_state *newst;
8527
8528             DEBUG_STACK_r({
8529                 regmatch_state *cur = st;
8530                 regmatch_state *curyes = yes_state;
8531                 U32 i;
8532                 regmatch_slab *slab = PL_regmatch_slab;
8533                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8534                     if (cur < SLAB_FIRST(slab)) {
8535                         slab = slab->prev;
8536                         cur = SLAB_LAST(slab);
8537                     }
8538                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8539                         depth,
8540                         i ? "    " : "push",
8541                         depth - i, PL_reg_name[cur->resume_state],
8542                         (curyes == cur) ? "yes" : ""
8543                     );
8544                     if (curyes == cur)
8545                         curyes = cur->u.yes.prev_yes_state;
8546                 }
8547             } else
8548                 DEBUG_STATE_pp("push")
8549             );
8550             depth++;
8551             st->locinput = locinput;
8552             newst = st+1;
8553             if (newst >  SLAB_LAST(PL_regmatch_slab))
8554                 newst = S_push_slab(aTHX);
8555             PL_regmatch_state = newst;
8556
8557             locinput = pushinput;
8558             st = newst;
8559             continue;
8560             /* NOTREACHED */
8561         }
8562     }
8563 #ifdef SOLARIS_BAD_OPTIMIZER
8564 #  undef PL_charclass
8565 #endif
8566
8567     /*
8568     * We get here only if there's trouble -- normally "case END" is
8569     * the terminating point.
8570     */
8571     Perl_croak(aTHX_ "corrupted regexp pointers");
8572     NOT_REACHED; /* NOTREACHED */
8573
8574   yes:
8575     if (yes_state) {
8576         /* we have successfully completed a subexpression, but we must now
8577          * pop to the state marked by yes_state and continue from there */
8578         assert(st != yes_state);
8579 #ifdef DEBUGGING
8580         while (st != yes_state) {
8581             st--;
8582             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8583                 PL_regmatch_slab = PL_regmatch_slab->prev;
8584                 st = SLAB_LAST(PL_regmatch_slab);
8585             }
8586             DEBUG_STATE_r({
8587                 if (no_final) {
8588                     DEBUG_STATE_pp("pop (no final)");
8589                 } else {
8590                     DEBUG_STATE_pp("pop (yes)");
8591                 }
8592             });
8593             depth--;
8594         }
8595 #else
8596         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8597             || yes_state > SLAB_LAST(PL_regmatch_slab))
8598         {
8599             /* not in this slab, pop slab */
8600             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8601             PL_regmatch_slab = PL_regmatch_slab->prev;
8602             st = SLAB_LAST(PL_regmatch_slab);
8603         }
8604         depth -= (st - yes_state);
8605 #endif
8606         st = yes_state;
8607         yes_state = st->u.yes.prev_yes_state;
8608         PL_regmatch_state = st;
8609
8610         if (no_final)
8611             locinput= st->locinput;
8612         state_num = st->resume_state + no_final;
8613         goto reenter_switch;
8614     }
8615
8616     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
8617                           PL_colors[4], PL_colors[5]));
8618
8619     if (reginfo->info_aux_eval) {
8620         /* each successfully executed (?{...}) block does the equivalent of
8621          *   local $^R = do {...}
8622          * When popping the save stack, all these locals would be undone;
8623          * bypass this by setting the outermost saved $^R to the latest
8624          * value */
8625         /* I dont know if this is needed or works properly now.
8626          * see code related to PL_replgv elsewhere in this file.
8627          * Yves
8628          */
8629         if (oreplsv != GvSV(PL_replgv))
8630             sv_setsv(oreplsv, GvSV(PL_replgv));
8631     }
8632     result = 1;
8633     goto final_exit;
8634
8635   no:
8636     DEBUG_EXECUTE_r(
8637         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
8638             depth,
8639             PL_colors[4], PL_colors[5])
8640         );
8641
8642   no_silent:
8643     if (no_final) {
8644         if (yes_state) {
8645             goto yes;
8646         } else {
8647             goto final_exit;
8648         }
8649     }
8650     if (depth) {
8651         /* there's a previous state to backtrack to */
8652         st--;
8653         if (st < SLAB_FIRST(PL_regmatch_slab)) {
8654             PL_regmatch_slab = PL_regmatch_slab->prev;
8655             st = SLAB_LAST(PL_regmatch_slab);
8656         }
8657         PL_regmatch_state = st;
8658         locinput= st->locinput;
8659
8660         DEBUG_STATE_pp("pop");
8661         depth--;
8662         if (yes_state == st)
8663             yes_state = st->u.yes.prev_yes_state;
8664
8665         state_num = st->resume_state + 1; /* failure = success + 1 */
8666         PERL_ASYNC_CHECK();
8667         goto reenter_switch;
8668     }
8669     result = 0;
8670
8671   final_exit:
8672     if (rex->intflags & PREGf_VERBARG_SEEN) {
8673         SV *sv_err = get_sv("REGERROR", 1);
8674         SV *sv_mrk = get_sv("REGMARK", 1);
8675         if (result) {
8676             sv_commit = &PL_sv_no;
8677             if (!sv_yes_mark)
8678                 sv_yes_mark = &PL_sv_yes;
8679         } else {
8680             if (!sv_commit)
8681                 sv_commit = &PL_sv_yes;
8682             sv_yes_mark = &PL_sv_no;
8683         }
8684         assert(sv_err);
8685         assert(sv_mrk);
8686         sv_setsv(sv_err, sv_commit);
8687         sv_setsv(sv_mrk, sv_yes_mark);
8688     }
8689
8690
8691     if (last_pushed_cv) {
8692         dSP;
8693         /* see "Some notes about MULTICALL" above */
8694         POP_MULTICALL;
8695         PERL_UNUSED_VAR(SP);
8696     }
8697     else
8698         LEAVE_SCOPE(orig_savestack_ix);
8699
8700     assert(!result ||  locinput - reginfo->strbeg >= 0);
8701     return result ?  locinput - reginfo->strbeg : -1;
8702 }
8703
8704 /*
8705  - regrepeat - repeatedly match something simple, report how many
8706  *
8707  * What 'simple' means is a node which can be the operand of a quantifier like
8708  * '+', or {1,3}
8709  *
8710  * startposp - pointer a pointer to the start position.  This is updated
8711  *             to point to the byte following the highest successful
8712  *             match.
8713  * p         - the regnode to be repeatedly matched against.
8714  * reginfo   - struct holding match state, such as strend
8715  * max       - maximum number of things to match.
8716  * depth     - (for debugging) backtracking depth.
8717  */
8718 STATIC I32
8719 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8720             regmatch_info *const reginfo, I32 max _pDEPTH)
8721 {
8722     char *scan;     /* Pointer to current position in target string */
8723     I32 c;
8724     char *loceol = reginfo->strend;   /* local version */
8725     I32 hardcount = 0;  /* How many matches so far */
8726     bool utf8_target = reginfo->is_utf8_target;
8727     unsigned int to_complement = 0;  /* Invert the result? */
8728     UV utf8_flags;
8729     _char_class_number classnum;
8730
8731     PERL_ARGS_ASSERT_REGREPEAT;
8732
8733     scan = *startposp;
8734     if (max == REG_INFTY)
8735         max = I32_MAX;
8736     else if (! utf8_target && loceol - scan > max)
8737         loceol = scan + max;
8738
8739     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8740      * to the maximum of how far we should go in it (leaving it set to the real
8741      * end, if the maximum permissible would take us beyond that).  This allows
8742      * us to make the loop exit condition that we haven't gone past <loceol> to
8743      * also mean that we haven't exceeded the max permissible count, saving a
8744      * test each time through the loop.  But it assumes that the OP matches a
8745      * single byte, which is true for most of the OPs below when applied to a
8746      * non-UTF-8 target.  Those relatively few OPs that don't have this
8747      * characteristic will have to compensate.
8748      *
8749      * There is no adjustment for UTF-8 targets, as the number of bytes per
8750      * character varies.  OPs will have to test both that the count is less
8751      * than the max permissible (using <hardcount> to keep track), and that we
8752      * are still within the bounds of the string (using <loceol>.  A few OPs
8753      * match a single byte no matter what the encoding.  They can omit the max
8754      * test if, for the UTF-8 case, they do the adjustment that was skipped
8755      * above.
8756      *
8757      * Thus, the code above sets things up for the common case; and exceptional
8758      * cases need extra work; the common case is to make sure <scan> doesn't
8759      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8760      * count doesn't exceed the maximum permissible */
8761
8762     switch (OP(p)) {
8763     case REG_ANY:
8764         if (utf8_target) {
8765             while (scan < loceol && hardcount < max && *scan != '\n') {
8766                 scan += UTF8SKIP(scan);
8767                 hardcount++;
8768             }
8769         } else {
8770             while (scan < loceol && *scan != '\n')
8771                 scan++;
8772         }
8773         break;
8774     case SANY:
8775         if (utf8_target) {
8776             while (scan < loceol && hardcount < max) {
8777                 scan += UTF8SKIP(scan);
8778                 hardcount++;
8779             }
8780         }
8781         else
8782             scan = loceol;
8783         break;
8784     case EXACTL:
8785         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8786         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8787             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8788         }
8789         /* FALLTHROUGH */
8790     case EXACT:
8791         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8792
8793         c = (U8)*STRING(p);
8794
8795         /* Can use a simple loop if the pattern char to match on is invariant
8796          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8797          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8798          * true iff it doesn't matter if the argument is in UTF-8 or not */
8799         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8800             if (utf8_target && loceol - scan > max) {
8801                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8802                  * since here, to match at all, 1 char == 1 byte */
8803                 loceol = scan + max;
8804             }
8805             while (scan < loceol && UCHARAT(scan) == c) {
8806                 scan++;
8807             }
8808         }
8809         else if (reginfo->is_utf8_pat) {
8810             if (utf8_target) {
8811                 STRLEN scan_char_len;
8812
8813                 /* When both target and pattern are UTF-8, we have to do
8814                  * string EQ */
8815                 while (hardcount < max
8816                        && scan < loceol
8817                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8818                        && memEQ(scan, STRING(p), scan_char_len))
8819                 {
8820                     scan += scan_char_len;
8821                     hardcount++;
8822                 }
8823             }
8824             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8825
8826                 /* Target isn't utf8; convert the character in the UTF-8
8827                  * pattern to non-UTF8, and do a simple loop */
8828                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8829                 while (scan < loceol && UCHARAT(scan) == c) {
8830                     scan++;
8831                 }
8832             } /* else pattern char is above Latin1, can't possibly match the
8833                  non-UTF-8 target */
8834         }
8835         else {
8836
8837             /* Here, the string must be utf8; pattern isn't, and <c> is
8838              * different in utf8 than not, so can't compare them directly.
8839              * Outside the loop, find the two utf8 bytes that represent c, and
8840              * then look for those in sequence in the utf8 string */
8841             U8 high = UTF8_TWO_BYTE_HI(c);
8842             U8 low = UTF8_TWO_BYTE_LO(c);
8843
8844             while (hardcount < max
8845                     && scan + 1 < loceol
8846                     && UCHARAT(scan) == high
8847                     && UCHARAT(scan + 1) == low)
8848             {
8849                 scan += 2;
8850                 hardcount++;
8851             }
8852         }
8853         break;
8854
8855     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8856         assert(! reginfo->is_utf8_pat);
8857         /* FALLTHROUGH */
8858     case EXACTFA:
8859         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8860         goto do_exactf;
8861
8862     case EXACTFL:
8863         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8864         utf8_flags = FOLDEQ_LOCALE;
8865         goto do_exactf;
8866
8867     case EXACTF:   /* This node only generated for non-utf8 patterns */
8868         assert(! reginfo->is_utf8_pat);
8869         utf8_flags = 0;
8870         goto do_exactf;
8871
8872     case EXACTFLU8:
8873         if (! utf8_target) {
8874             break;
8875         }
8876         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8877                                     | FOLDEQ_S2_FOLDS_SANE;
8878         goto do_exactf;
8879
8880     case EXACTFU_SS:
8881     case EXACTFU:
8882         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8883
8884       do_exactf: {
8885         int c1, c2;
8886         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8887
8888         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8889
8890         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8891                                         reginfo))
8892         {
8893             if (c1 == CHRTEST_VOID) {
8894                 /* Use full Unicode fold matching */
8895                 char *tmpeol = reginfo->strend;
8896                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8897                 while (hardcount < max
8898                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8899                                              STRING(p), NULL, pat_len,
8900                                              reginfo->is_utf8_pat, utf8_flags))
8901                 {
8902                     scan = tmpeol;
8903                     tmpeol = reginfo->strend;
8904                     hardcount++;
8905                 }
8906             }
8907             else if (utf8_target) {
8908                 if (c1 == c2) {
8909                     while (scan < loceol
8910                            && hardcount < max
8911                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8912                     {
8913                         scan += UTF8SKIP(scan);
8914                         hardcount++;
8915                     }
8916                 }
8917                 else {
8918                     while (scan < loceol
8919                            && hardcount < max
8920                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8921                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8922                     {
8923                         scan += UTF8SKIP(scan);
8924                         hardcount++;
8925                     }
8926                 }
8927             }
8928             else if (c1 == c2) {
8929                 while (scan < loceol && UCHARAT(scan) == c1) {
8930                     scan++;
8931                 }
8932             }
8933             else {
8934                 while (scan < loceol &&
8935                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8936                 {
8937                     scan++;
8938                 }
8939             }
8940         }
8941         break;
8942     }
8943     case ANYOFL:
8944         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8945
8946         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
8947             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8948         }
8949         /* FALLTHROUGH */
8950     case ANYOFD:
8951     case ANYOF:
8952         if (utf8_target) {
8953             while (hardcount < max
8954                    && scan < loceol
8955                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8956             {
8957                 scan += UTF8SKIP(scan);
8958                 hardcount++;
8959             }
8960         }
8961         else if (ANYOF_FLAGS(p)) {
8962             while (scan < loceol
8963                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
8964                 scan++;
8965         }
8966         else {
8967             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
8968                 scan++;
8969         }
8970         break;
8971
8972     /* The argument (FLAGS) to all the POSIX node types is the class number */
8973
8974     case NPOSIXL:
8975         to_complement = 1;
8976         /* FALLTHROUGH */
8977
8978     case POSIXL:
8979         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8980         if (! utf8_target) {
8981             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8982                                                                    *scan)))
8983             {
8984                 scan++;
8985             }
8986         } else {
8987             while (hardcount < max && scan < loceol
8988                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8989                                                                   (U8 *) scan)))
8990             {
8991                 scan += UTF8SKIP(scan);
8992                 hardcount++;
8993             }
8994         }
8995         break;
8996
8997     case POSIXD:
8998         if (utf8_target) {
8999             goto utf8_posix;
9000         }
9001         /* FALLTHROUGH */
9002
9003     case POSIXA:
9004         if (utf8_target && loceol - scan > max) {
9005
9006             /* We didn't adjust <loceol> at the beginning of this routine
9007              * because is UTF-8, but it is actually ok to do so, since here, to
9008              * match, 1 char == 1 byte. */
9009             loceol = scan + max;
9010         }
9011         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9012             scan++;
9013         }
9014         break;
9015
9016     case NPOSIXD:
9017         if (utf8_target) {
9018             to_complement = 1;
9019             goto utf8_posix;
9020         }
9021         /* FALLTHROUGH */
9022
9023     case NPOSIXA:
9024         if (! utf8_target) {
9025             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9026                 scan++;
9027             }
9028         }
9029         else {
9030
9031             /* The complement of something that matches only ASCII matches all
9032              * non-ASCII, plus everything in ASCII that isn't in the class. */
9033             while (hardcount < max && scan < loceol
9034                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9035                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9036             {
9037                 scan += UTF8SKIP(scan);
9038                 hardcount++;
9039             }
9040         }
9041         break;
9042
9043     case NPOSIXU:
9044         to_complement = 1;
9045         /* FALLTHROUGH */
9046
9047     case POSIXU:
9048         if (! utf8_target) {
9049             while (scan < loceol && to_complement
9050                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9051             {
9052                 scan++;
9053             }
9054         }
9055         else {
9056           utf8_posix:
9057             classnum = (_char_class_number) FLAGS(p);
9058             if (classnum < _FIRST_NON_SWASH_CC) {
9059
9060                 /* Here, a swash is needed for above-Latin1 code points.
9061                  * Process as many Latin1 code points using the built-in rules.
9062                  * Go to another loop to finish processing upon encountering
9063                  * the first Latin1 code point.  We could do that in this loop
9064                  * as well, but the other way saves having to test if the swash
9065                  * has been loaded every time through the loop: extra space to
9066                  * save a test. */
9067                 while (hardcount < max && scan < loceol) {
9068                     if (UTF8_IS_INVARIANT(*scan)) {
9069                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9070                                                                    classnum))))
9071                         {
9072                             break;
9073                         }
9074                         scan++;
9075                     }
9076                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9077                         if (! (to_complement
9078                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9079                                                                      *(scan + 1)),
9080                                                     classnum))))
9081                         {
9082                             break;
9083                         }
9084                         scan += 2;
9085                     }
9086                     else {
9087                         goto found_above_latin1;
9088                     }
9089
9090                     hardcount++;
9091                 }
9092             }
9093             else {
9094                 /* For these character classes, the knowledge of how to handle
9095                  * every code point is compiled in to Perl via a macro.  This
9096                  * code is written for making the loops as tight as possible.
9097                  * It could be refactored to save space instead */
9098                 switch (classnum) {
9099                     case _CC_ENUM_SPACE:
9100                         while (hardcount < max
9101                                && scan < loceol
9102                                && (to_complement
9103                                    ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9104                         {
9105                             scan += UTF8SKIP(scan);
9106                             hardcount++;
9107                         }
9108                         break;
9109                     case _CC_ENUM_BLANK:
9110                         while (hardcount < max
9111                                && scan < loceol
9112                                && (to_complement
9113                                     ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9114                         {
9115                             scan += UTF8SKIP(scan);
9116                             hardcount++;
9117                         }
9118                         break;
9119                     case _CC_ENUM_XDIGIT:
9120                         while (hardcount < max
9121                                && scan < loceol
9122                                && (to_complement
9123                                    ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9124                         {
9125                             scan += UTF8SKIP(scan);
9126                             hardcount++;
9127                         }
9128                         break;
9129                     case _CC_ENUM_VERTSPACE:
9130                         while (hardcount < max
9131                                && scan < loceol
9132                                && (to_complement
9133                                    ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9134                         {
9135                             scan += UTF8SKIP(scan);
9136                             hardcount++;
9137                         }
9138                         break;
9139                     case _CC_ENUM_CNTRL:
9140                         while (hardcount < max
9141                                && scan < loceol
9142                                && (to_complement
9143                                    ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9144                         {
9145                             scan += UTF8SKIP(scan);
9146                             hardcount++;
9147                         }
9148                         break;
9149                     default:
9150                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9151                 }
9152             }
9153         }
9154         break;
9155
9156       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
9157
9158         /* Load the swash if not already present */
9159         if (! PL_utf8_swash_ptrs[classnum]) {
9160             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9161             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9162                                         "utf8",
9163                                         "",
9164                                         &PL_sv_undef, 1, 0,
9165                                         PL_XPosix_ptrs[classnum], &flags);
9166         }
9167
9168         while (hardcount < max && scan < loceol
9169                && to_complement ^ cBOOL(_generic_utf8_safe(
9170                                        classnum,
9171                                        scan,
9172                                        loceol,
9173                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
9174                                                    (U8 *) scan,
9175                                                    TRUE))))
9176         {
9177             scan += UTF8SKIP(scan);
9178             hardcount++;
9179         }
9180         break;
9181
9182     case LNBREAK:
9183         if (utf8_target) {
9184             while (hardcount < max && scan < loceol &&
9185                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9186                 scan += c;
9187                 hardcount++;
9188             }
9189         } else {
9190             /* LNBREAK can match one or two latin chars, which is ok, but we
9191              * have to use hardcount in this situation, and throw away the
9192              * adjustment to <loceol> done before the switch statement */
9193             loceol = reginfo->strend;
9194             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9195                 scan+=c;
9196                 hardcount++;
9197             }
9198         }
9199         break;
9200
9201     case BOUNDL:
9202     case NBOUNDL:
9203         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9204         /* FALLTHROUGH */
9205     case BOUND:
9206     case BOUNDA:
9207     case BOUNDU:
9208     case EOS:
9209     case GPOS:
9210     case KEEPS:
9211     case NBOUND:
9212     case NBOUNDA:
9213     case NBOUNDU:
9214     case OPFAIL:
9215     case SBOL:
9216     case SEOL:
9217         /* These are all 0 width, so match right here or not at all. */
9218         break;
9219
9220     default:
9221         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9222         NOT_REACHED; /* NOTREACHED */
9223
9224     }
9225
9226     if (hardcount)
9227         c = hardcount;
9228     else
9229         c = scan - *startposp;
9230     *startposp = scan;
9231
9232     DEBUG_r({
9233         GET_RE_DEBUG_FLAGS_DECL;
9234         DEBUG_EXECUTE_r({
9235             SV * const prop = sv_newmortal();
9236             regprop(prog, prop, p, reginfo, NULL);
9237             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9238                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9239         });
9240     });
9241
9242     return(c);
9243 }
9244
9245
9246 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9247 /*
9248 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9249 create a copy so that changes the caller makes won't change the shared one.
9250 If <altsvp> is non-null, will return NULL in it, for back-compat.
9251  */
9252 SV *
9253 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9254 {
9255     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9256
9257     if (altsvp) {
9258         *altsvp = NULL;
9259     }
9260
9261     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9262 }
9263
9264 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9265
9266 /*
9267  - reginclass - determine if a character falls into a character class
9268
9269   n is the ANYOF-type regnode
9270   p is the target string
9271   p_end points to one byte beyond the end of the target string
9272   utf8_target tells whether p is in UTF-8.
9273
9274   Returns true if matched; false otherwise.
9275
9276   Note that this can be a synthetic start class, a combination of various
9277   nodes, so things you think might be mutually exclusive, such as locale,
9278   aren't.  It can match both locale and non-locale
9279
9280  */
9281
9282 STATIC bool
9283 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9284 {
9285     dVAR;
9286     const char flags = ANYOF_FLAGS(n);
9287     bool match = FALSE;
9288     UV c = *p;
9289
9290     PERL_ARGS_ASSERT_REGINCLASS;
9291
9292     /* If c is not already the code point, get it.  Note that
9293      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9294     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9295         STRLEN c_len = 0;
9296         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9297         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9298         if (c_len == (STRLEN)-1) {
9299             _force_out_malformed_utf8_message(p, p_end,
9300                                               utf8n_flags,
9301                                               1 /* 1 means die */ );
9302             NOT_REACHED; /* NOTREACHED */
9303         }
9304         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9305             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9306         }
9307     }
9308
9309     /* If this character is potentially in the bitmap, check it */
9310     if (c < NUM_ANYOF_CODE_POINTS) {
9311         if (ANYOF_BITMAP_TEST(n, c))
9312             match = TRUE;
9313         else if ((flags
9314                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9315                   && OP(n) == ANYOFD
9316                   && ! utf8_target
9317                   && ! isASCII(c))
9318         {
9319             match = TRUE;
9320         }
9321         else if (flags & ANYOF_LOCALE_FLAGS) {
9322             if ((flags & ANYOFL_FOLD)
9323                 && c < 256
9324                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9325             {
9326                 match = TRUE;
9327             }
9328             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9329                      && c < 256
9330             ) {
9331
9332                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9333                  * if the class includes the Posix character class given by
9334                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9335                  * complemented Posix class given by int(bit/2).  So we loop
9336                  * through the bits, each time changing whether we complement
9337                  * the result or not.  Suppose for the sake of illustration
9338                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9339                  * is set, it means there is a match for this ANYOF node if the
9340                  * character is in the class given by the expression (0 / 2 = 0
9341                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9342                  * and since 'to_complement' is 0, the result will stay TRUE,
9343                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9344                  * bit 1 is 1.  That means there is a match if the character
9345                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9346                  * but will on bit 1.  On the second iteration 'to_complement'
9347                  * will be 1, so the exclusive or will reverse things, so we
9348                  * are testing for \W.  On the third iteration, 'to_complement'
9349                  * will be 0, and we would be testing for \s; the fourth
9350                  * iteration would test for \S, etc.
9351                  *
9352                  * Note that this code assumes that all the classes are closed
9353                  * under folding.  For example, if a character matches \w, then
9354                  * its fold does too; and vice versa.  This should be true for
9355                  * any well-behaved locale for all the currently defined Posix
9356                  * classes, except for :lower: and :upper:, which are handled
9357                  * by the pseudo-class :cased: which matches if either of the
9358                  * other two does.  To get rid of this assumption, an outer
9359                  * loop could be used below to iterate over both the source
9360                  * character, and its fold (if different) */
9361
9362                 int count = 0;
9363                 int to_complement = 0;
9364
9365                 while (count < ANYOF_MAX) {
9366                     if (ANYOF_POSIXL_TEST(n, count)
9367                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9368                     {
9369                         match = TRUE;
9370                         break;
9371                     }
9372                     count++;
9373                     to_complement ^= 1;
9374                 }
9375             }
9376         }
9377     }
9378
9379
9380     /* If the bitmap didn't (or couldn't) match, and something outside the
9381      * bitmap could match, try that. */
9382     if (!match) {
9383         if (c >= NUM_ANYOF_CODE_POINTS
9384             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9385         {
9386             match = TRUE;       /* Everything above the bitmap matches */
9387         }
9388             /* Here doesn't match everything above the bitmap.  If there is
9389              * some information available beyond the bitmap, we may find a
9390              * match in it.  If so, this is most likely because the code point
9391              * is outside the bitmap range.  But rarely, it could be because of
9392              * some other reason.  If so, various flags are set to indicate
9393              * this possibility.  On ANYOFD nodes, there may be matches that
9394              * happen only when the target string is UTF-8; or for other node
9395              * types, because runtime lookup is needed, regardless of the
9396              * UTF-8ness of the target string.  Finally, under /il, there may
9397              * be some matches only possible if the locale is a UTF-8 one. */
9398         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9399                  && (   c >= NUM_ANYOF_CODE_POINTS
9400                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9401                          && (   UNLIKELY(OP(n) != ANYOFD)
9402                              || (utf8_target && ! isASCII_uni(c)
9403 #                               if NUM_ANYOF_CODE_POINTS > 256
9404                                                                  && c < 256
9405 #                               endif
9406                                 )))
9407                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9408                          && IN_UTF8_CTYPE_LOCALE)))
9409         {
9410             SV* only_utf8_locale = NULL;
9411             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9412                                                        &only_utf8_locale, NULL);
9413             if (sw) {
9414                 U8 utf8_buffer[2];
9415                 U8 * utf8_p;
9416                 if (utf8_target) {
9417                     utf8_p = (U8 *) p;
9418                 } else { /* Convert to utf8 */
9419                     utf8_p = utf8_buffer;
9420                     append_utf8_from_native_byte(*p, &utf8_p);
9421                     utf8_p = utf8_buffer;
9422                 }
9423
9424                 if (swash_fetch(sw, utf8_p, TRUE)) {
9425                     match = TRUE;
9426                 }
9427             }
9428             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9429                 match = _invlist_contains_cp(only_utf8_locale, c);
9430             }
9431         }
9432
9433         if (UNICODE_IS_SUPER(c)
9434             && (flags
9435                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9436             && OP(n) != ANYOFD
9437             && ckWARN_d(WARN_NON_UNICODE))
9438         {
9439             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9440                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9441         }
9442     }
9443
9444 #if ANYOF_INVERT != 1
9445     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9446      * use it, don't */
9447 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9448 #endif
9449
9450     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9451     return (flags & ANYOF_INVERT) ^ match;
9452 }
9453
9454 STATIC U8 *
9455 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9456 {
9457     /* return the position 'off' UTF-8 characters away from 's', forward if
9458      * 'off' >= 0, backwards if negative.  But don't go outside of position
9459      * 'lim', which better be < s  if off < 0 */
9460
9461     PERL_ARGS_ASSERT_REGHOP3;
9462
9463     if (off >= 0) {
9464         while (off-- && s < lim) {
9465             /* XXX could check well-formedness here */
9466             s += UTF8SKIP(s);
9467         }
9468     }
9469     else {
9470         while (off++ && s > lim) {
9471             s--;
9472             if (UTF8_IS_CONTINUED(*s)) {
9473                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9474                     s--;
9475                 if (! UTF8_IS_START(*s)) {
9476                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9477                 }
9478             }
9479             /* XXX could check well-formedness here */
9480         }
9481     }
9482     return s;
9483 }
9484
9485 STATIC U8 *
9486 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9487 {
9488     PERL_ARGS_ASSERT_REGHOP4;
9489
9490     if (off >= 0) {
9491         while (off-- && s < rlim) {
9492             /* XXX could check well-formedness here */
9493             s += UTF8SKIP(s);
9494         }
9495     }
9496     else {
9497         while (off++ && s > llim) {
9498             s--;
9499             if (UTF8_IS_CONTINUED(*s)) {
9500                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9501                     s--;
9502                 if (! UTF8_IS_START(*s)) {
9503                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9504                 }
9505             }
9506             /* XXX could check well-formedness here */
9507         }
9508     }
9509     return s;
9510 }
9511
9512 /* like reghop3, but returns NULL on overrun, rather than returning last
9513  * char pos */
9514
9515 STATIC U8 *
9516 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9517 {
9518     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9519
9520     if (off >= 0) {
9521         while (off-- && s < lim) {
9522             /* XXX could check well-formedness here */
9523             s += UTF8SKIP(s);
9524         }
9525         if (off >= 0)
9526             return NULL;
9527     }
9528     else {
9529         while (off++ && s > lim) {
9530             s--;
9531             if (UTF8_IS_CONTINUED(*s)) {
9532                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9533                     s--;
9534                 if (! UTF8_IS_START(*s)) {
9535                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9536                 }
9537             }
9538             /* XXX could check well-formedness here */
9539         }
9540         if (off <= 0)
9541             return NULL;
9542     }
9543     return s;
9544 }
9545
9546
9547 /* when executing a regex that may have (?{}), extra stuff needs setting
9548    up that will be visible to the called code, even before the current
9549    match has finished. In particular:
9550
9551    * $_ is localised to the SV currently being matched;
9552    * pos($_) is created if necessary, ready to be updated on each call-out
9553      to code;
9554    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9555      isn't set until the current pattern is successfully finished), so that
9556      $1 etc of the match-so-far can be seen;
9557    * save the old values of subbeg etc of the current regex, and  set then
9558      to the current string (again, this is normally only done at the end
9559      of execution)
9560 */
9561
9562 static void
9563 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9564 {
9565     MAGIC *mg;
9566     regexp *const rex = ReANY(reginfo->prog);
9567     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9568
9569     eval_state->rex = rex;
9570
9571     if (reginfo->sv) {
9572         /* Make $_ available to executed code. */
9573         if (reginfo->sv != DEFSV) {
9574             SAVE_DEFSV;
9575             DEFSV_set(reginfo->sv);
9576         }
9577
9578         if (!(mg = mg_find_mglob(reginfo->sv))) {
9579             /* prepare for quick setting of pos */
9580             mg = sv_magicext_mglob(reginfo->sv);
9581             mg->mg_len = -1;
9582         }
9583         eval_state->pos_magic = mg;
9584         eval_state->pos       = mg->mg_len;
9585         eval_state->pos_flags = mg->mg_flags;
9586     }
9587     else
9588         eval_state->pos_magic = NULL;
9589
9590     if (!PL_reg_curpm) {
9591         /* PL_reg_curpm is a fake PMOP that we can attach the current
9592          * regex to and point PL_curpm at, so that $1 et al are visible
9593          * within a /(?{})/. It's just allocated once per interpreter the
9594          * first time its needed */
9595         Newxz(PL_reg_curpm, 1, PMOP);
9596 #ifdef USE_ITHREADS
9597         {
9598             SV* const repointer = &PL_sv_undef;
9599             /* this regexp is also owned by the new PL_reg_curpm, which
9600                will try to free it.  */
9601             av_push(PL_regex_padav, repointer);
9602             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9603             PL_regex_pad = AvARRAY(PL_regex_padav);
9604         }
9605 #endif
9606     }
9607     SET_reg_curpm(reginfo->prog);
9608     eval_state->curpm = PL_curpm;
9609     PL_curpm_under = PL_curpm;
9610     PL_curpm = PL_reg_curpm;
9611     if (RXp_MATCH_COPIED(rex)) {
9612         /*  Here is a serious problem: we cannot rewrite subbeg,
9613             since it may be needed if this match fails.  Thus
9614             $` inside (?{}) could fail... */
9615         eval_state->subbeg     = rex->subbeg;
9616         eval_state->sublen     = rex->sublen;
9617         eval_state->suboffset  = rex->suboffset;
9618         eval_state->subcoffset = rex->subcoffset;
9619 #ifdef PERL_ANY_COW
9620         eval_state->saved_copy = rex->saved_copy;
9621 #endif
9622         RXp_MATCH_COPIED_off(rex);
9623     }
9624     else
9625         eval_state->subbeg = NULL;
9626     rex->subbeg = (char *)reginfo->strbeg;
9627     rex->suboffset = 0;
9628     rex->subcoffset = 0;
9629     rex->sublen = reginfo->strend - reginfo->strbeg;
9630 }
9631
9632
9633 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
9634
9635 static void
9636 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
9637 {
9638     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9639     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
9640     regmatch_slab *s;
9641
9642     Safefree(aux->poscache);
9643
9644     if (eval_state) {
9645
9646         /* undo the effects of S_setup_eval_state() */
9647
9648         if (eval_state->subbeg) {
9649             regexp * const rex = eval_state->rex;
9650             rex->subbeg     = eval_state->subbeg;
9651             rex->sublen     = eval_state->sublen;
9652             rex->suboffset  = eval_state->suboffset;
9653             rex->subcoffset = eval_state->subcoffset;
9654 #ifdef PERL_ANY_COW
9655             rex->saved_copy = eval_state->saved_copy;
9656 #endif
9657             RXp_MATCH_COPIED_on(rex);
9658         }
9659         if (eval_state->pos_magic)
9660         {
9661             eval_state->pos_magic->mg_len = eval_state->pos;
9662             eval_state->pos_magic->mg_flags =
9663                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9664                | (eval_state->pos_flags & MGf_BYTES);
9665         }
9666
9667         PL_curpm = eval_state->curpm;
9668     }
9669
9670     PL_regmatch_state = aux->old_regmatch_state;
9671     PL_regmatch_slab  = aux->old_regmatch_slab;
9672
9673     /* free all slabs above current one - this must be the last action
9674      * of this function, as aux and eval_state are allocated within
9675      * slabs and may be freed here */
9676
9677     s = PL_regmatch_slab->next;
9678     if (s) {
9679         PL_regmatch_slab->next = NULL;
9680         while (s) {
9681             regmatch_slab * const osl = s;
9682             s = s->next;
9683             Safefree(osl);
9684         }
9685     }
9686 }
9687
9688
9689 STATIC void
9690 S_to_utf8_substr(pTHX_ regexp *prog)
9691 {
9692     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9693      * on the converted value */
9694
9695     int i = 1;
9696
9697     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9698
9699     do {
9700         if (prog->substrs->data[i].substr
9701             && !prog->substrs->data[i].utf8_substr) {
9702             SV* const sv = newSVsv(prog->substrs->data[i].substr);
9703             prog->substrs->data[i].utf8_substr = sv;
9704             sv_utf8_upgrade(sv);
9705             if (SvVALID(prog->substrs->data[i].substr)) {
9706                 if (SvTAIL(prog->substrs->data[i].substr)) {
9707                     /* Trim the trailing \n that fbm_compile added last
9708                        time.  */
9709                     SvCUR_set(sv, SvCUR(sv) - 1);
9710                     /* Whilst this makes the SV technically "invalid" (as its
9711                        buffer is no longer followed by "\0") when fbm_compile()
9712                        adds the "\n" back, a "\0" is restored.  */
9713                     fbm_compile(sv, FBMcf_TAIL);
9714                 } else
9715                     fbm_compile(sv, 0);
9716             }
9717             if (prog->substrs->data[i].substr == prog->check_substr)
9718                 prog->check_utf8 = sv;
9719         }
9720     } while (i--);
9721 }
9722
9723 STATIC bool
9724 S_to_byte_substr(pTHX_ regexp *prog)
9725 {
9726     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9727      * on the converted value; returns FALSE if can't be converted. */
9728
9729     int i = 1;
9730
9731     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9732
9733     do {
9734         if (prog->substrs->data[i].utf8_substr
9735             && !prog->substrs->data[i].substr) {
9736             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9737             if (! sv_utf8_downgrade(sv, TRUE)) {
9738                 return FALSE;
9739             }
9740             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9741                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9742                     /* Trim the trailing \n that fbm_compile added last
9743                         time.  */
9744                     SvCUR_set(sv, SvCUR(sv) - 1);
9745                     fbm_compile(sv, FBMcf_TAIL);
9746                 } else
9747                     fbm_compile(sv, 0);
9748             }
9749             prog->substrs->data[i].substr = sv;
9750             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9751                 prog->check_substr = sv;
9752         }
9753     } while (i--);
9754
9755     return TRUE;
9756 }
9757
9758 #ifndef PERL_IN_XSUB_RE
9759
9760 bool
9761 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
9762 {
9763     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
9764      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
9765      * the larger string bounded by 'strbeg' and 'strend'.
9766      *
9767      * 'cp' needs to be assigned (if not a future version of the Unicode
9768      * Standard could make it something that combines with adjacent characters,
9769      * so code using it would then break), and there has to be a GCB break
9770      * before and after the character. */
9771
9772     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
9773     const U8 * prev_cp_start;
9774
9775     PERL_ARGS_ASSERT__IS_GRAPHEME;
9776
9777     /* Unassigned code points are forbidden */
9778     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
9779                                     _invlist_search(PL_Assigned_invlist, cp))))
9780     {
9781         return FALSE;
9782     }
9783
9784     cp_gcb_val = getGCB_VAL_CP(cp);
9785
9786     /* Find the GCB value of the previous code point in the input */
9787     prev_cp_start = utf8_hop_back(s, -1, strbeg);
9788     if (UNLIKELY(prev_cp_start == s)) {
9789         prev_cp_gcb_val = GCB_EDGE;
9790     }
9791     else {
9792         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
9793     }
9794
9795     /* And check that is a grapheme boundary */
9796     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
9797                 TRUE /* is UTF-8 encoded */ ))
9798     {
9799         return FALSE;
9800     }
9801
9802     /* Similarly verify there is a break between the current character and the
9803      * following one */
9804     s += UTF8SKIP(s);
9805     if (s >= strend) {
9806         next_cp_gcb_val = GCB_EDGE;
9807     }
9808     else {
9809         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
9810     }
9811
9812     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
9813 }
9814
9815 #endif
9816
9817
9818
9819 /*
9820  * ex: set ts=8 sts=4 sw=4 et:
9821  */