regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 /* Returns a boolean as to whether the input unsigned number is a power of 2
 100  * (2**0, 2**1, etc).  In other words if it has just a single bit set.
 101  * If not, subtracting 1 would leave the uppermost bit set, so the & would
 102  * yield non-zero */
 103 #define isPOWER_OF_2(n) ((n & (n-1)) == 0)
 104
 105 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 106     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 107     goto target;                                                         \
 108 } STMT_END
 109
 110 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 111
 112 #ifndef STATIC
 113 #define STATIC  static
 114 #endif
 115
 116 /* Valid only if 'c', the character being looke-up, is an invariant under
 117  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 118  * everything matchable is straight forward in the bitmap */
 119 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 120                                 ? reginclass(prog,p,c,c+1,u)                \
 121                                 : ANYOF_BITMAP_TEST(p,*(c)))
 122
 123 /*
 124  * Forwards.
 125  */
 126
 127 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 128
 129 #define HOPc(pos,off) \
 130         (char *)(reginfo->is_utf8_target \
 131             ? reghop3((U8*)pos, off, \
 132                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 133             : (U8*)(pos + off))
 134
 135 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 136 #define HOPBACK3(pos, off, lim) \
 137         (reginfo->is_utf8_target                          \
 138             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 139             : (pos - off >= lim)                                 \
 140                 ? (U8*)pos - off                                 \
 141                 : NULL)
 142
 143 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 144
 145 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 146 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 147
 148 /* lim must be +ve. Returns NULL on overshoot */
 149 #define HOPMAYBE3(pos,off,lim) \
 150         (reginfo->is_utf8_target                        \
 151             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 152             : ((U8*)pos + off <= lim)                   \
 153                 ? (U8*)pos + off                        \
 154                 : NULL)
 155
 156 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 157  * off must be >=0; args should be vars rather than expressions */
 158 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 159     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 160     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 161 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 162
 163 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 164     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 165     : (U8*)(pos + off))
 166 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 167
 168 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 169 #define NEXTCHR_IS_EOS (nextchr < 0)
 170
 171 #define SET_nextchr \
 172     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 173
 174 #define SET_locinput(p) \
 175     locinput = (p);  \
 176     SET_nextchr
 177
 178
 179 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 180         if (!swash_ptr) {                                                     \
 181             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 182             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 183                                          1, 0, invlist, &flags);              \
 184             assert(swash_ptr);                                                \
 185         }                                                                     \
 186     } STMT_END
 187
 188 /* If in debug mode, we test that a known character properly matches */
 189 #ifdef DEBUGGING
 190 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 191                                           property_name,                      \
 192                                           invlist,                            \
 193                                           utf8_char_in_property)              \
 194         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 195         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 196 #else
 197 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 198                                           property_name,                      \
 199                                           invlist,                            \
 200                                           utf8_char_in_property)              \
 201         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 202 #endif
 203
 204 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 205                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 206                                         "",                                   \
 207                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 208                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 209
 210 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 211 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 212
 213 /* for use after a quantifier and before an EXACT-like node -- japhy */
 214 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 215  *
 216  * NOTE that *nothing* that affects backtracking should be in here, specifically
 217  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 218  * node that is in between two EXACT like nodes when ascertaining what the required
 219  * "follow" character is. This should probably be moved to regex compile time
 220  * although it may be done at run time beause of the REF possibility - more
 221  * investigation required. -- demerphq
 222 */
 223 #define JUMPABLE(rn) (                                                             \
 224     OP(rn) == OPEN ||                                                              \
 225     (OP(rn) == CLOSE &&                                                            \
 226      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 227     OP(rn) == EVAL ||                                                              \
 228     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 229     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 230     OP(rn) == KEEPS ||                                                             \
 231     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 232 )
 233 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 234
 235 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 236
 237 #if 0
 238 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 239    we don't need this definition.  XXX These are now out-of-sync*/
 240 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 241 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 242 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 243
 244 #else
 245 /* ... so we use this as its faster. */
 246 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 247 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 248 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 249 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 250
 251 #endif
 252
 253 /*
 254   Search for mandatory following text node; for lookahead, the text must
 255   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 256 */
 257 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 258     while (JUMPABLE(rn)) { \
 259         const OPCODE type = OP(rn); \
 260         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 261             rn = NEXTOPER(NEXTOPER(rn)); \
 262         else if (type == PLUS) \
 263             rn = NEXTOPER(rn); \
 264         else if (type == IFMATCH) \
 265             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 266         else rn += NEXT_OFF(rn); \
 267     } \
 268 } STMT_END
 269
 270 #define SLAB_FIRST(s) (&(s)->states[0])
 271 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 272
 273 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 274 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 275 static regmatch_state * S_push_slab(pTHX);
 276
 277 #define REGCP_PAREN_ELEMS 3
 278 #define REGCP_OTHER_ELEMS 3
 279 #define REGCP_FRAME_ELEMS 1
 280 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 281  * are needed for the regexp context stack bookkeeping. */
 282
 283 STATIC CHECKPOINT
 284 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 285 {
 286     const int retval = PL_savestack_ix;
 287     const int paren_elems_to_push =
 288                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 289     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 290     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 291     I32 p;
 292     GET_RE_DEBUG_FLAGS_DECL;
 293
 294     PERL_ARGS_ASSERT_REGCPPUSH;
 295
 296     if (paren_elems_to_push < 0)
 297         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 298                    (int)paren_elems_to_push, (int)maxopenparen,
 299                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 300
 301     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 302         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 303                    " out of range (%lu-%ld)",
 304                    total_elems,
 305                    (unsigned long)maxopenparen,
 306                    (long)parenfloor);
 307
 308     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 309
 310     DEBUG_BUFFERS_r(
 311         if ((int)maxopenparen > (int)parenfloor)
 312             Perl_re_exec_indentf( aTHX_
 313                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 314                 depth,
 315                 PTR2UV(rex),
 316                 PTR2UV(rex->offs)
 317             );
 318     );
 319     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 320 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 321         SSPUSHIV(rex->offs[p].end);
 322         SSPUSHIV(rex->offs[p].start);
 323         SSPUSHINT(rex->offs[p].start_tmp);
 324         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 325             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 326             depth,
 327             (UV)p,
 328             (IV)rex->offs[p].start,
 329             (IV)rex->offs[p].start_tmp,
 330             (IV)rex->offs[p].end
 331         ));
 332     }
 333 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 334     SSPUSHINT(maxopenparen);
 335     SSPUSHINT(rex->lastparen);
 336     SSPUSHINT(rex->lastcloseparen);
 337     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 338
 339     return retval;
 340 }
 341
 342 /* These are needed since we do not localize EVAL nodes: */
 343 #define REGCP_SET(cp)                                           \
 344     DEBUG_STATE_r(                                              \
 345         Perl_re_exec_indentf( aTHX_                             \
 346             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 347             depth, (IV)PL_savestack_ix                          \
 348         )                                                       \
 349     );                                                          \
 350     cp = PL_savestack_ix
 351
 352 #define REGCP_UNWIND(cp)                                        \
 353     DEBUG_STATE_r(                                              \
 354         if (cp != PL_savestack_ix)                              \
 355             Perl_re_exec_indentf( aTHX_                         \
 356                 "Clearing an EVAL scope, savestack=%"           \
 357                 IVdf "..%" IVdf "\n",                           \
 358                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 359             )                                                   \
 360     );                                                          \
 361     regcpblow(cp)
 362
 363 #define UNWIND_PAREN(lp, lcp)               \
 364     for (n = rex->lastparen; n > lp; n--)   \
 365         rex->offs[n].end = -1;              \
 366     rex->lastparen = n;                     \
 367     rex->lastcloseparen = lcp;
 368
 369
 370 STATIC void
 371 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 372 {
 373     UV i;
 374     U32 paren;
 375     GET_RE_DEBUG_FLAGS_DECL;
 376
 377     PERL_ARGS_ASSERT_REGCPPOP;
 378
 379     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 380     i = SSPOPUV;
 381     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 382     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 383     rex->lastcloseparen = SSPOPINT;
 384     rex->lastparen = SSPOPINT;
 385     *maxopenparen_p = SSPOPINT;
 386
 387     i -= REGCP_OTHER_ELEMS;
 388     /* Now restore the parentheses context. */
 389     DEBUG_BUFFERS_r(
 390         if (i || rex->lastparen + 1 <= rex->nparens)
 391             Perl_re_exec_indentf( aTHX_
 392                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 393                 depth,
 394                 PTR2UV(rex),
 395                 PTR2UV(rex->offs)
 396             );
 397     );
 398     paren = *maxopenparen_p;
 399     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 400         SSize_t tmps;
 401         rex->offs[paren].start_tmp = SSPOPINT;
 402         rex->offs[paren].start = SSPOPIV;
 403         tmps = SSPOPIV;
 404         if (paren <= rex->lastparen)
 405             rex->offs[paren].end = tmps;
 406         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 407             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 408             depth,
 409             (UV)paren,
 410             (IV)rex->offs[paren].start,
 411             (IV)rex->offs[paren].start_tmp,
 412             (IV)rex->offs[paren].end,
 413             (paren > rex->lastparen ? "(skipped)" : ""));
 414         );
 415         paren--;
 416     }
 417 #if 1
 418     /* It would seem that the similar code in regtry()
 419      * already takes care of this, and in fact it is in
 420      * a better location to since this code can #if 0-ed out
 421      * but the code in regtry() is needed or otherwise tests
 422      * requiring null fields (pat.t#187 and split.t#{13,14}
 423      * (as of patchlevel 7877)  will fail.  Then again,
 424      * this code seems to be necessary or otherwise
 425      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 426      * --jhi updated by dapm */
 427     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 428         if (i > *maxopenparen_p)
 429             rex->offs[i].start = -1;
 430         rex->offs[i].end = -1;
 431         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 432             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 433             depth,
 434             (UV)i,
 435             (i > *maxopenparen_p) ? "-1" : "  "
 436         ));
 437     }
 438 #endif
 439 }
 440
 441 /* restore the parens and associated vars at savestack position ix,
 442  * but without popping the stack */
 443
 444 STATIC void
 445 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 446 {
 447     I32 tmpix = PL_savestack_ix;
 448     PERL_ARGS_ASSERT_REGCP_RESTORE;
 449
 450     PL_savestack_ix = ix;
 451     regcppop(rex, maxopenparen_p);
 452     PL_savestack_ix = tmpix;
 453 }
 454
 455 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 456
 457 #ifndef PERL_IN_XSUB_RE
 458
 459 bool
 460 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 461 {
 462     /* Returns a boolean as to whether or not 'character' is a member of the
 463      * Posix character class given by 'classnum' that should be equivalent to a
 464      * value in the typedef '_char_class_number'.
 465      *
 466      * Ideally this could be replaced by a just an array of function pointers
 467      * to the C library functions that implement the macros this calls.
 468      * However, to compile, the precise function signatures are required, and
 469      * these may vary from platform to to platform.  To avoid having to figure
 470      * out what those all are on each platform, I (khw) am using this method,
 471      * which adds an extra layer of function call overhead (unless the C
 472      * optimizer strips it away).  But we don't particularly care about
 473      * performance with locales anyway. */
 474
 475     switch ((_char_class_number) classnum) {
 476         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 477         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 478         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 479         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 480         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 481                                         || isUPPER_LC(character);
 482         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 483         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 484         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 485         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 486         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 487         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 488         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 489         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 490         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 491         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 492         default:    /* VERTSPACE should never occur in locales */
 493             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 494     }
 495
 496     NOT_REACHED; /* NOTREACHED */
 497     return FALSE;
 498 }
 499
 500 #endif
 501
 502 STATIC bool
 503 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 504 {
 505     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 506      * 'character' is a member of the Posix character class given by 'classnum'
 507      * that should be equivalent to a value in the typedef
 508      * '_char_class_number'.
 509      *
 510      * This just calls isFOO_lc on the code point for the character if it is in
 511      * the range 0-255.  Outside that range, all characters use Unicode
 512      * rules, ignoring any locale.  So use the Unicode function if this class
 513      * requires a swash, and use the Unicode macro otherwise. */
 514
 515     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 516
 517     if (UTF8_IS_INVARIANT(*character)) {
 518         return isFOO_lc(classnum, *character);
 519     }
 520     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 521         return isFOO_lc(classnum,
 522                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 523     }
 524
 525     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 526
 527     if (classnum < _FIRST_NON_SWASH_CC) {
 528
 529         /* Initialize the swash unless done already */
 530         if (! PL_utf8_swash_ptrs[classnum]) {
 531             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 532             PL_utf8_swash_ptrs[classnum] =
 533                     _core_swash_init("utf8",
 534                                      "",
 535                                      &PL_sv_undef, 1, 0,
 536                                      PL_XPosix_ptrs[classnum], &flags);
 537         }
 538
 539         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 540                                  character,
 541                                  TRUE /* is UTF */ ));
 542     }
 543
 544     switch ((_char_class_number) classnum) {
 545         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 546         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 547         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 548         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 549         default:                 break;
 550     }
 551
 552     return FALSE; /* Things like CNTRL are always below 256 */
 553 }
 554
 555 STATIC char *
 556 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
 557 {
 558     /* Returns the position of the first ASCII byte in the sequence between 's'
 559      * and 'send-1' inclusive; returns 'send' if none found */
 560
 561     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 562
 563 #ifndef EBCDIC
 564
 565     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 566
 567                             /* This term is wordsize if subword; 0 if not */
 568                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 569
 570                             /* 'offset' */
 571                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 572     {
 573
 574         /* Process per-byte until reach word boundary.  XXX This loop could be
 575          * eliminated if we knew that this platform had fast unaligned reads */
 576         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 577             if (isASCII(*s)) {
 578                 return s;
 579             }
 580             s++;    /* khw didn't bother creating a separate loop for
 581                        utf8_target */
 582         }
 583
 584         /* Here, we know we have at least one full word to process.  Process
 585          * per-word as long as we have at least a full word left */
 586         do {
 587             PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
 588             if (complemented & PERL_VARIANTS_WORD_MASK)  {
 589
 590 #if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 591    || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 592
 593                 s += _variant_byte_number(complemented);
 594                 return s;
 595
 596 #else   /* If weird byte order, drop into next loop to do byte-at-a-time
 597            checks. */
 598
 599                 break;
 600 #endif
 601             }
 602
 603             s += PERL_WORDSIZE;
 604
 605         } while (s + PERL_WORDSIZE <= send);
 606     }
 607
 608 #endif
 609
 610     /* Process per-character */
 611     if (utf8_target) {
 612         while (s < send) {
 613             if (isASCII(*s)) {
 614                 return s;
 615             }
 616             s += UTF8SKIP(s);
 617         }
 618     }
 619     else {
 620         while (s < send) {
 621             if (isASCII(*s)) {
 622                 return s;
 623             }
 624             s++;
 625         }
 626     }
 627
 628     return s;
 629 }
 630
 631 STATIC char *
 632 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
 633 {
 634     /* Returns the position of the first non-ASCII byte in the sequence between
 635      * 's' and 'send-1' inclusive; returns 'send' if none found */
 636
 637 #ifdef EBCDIC
 638
 639     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 640
 641     if (utf8_target) {
 642         while (s < send) {
 643             if ( ! isASCII(*s)) {
 644                 return s;
 645             }
 646             s += UTF8SKIP(s);
 647         }
 648     }
 649     else {
 650         while (s < send) {
 651             if ( ! isASCII(*s)) {
 652                 return s;
 653             }
 654             s++;
 655         }
 656     }
 657
 658     return s;
 659
 660 #else
 661
 662     const U8 * next_non_ascii = NULL;
 663
 664     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 665     PERL_UNUSED_ARG(utf8_target);
 666
 667     /* On ASCII platforms invariants and ASCII are identical, so if the string
 668      * is entirely invariants, there is no non-ASCII character */
 669     return (is_utf8_invariant_string_loc((U8 *) s,
 670                                          (STRLEN) (send - s),
 671                                          &next_non_ascii))
 672             ? (char *) send
 673             : (char *) next_non_ascii;
 674
 675 #endif
 676
 677 }
 678
 679 /*
 680  * pregexec and friends
 681  */
 682
 683 #ifndef PERL_IN_XSUB_RE
 684 /*
 685  - pregexec - match a regexp against a string
 686  */
 687 I32
 688 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 689          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 690 /* stringarg: the point in the string at which to begin matching */
 691 /* strend:    pointer to null at end of string */
 692 /* strbeg:    real beginning of string */
 693 /* minend:    end of match must be >= minend bytes after stringarg. */
 694 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 695  *            itself is accessed via the pointers above */
 696 /* nosave:    For optimizations. */
 697 {
 698     PERL_ARGS_ASSERT_PREGEXEC;
 699
 700     return
 701         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 702                       nosave ? 0 : REXEC_COPY_STR);
 703 }
 704 #endif
 705
 706
 707
 708 /* re_intuit_start():
 709  *
 710  * Based on some optimiser hints, try to find the earliest position in the
 711  * string where the regex could match.
 712  *
 713  *   rx:     the regex to match against
 714  *   sv:     the SV being matched: only used for utf8 flag; the string
 715  *           itself is accessed via the pointers below. Note that on
 716  *           something like an overloaded SV, SvPOK(sv) may be false
 717  *           and the string pointers may point to something unrelated to
 718  *           the SV itself.
 719  *   strbeg: real beginning of string
 720  *   strpos: the point in the string at which to begin matching
 721  *   strend: pointer to the byte following the last char of the string
 722  *   flags   currently unused; set to 0
 723  *   data:   currently unused; set to NULL
 724  *
 725  * The basic idea of re_intuit_start() is to use some known information
 726  * about the pattern, namely:
 727  *
 728  *   a) the longest known anchored substring (i.e. one that's at a
 729  *      constant offset from the beginning of the pattern; but not
 730  *      necessarily at a fixed offset from the beginning of the
 731  *      string);
 732  *   b) the longest floating substring (i.e. one that's not at a constant
 733  *      offset from the beginning of the pattern);
 734  *   c) Whether the pattern is anchored to the string; either
 735  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 736  *      or anchored to pos(): /\G/;
 737  *   d) A start class: a real or synthetic character class which
 738  *      represents which characters are legal at the start of the pattern;
 739  *
 740  * to either quickly reject the match, or to find the earliest position
 741  * within the string at which the pattern might match, thus avoiding
 742  * running the full NFA engine at those earlier locations, only to
 743  * eventually fail and retry further along.
 744  *
 745  * Returns NULL if the pattern can't match, or returns the address within
 746  * the string which is the earliest place the match could occur.
 747  *
 748  * The longest of the anchored and floating substrings is called 'check'
 749  * and is checked first. The other is called 'other' and is checked
 750  * second. The 'other' substring may not be present.  For example,
 751  *
 752  *    /(abc|xyz)ABC\d{0,3}DEFG/
 753  *
 754  * will have
 755  *
 756  *   check substr (float)    = "DEFG", offset 6..9 chars
 757  *   other substr (anchored) = "ABC",  offset 3..3 chars
 758  *   stclass = [ax]
 759  *
 760  * Be aware that during the course of this function, sometimes 'anchored'
 761  * refers to a substring being anchored relative to the start of the
 762  * pattern, and sometimes to the pattern itself being anchored relative to
 763  * the string. For example:
 764  *
 765  *   /\dabc/:   "abc" is anchored to the pattern;
 766  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 767  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 768  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 769  *                    but the pattern is anchored to the string.
 770  */
 771
 772 char *
 773 Perl_re_intuit_start(pTHX_
 774                     REGEXP * const rx,
 775                     SV *sv,
 776                     const char * const strbeg,
 777                     char *strpos,
 778                     char *strend,
 779                     const U32 flags,
 780                     re_scream_pos_data *data)
 781 {
 782     struct regexp *const prog = ReANY(rx);
 783     SSize_t start_shift = prog->check_offset_min;
 784     /* Should be nonnegative! */
 785     SSize_t end_shift   = 0;
 786     /* current lowest pos in string where the regex can start matching */
 787     char *rx_origin = strpos;
 788     SV *check;
 789     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 790     U8   other_ix = 1 - prog->substrs->check_ix;
 791     bool ml_anch = 0;
 792     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 793     char *check_at = NULL;              /* check substr found at this pos */
 794     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 795     RXi_GET_DECL(prog,progi);
 796     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 797     regmatch_info *const reginfo = &reginfo_buf;
 798     GET_RE_DEBUG_FLAGS_DECL;
 799
 800     PERL_ARGS_ASSERT_RE_INTUIT_START;
 801     PERL_UNUSED_ARG(flags);
 802     PERL_UNUSED_ARG(data);
 803
 804     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 805                 "Intuit: trying to determine minimum start position...\n"));
 806
 807     /* for now, assume that all substr offsets are positive. If at some point
 808      * in the future someone wants to do clever things with lookbehind and
 809      * -ve offsets, they'll need to fix up any code in this function
 810      * which uses these offsets. See the thread beginning
 811      * <20140113145929.GF27210@iabyn.com>
 812      */
 813     assert(prog->substrs->data[0].min_offset >= 0);
 814     assert(prog->substrs->data[0].max_offset >= 0);
 815     assert(prog->substrs->data[1].min_offset >= 0);
 816     assert(prog->substrs->data[1].max_offset >= 0);
 817     assert(prog->substrs->data[2].min_offset >= 0);
 818     assert(prog->substrs->data[2].max_offset >= 0);
 819
 820     /* for now, assume that if both present, that the floating substring
 821      * doesn't start before the anchored substring.
 822      * If you break this assumption (e.g. doing better optimisations
 823      * with lookahead/behind), then you'll need to audit the code in this
 824      * function carefully first
 825      */
 826     assert(
 827             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 828               && (prog->float_utf8    || prog->float_substr))
 829            || (prog->float_min_offset >= prog->anchored_offset));
 830
 831     /* byte rather than char calculation for efficiency. It fails
 832      * to quickly reject some cases that can't match, but will reject
 833      * them later after doing full char arithmetic */
 834     if (prog->minlen > strend - strpos) {
 835         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 836                               "  String too short...\n"));
 837         goto fail;
 838     }
 839
 840     RXp_MATCH_UTF8_set(prog, utf8_target);
 841     reginfo->is_utf8_target = cBOOL(utf8_target);
 842     reginfo->info_aux = NULL;
 843     reginfo->strbeg = strbeg;
 844     reginfo->strend = strend;
 845     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 846     reginfo->intuit = 1;
 847     /* not actually used within intuit, but zero for safety anyway */
 848     reginfo->poscache_maxiter = 0;
 849
 850     if (utf8_target) {
 851         if ((!prog->anchored_utf8 && prog->anchored_substr)
 852                 || (!prog->float_utf8 && prog->float_substr))
 853             to_utf8_substr(prog);
 854         check = prog->check_utf8;
 855     } else {
 856         if (!prog->check_substr && prog->check_utf8) {
 857             if (! to_byte_substr(prog)) {
 858                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 859             }
 860         }
 861         check = prog->check_substr;
 862     }
 863
 864     /* dump the various substring data */
 865     DEBUG_OPTIMISE_MORE_r({
 866         int i;
 867         for (i=0; i<=2; i++) {
 868             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 869                                   : prog->substrs->data[i].substr);
 870             if (!sv)
 871                 continue;
 872
 873             Perl_re_printf( aTHX_
 874                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 875                 " useful=%" IVdf " utf8=%d [%s]\n",
 876                 i,
 877                 (IV)prog->substrs->data[i].min_offset,
 878                 (IV)prog->substrs->data[i].max_offset,
 879                 (IV)prog->substrs->data[i].end_shift,
 880                 BmUSEFUL(sv),
 881                 utf8_target ? 1 : 0,
 882                 SvPEEK(sv));
 883         }
 884     });
 885
 886     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 887
 888         /* ml_anch: check after \n?
 889          *
 890          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 891          * with /.*.../, these flags will have been added by the
 892          * compiler:
 893          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 894          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 895          */
 896         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 897                    && !(prog->intflags & PREGf_IMPLICIT);
 898
 899         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 900             /* we are only allowed to match at BOS or \G */
 901
 902             /* trivially reject if there's a BOS anchor and we're not at BOS.
 903              *
 904              * Note that we don't try to do a similar quick reject for
 905              * \G, since generally the caller will have calculated strpos
 906              * based on pos() and gofs, so the string is already correctly
 907              * anchored by definition; and handling the exceptions would
 908              * be too fiddly (e.g. REXEC_IGNOREPOS).
 909              */
 910             if (   strpos != strbeg
 911                 && (prog->intflags & PREGf_ANCH_SBOL))
 912             {
 913                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 914                                 "  Not at start...\n"));
 915                 goto fail;
 916             }
 917
 918             /* in the presence of an anchor, the anchored (relative to the
 919              * start of the regex) substr must also be anchored relative
 920              * to strpos. So quickly reject if substr isn't found there.
 921              * This works for \G too, because the caller will already have
 922              * subtracted gofs from pos, and gofs is the offset from the
 923              * \G to the start of the regex. For example, in /.abc\Gdef/,
 924              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 925              * caller will have set strpos=pos()-4; we look for the substr
 926              * at position pos()-4+1, which lines up with the "a" */
 927
 928             if (prog->check_offset_min == prog->check_offset_max) {
 929                 /* Substring at constant offset from beg-of-str... */
 930                 SSize_t slen = SvCUR(check);
 931                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 932
 933                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 934                     "  Looking for check substr at fixed offset %" IVdf "...\n",
 935                     (IV)prog->check_offset_min));
 936
 937                 if (SvTAIL(check)) {
 938                     /* In this case, the regex is anchored at the end too.
 939                      * Unless it's a multiline match, the lengths must match
 940                      * exactly, give or take a \n.  NB: slen >= 1 since
 941                      * the last char of check is \n */
 942                     if (!multiline
 943                         && (   strend - s > slen
 944                             || strend - s < slen - 1
 945                             || (strend - s == slen && strend[-1] != '\n')))
 946                     {
 947                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 948                                             "  String too long...\n"));
 949                         goto fail_finish;
 950                     }
 951                     /* Now should match s[0..slen-2] */
 952                     slen--;
 953                 }
 954                 if (slen && (strend - s < slen
 955                     || *SvPVX_const(check) != *s
 956                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
 957                 {
 958                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 959                                     "  String not equal...\n"));
 960                     goto fail_finish;
 961                 }
 962
 963                 check_at = s;
 964                 goto success_at_start;
 965             }
 966         }
 967     }
 968
 969     end_shift = prog->check_end_shift;
 970
 971 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 972     if (end_shift < 0)
 973         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
 974                    (IV)end_shift, RX_PRECOMP(rx));
 975 #endif
 976
 977   restart:
 978
 979     /* This is the (re)entry point of the main loop in this function.
 980      * The goal of this loop is to:
 981      * 1) find the "check" substring in the region rx_origin..strend
 982      *    (adjusted by start_shift / end_shift). If not found, reject
 983      *    immediately.
 984      * 2) If it exists, look for the "other" substr too if defined; for
 985      *    example, if the check substr maps to the anchored substr, then
 986      *    check the floating substr, and vice-versa. If not found, go
 987      *    back to (1) with rx_origin suitably incremented.
 988      * 3) If we find an rx_origin position that doesn't contradict
 989      *    either of the substrings, then check the possible additional
 990      *    constraints on rx_origin of /^.../m or a known start class.
 991      *    If these fail, then depending on which constraints fail, jump
 992      *    back to here, or to various other re-entry points further along
 993      *    that skip some of the first steps.
 994      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 995      *    substring. If the start position was determined to be at the
 996      *    beginning of the string  - so, not rejected, but not optimised,
 997      *    since we have to run regmatch from position 0 - decrement the
 998      *    BmUSEFUL() count. Otherwise increment it.
 999      */
1000
1001
1002     /* first, look for the 'check' substring */
1003
1004     {
1005         U8* start_point;
1006         U8* end_point;
1007
1008         DEBUG_OPTIMISE_MORE_r({
1009             Perl_re_printf( aTHX_
1010                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1011                 " Start shift: %" IVdf " End shift %" IVdf
1012                 " Real end Shift: %" IVdf "\n",
1013                 (IV)(rx_origin - strbeg),
1014                 (IV)prog->check_offset_min,
1015                 (IV)start_shift,
1016                 (IV)end_shift,
1017                 (IV)prog->check_end_shift);
1018         });
1019
1020         end_point = HOPBACK3(strend, end_shift, rx_origin);
1021         if (!end_point)
1022             goto fail_finish;
1023         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1024         if (!start_point)
1025             goto fail_finish;
1026
1027
1028         /* If the regex is absolutely anchored to either the start of the
1029          * string (SBOL) or to pos() (ANCH_GPOS), then
1030          * check_offset_max represents an upper bound on the string where
1031          * the substr could start. For the ANCH_GPOS case, we assume that
1032          * the caller of intuit will have already set strpos to
1033          * pos()-gofs, so in this case strpos + offset_max will still be
1034          * an upper bound on the substr.
1035          */
1036         if (!ml_anch
1037             && prog->intflags & PREGf_ANCH
1038             && prog->check_offset_max != SSize_t_MAX)
1039         {
1040             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1041             const char * const anchor =
1042                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1043             SSize_t targ_len = (char*)end_point - anchor;
1044
1045             if (check_len > targ_len) {
1046                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1047                               "Anchored string too short...\n"));
1048                 goto fail_finish;
1049             }
1050
1051             /* do a bytes rather than chars comparison. It's conservative;
1052              * so it skips doing the HOP if the result can't possibly end
1053              * up earlier than the old value of end_point.
1054              */
1055             assert(anchor + check_len <= (char *)end_point);
1056             if (prog->check_offset_max + check_len < targ_len) {
1057                 end_point = HOP3lim((U8*)anchor,
1058                                 prog->check_offset_max,
1059                                 end_point - check_len
1060                             )
1061                             + check_len;
1062             }
1063         }
1064
1065         check_at = fbm_instr( start_point, end_point,
1066                       check, multiline ? FBMrf_MULTILINE : 0);
1067
1068         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1069             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1070             (IV)((char*)start_point - strbeg),
1071             (IV)((char*)end_point   - strbeg),
1072             (IV)(check_at ? check_at - strbeg : -1)
1073         ));
1074
1075         /* Update the count-of-usability, remove useless subpatterns,
1076             unshift s.  */
1077
1078         DEBUG_EXECUTE_r({
1079             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1080                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1081             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1082                               (check_at ? "Found" : "Did not find"),
1083                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1084                     ? "anchored" : "floating"),
1085                 quoted,
1086                 RE_SV_TAIL(check),
1087                 (check_at ? " at offset " : "...\n") );
1088         });
1089
1090         if (!check_at)
1091             goto fail_finish;
1092         /* set rx_origin to the minimum position where the regex could start
1093          * matching, given the constraint of the just-matched check substring.
1094          * But don't set it lower than previously.
1095          */
1096
1097         if (check_at - rx_origin > prog->check_offset_max)
1098             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1099         /* Finish the diagnostic message */
1100         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1101             "%ld (rx_origin now %" IVdf ")...\n",
1102             (long)(check_at - strbeg),
1103             (IV)(rx_origin - strbeg)
1104         ));
1105     }
1106
1107
1108     /* now look for the 'other' substring if defined */
1109
1110     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1111                     : prog->substrs->data[other_ix].substr)
1112     {
1113         /* Take into account the "other" substring. */
1114         char *last, *last1;
1115         char *s;
1116         SV* must;
1117         struct reg_substr_datum *other;
1118
1119       do_other_substr:
1120         other = &prog->substrs->data[other_ix];
1121
1122         /* if "other" is anchored:
1123          * we've previously found a floating substr starting at check_at.
1124          * This means that the regex origin must lie somewhere
1125          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1126          * and max:                 HOP3(check_at, -check_offset_min)
1127          * (except that min will be >= strpos)
1128          * So the fixed  substr must lie somewhere between
1129          *  HOP3(min, anchored_offset)
1130          *  HOP3(max, anchored_offset) + SvCUR(substr)
1131          */
1132
1133         /* if "other" is floating
1134          * Calculate last1, the absolute latest point where the
1135          * floating substr could start in the string, ignoring any
1136          * constraints from the earlier fixed match. It is calculated
1137          * as follows:
1138          *
1139          * strend - prog->minlen (in chars) is the absolute latest
1140          * position within the string where the origin of the regex
1141          * could appear. The latest start point for the floating
1142          * substr is float_min_offset(*) on from the start of the
1143          * regex.  last1 simply combines thee two offsets.
1144          *
1145          * (*) You might think the latest start point should be
1146          * float_max_offset from the regex origin, and technically
1147          * you'd be correct. However, consider
1148          *    /a\d{2,4}bcd\w/
1149          * Here, float min, max are 3,5 and minlen is 7.
1150          * This can match either
1151          *    /a\d\dbcd\w/
1152          *    /a\d\d\dbcd\w/
1153          *    /a\d\d\d\dbcd\w/
1154          * In the first case, the regex matches minlen chars; in the
1155          * second, minlen+1, in the third, minlen+2.
1156          * In the first case, the floating offset is 3 (which equals
1157          * float_min), in the second, 4, and in the third, 5 (which
1158          * equals float_max). In all cases, the floating string bcd
1159          * can never start more than 4 chars from the end of the
1160          * string, which equals minlen - float_min. As the substring
1161          * starts to match more than float_min from the start of the
1162          * regex, it makes the regex match more than minlen chars,
1163          * and the two cancel each other out. So we can always use
1164          * float_min - minlen, rather than float_max - minlen for the
1165          * latest position in the string.
1166          *
1167          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1168          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1169          */
1170
1171         assert(prog->minlen >= other->min_offset);
1172         last1 = HOP3c(strend,
1173                         other->min_offset - prog->minlen, strbeg);
1174
1175         if (other_ix) {/* i.e. if (other-is-float) */
1176             /* last is the latest point where the floating substr could
1177              * start, *given* any constraints from the earlier fixed
1178              * match. This constraint is that the floating string starts
1179              * <= float_max_offset chars from the regex origin (rx_origin).
1180              * If this value is less than last1, use it instead.
1181              */
1182             assert(rx_origin <= last1);
1183             last =
1184                 /* this condition handles the offset==infinity case, and
1185                  * is a short-cut otherwise. Although it's comparing a
1186                  * byte offset to a char length, it does so in a safe way,
1187                  * since 1 char always occupies 1 or more bytes,
1188                  * so if a string range is  (last1 - rx_origin) bytes,
1189                  * it will be less than or equal to  (last1 - rx_origin)
1190                  * chars; meaning it errs towards doing the accurate HOP3
1191                  * rather than just using last1 as a short-cut */
1192                 (last1 - rx_origin) < other->max_offset
1193                     ? last1
1194                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1195         }
1196         else {
1197             assert(strpos + start_shift <= check_at);
1198             last = HOP4c(check_at, other->min_offset - start_shift,
1199                         strbeg, strend);
1200         }
1201
1202         s = HOP3c(rx_origin, other->min_offset, strend);
1203         if (s < other_last)     /* These positions already checked */
1204             s = other_last;
1205
1206         must = utf8_target ? other->utf8_substr : other->substr;
1207         assert(SvPOK(must));
1208         {
1209             char *from = s;
1210             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1211
1212             if (to > strend)
1213                 to = strend;
1214             if (from > to) {
1215                 s = NULL;
1216                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1217                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1218                     (IV)(from - strbeg),
1219                     (IV)(to   - strbeg)
1220                 ));
1221             }
1222             else {
1223                 s = fbm_instr(
1224                     (unsigned char*)from,
1225                     (unsigned char*)to,
1226                     must,
1227                     multiline ? FBMrf_MULTILINE : 0
1228                 );
1229                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1230                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1231                     (IV)(from - strbeg),
1232                     (IV)(to   - strbeg),
1233                     (IV)(s ? s - strbeg : -1)
1234                 ));
1235             }
1236         }
1237
1238         DEBUG_EXECUTE_r({
1239             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1240                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1241             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1242                 s ? "Found" : "Contradicts",
1243                 other_ix ? "floating" : "anchored",
1244                 quoted, RE_SV_TAIL(must));
1245         });
1246
1247
1248         if (!s) {
1249             /* last1 is latest possible substr location. If we didn't
1250              * find it before there, we never will */
1251             if (last >= last1) {
1252                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1253                                         "; giving up...\n"));
1254                 goto fail_finish;
1255             }
1256
1257             /* try to find the check substr again at a later
1258              * position. Maybe next time we'll find the "other" substr
1259              * in range too */
1260             other_last = HOP3c(last, 1, strend) /* highest failure */;
1261             rx_origin =
1262                 other_ix /* i.e. if other-is-float */
1263                     ? HOP3c(rx_origin, 1, strend)
1264                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1265             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1266                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1267                 (other_ix ? "floating" : "anchored"),
1268                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1269                 (IV)(rx_origin - strbeg)
1270             ));
1271             goto restart;
1272         }
1273         else {
1274             if (other_ix) { /* if (other-is-float) */
1275                 /* other_last is set to s, not s+1, since its possible for
1276                  * a floating substr to fail first time, then succeed
1277                  * second time at the same floating position; e.g.:
1278                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1279                  * The first time round, anchored and float match at
1280                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1281                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1282                  */
1283                 other_last = s;
1284             }
1285             else {
1286                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1287                 other_last = HOP3c(s, 1, strend);
1288             }
1289             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1290                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1291                   (long)(s - strbeg),
1292                 (IV)(rx_origin - strbeg)
1293               ));
1294
1295         }
1296     }
1297     else {
1298         DEBUG_OPTIMISE_MORE_r(
1299             Perl_re_printf( aTHX_
1300                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1301                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1302                 " strend:%" IVdf "\n",
1303                 (IV)prog->check_offset_min,
1304                 (IV)prog->check_offset_max,
1305                 (IV)(check_at-strbeg),
1306                 (IV)(rx_origin-strbeg),
1307                 (IV)(rx_origin-check_at),
1308                 (IV)(strend-strbeg)
1309             )
1310         );
1311     }
1312
1313   postprocess_substr_matches:
1314
1315     /* handle the extra constraint of /^.../m if present */
1316
1317     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1318         char *s;
1319
1320         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1321                         "  looking for /^/m anchor"));
1322
1323         /* we have failed the constraint of a \n before rx_origin.
1324          * Find the next \n, if any, even if it's beyond the current
1325          * anchored and/or floating substrings. Whether we should be
1326          * scanning ahead for the next \n or the next substr is debatable.
1327          * On the one hand you'd expect rare substrings to appear less
1328          * often than \n's. On the other hand, searching for \n means
1329          * we're effectively flipping between check_substr and "\n" on each
1330          * iteration as the current "rarest" string candidate, which
1331          * means for example that we'll quickly reject the whole string if
1332          * hasn't got a \n, rather than trying every substr position
1333          * first
1334          */
1335
1336         s = HOP3c(strend, - prog->minlen, strpos);
1337         if (s <= rx_origin ||
1338             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1339         {
1340             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1341                             "  Did not find /%s^%s/m...\n",
1342                             PL_colors[0], PL_colors[1]));
1343             goto fail_finish;
1344         }
1345
1346         /* earliest possible origin is 1 char after the \n.
1347          * (since *rx_origin == '\n', it's safe to ++ here rather than
1348          * HOP(rx_origin, 1)) */
1349         rx_origin++;
1350
1351         if (prog->substrs->check_ix == 0  /* check is anchored */
1352             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1353         {
1354             /* Position contradicts check-string; either because
1355              * check was anchored (and thus has no wiggle room),
1356              * or check was float and rx_origin is above the float range */
1357             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1358                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1359                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1360             goto restart;
1361         }
1362
1363         /* if we get here, the check substr must have been float,
1364          * is in range, and we may or may not have had an anchored
1365          * "other" substr which still contradicts */
1366         assert(prog->substrs->check_ix); /* check is float */
1367
1368         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1369             /* whoops, the anchored "other" substr exists, so we still
1370              * contradict. On the other hand, the float "check" substr
1371              * didn't contradict, so just retry the anchored "other"
1372              * substr */
1373             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1374                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1375                 PL_colors[0], PL_colors[1],
1376                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1377                 (IV)(rx_origin - strbeg)
1378             ));
1379             goto do_other_substr;
1380         }
1381
1382         /* success: we don't contradict the found floating substring
1383          * (and there's no anchored substr). */
1384         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1385             "  Found /%s^%s/m with rx_origin %ld...\n",
1386             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1387     }
1388     else {
1389         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1390             "  (multiline anchor test skipped)\n"));
1391     }
1392
1393   success_at_start:
1394
1395
1396     /* if we have a starting character class, then test that extra constraint.
1397      * (trie stclasses are too expensive to use here, we are better off to
1398      * leave it to regmatch itself) */
1399
1400     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1401         const U8* const str = (U8*)STRING(progi->regstclass);
1402
1403         /* XXX this value could be pre-computed */
1404         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1405                     ?  (reginfo->is_utf8_pat
1406                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1407                         : STR_LEN(progi->regstclass))
1408                     : 1);
1409         char * endpos;
1410         char *s;
1411         /* latest pos that a matching float substr constrains rx start to */
1412         char *rx_max_float = NULL;
1413
1414         /* if the current rx_origin is anchored, either by satisfying an
1415          * anchored substring constraint, or a /^.../m constraint, then we
1416          * can reject the current origin if the start class isn't found
1417          * at the current position. If we have a float-only match, then
1418          * rx_origin is constrained to a range; so look for the start class
1419          * in that range. if neither, then look for the start class in the
1420          * whole rest of the string */
1421
1422         /* XXX DAPM it's not clear what the minlen test is for, and why
1423          * it's not used in the floating case. Nothing in the test suite
1424          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1425          * Here are some old comments, which may or may not be correct:
1426          *
1427          *   minlen == 0 is possible if regstclass is \b or \B,
1428          *   and the fixed substr is ''$.
1429          *   Since minlen is already taken into account, rx_origin+1 is
1430          *   before strend; accidentally, minlen >= 1 guaranties no false
1431          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1432          *   0) below assumes that regstclass does not come from lookahead...
1433          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1434          *   This leaves EXACTF-ish only, which are dealt with in
1435          *   find_byclass().
1436          */
1437
1438         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1439             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1440         else if (prog->float_substr || prog->float_utf8) {
1441             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1442             endpos = HOP3clim(rx_max_float, cl_l, strend);
1443         }
1444         else
1445             endpos= strend;
1446
1447         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1448             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1449             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1450               (IV)start_shift, (IV)(check_at - strbeg),
1451               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1452
1453         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1454                             reginfo);
1455         if (!s) {
1456             if (endpos == strend) {
1457                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1458                                 "  Could not match STCLASS...\n") );
1459                 goto fail;
1460             }
1461             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1462                                "  This position contradicts STCLASS...\n") );
1463             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1464                         && !(prog->intflags & PREGf_IMPLICIT))
1465                 goto fail;
1466
1467             /* Contradict one of substrings */
1468             if (prog->anchored_substr || prog->anchored_utf8) {
1469                 if (prog->substrs->check_ix == 1) { /* check is float */
1470                     /* Have both, check_string is floating */
1471                     assert(rx_origin + start_shift <= check_at);
1472                     if (rx_origin + start_shift != check_at) {
1473                         /* not at latest position float substr could match:
1474                          * Recheck anchored substring, but not floating.
1475                          * The condition above is in bytes rather than
1476                          * chars for efficiency. It's conservative, in
1477                          * that it errs on the side of doing 'goto
1478                          * do_other_substr'. In this case, at worst,
1479                          * an extra anchored search may get done, but in
1480                          * practice the extra fbm_instr() is likely to
1481                          * get skipped anyway. */
1482                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1483                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1484                             (long)(other_last - strbeg),
1485                             (IV)(rx_origin - strbeg)
1486                         ));
1487                         goto do_other_substr;
1488                     }
1489                 }
1490             }
1491             else {
1492                 /* float-only */
1493
1494                 if (ml_anch) {
1495                     /* In the presence of ml_anch, we might be able to
1496                      * find another \n without breaking the current float
1497                      * constraint. */
1498
1499                     /* strictly speaking this should be HOP3c(..., 1, ...),
1500                      * but since we goto a block of code that's going to
1501                      * search for the next \n if any, its safe here */
1502                     rx_origin++;
1503                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1504                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1505                               PL_colors[0], PL_colors[1],
1506                               (long)(rx_origin - strbeg)) );
1507                     goto postprocess_substr_matches;
1508                 }
1509
1510                 /* strictly speaking this can never be true; but might
1511                  * be if we ever allow intuit without substrings */
1512                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1513                     goto fail;
1514
1515                 rx_origin = rx_max_float;
1516             }
1517
1518             /* at this point, any matching substrings have been
1519              * contradicted. Start again... */
1520
1521             rx_origin = HOP3c(rx_origin, 1, strend);
1522
1523             /* uses bytes rather than char calculations for efficiency.
1524              * It's conservative: it errs on the side of doing 'goto restart',
1525              * where there is code that does a proper char-based test */
1526             if (rx_origin + start_shift + end_shift > strend) {
1527                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1528                                        "  Could not match STCLASS...\n") );
1529                 goto fail;
1530             }
1531             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1532                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1533                 (prog->substrs->check_ix ? "floating" : "anchored"),
1534                 (long)(rx_origin + start_shift - strbeg),
1535                 (IV)(rx_origin - strbeg)
1536             ));
1537             goto restart;
1538         }
1539
1540         /* Success !!! */
1541
1542         if (rx_origin != s) {
1543             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1544                         "  By STCLASS: moving %ld --> %ld\n",
1545                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1546                    );
1547         }
1548         else {
1549             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1550                                   "  Does not contradict STCLASS...\n");
1551                    );
1552         }
1553     }
1554
1555     /* Decide whether using the substrings helped */
1556
1557     if (rx_origin != strpos) {
1558         /* Fixed substring is found far enough so that the match
1559            cannot start at strpos. */
1560
1561         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1562         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1563     }
1564     else {
1565         /* The found rx_origin position does not prohibit matching at
1566          * strpos, so calling intuit didn't gain us anything. Decrement
1567          * the BmUSEFUL() count on the check substring, and if we reach
1568          * zero, free it.  */
1569         if (!(prog->intflags & PREGf_NAUGHTY)
1570             && (utf8_target ? (
1571                 prog->check_utf8                /* Could be deleted already */
1572                 && --BmUSEFUL(prog->check_utf8) < 0
1573                 && (prog->check_utf8 == prog->float_utf8)
1574             ) : (
1575                 prog->check_substr              /* Could be deleted already */
1576                 && --BmUSEFUL(prog->check_substr) < 0
1577                 && (prog->check_substr == prog->float_substr)
1578             )))
1579         {
1580             /* If flags & SOMETHING - do not do it many times on the same match */
1581             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1582             /* XXX Does the destruction order has to change with utf8_target? */
1583             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1584             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1585             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1586             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1587             check = NULL;                       /* abort */
1588             /* XXXX This is a remnant of the old implementation.  It
1589                     looks wasteful, since now INTUIT can use many
1590                     other heuristics. */
1591             prog->extflags &= ~RXf_USE_INTUIT;
1592         }
1593     }
1594
1595     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1596             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1597              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1598
1599     return rx_origin;
1600
1601   fail_finish:                          /* Substring not found */
1602     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1603         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1604   fail:
1605     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1606                           PL_colors[4], PL_colors[5]));
1607     return NULL;
1608 }
1609
1610
1611 #define DECL_TRIE_TYPE(scan) \
1612     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1613                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1614                  trie_utf8l, trie_flu8 }                                            \
1615                     trie_type = ((scan->flags == EXACT)                             \
1616                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1617                                  : (scan->flags == EXACTL)                          \
1618                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1619                                     : (scan->flags == EXACTFA)                      \
1620                                       ? (utf8_target                                \
1621                                          ? trie_utf8_exactfa_fold                   \
1622                                          : trie_latin_utf8_exactfa_fold)            \
1623                                       : (scan->flags == EXACTFLU8                   \
1624                                          ? trie_flu8                                \
1625                                          : (utf8_target                             \
1626                                            ? trie_utf8_fold                         \
1627                                            :   trie_latin_utf8_fold)))
1628
1629 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1630 STMT_START {                                                                        \
1631     STRLEN skiplen;                                                                 \
1632     U8 flags = FOLD_FLAGS_FULL;                                                     \
1633     switch (trie_type) {                                                            \
1634     case trie_flu8:                                                                 \
1635         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1636         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1637             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1638         }                                                                           \
1639         goto do_trie_utf8_fold;                                                     \
1640     case trie_utf8_exactfa_fold:                                                    \
1641         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1642         /* FALLTHROUGH */                                                           \
1643     case trie_utf8_fold:                                                            \
1644       do_trie_utf8_fold:                                                            \
1645         if ( foldlen>0 ) {                                                          \
1646             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1647             foldlen -= len;                                                         \
1648             uscan += len;                                                           \
1649             len=0;                                                                  \
1650         } else {                                                                    \
1651             len = UTF8SKIP(uc);                                                     \
1652             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen,  \
1653                                                                             flags); \
1654             skiplen = UVCHR_SKIP( uvc );                                            \
1655             foldlen -= skiplen;                                                     \
1656             uscan = foldbuf + skiplen;                                              \
1657         }                                                                           \
1658         break;                                                                      \
1659     case trie_latin_utf8_exactfa_fold:                                              \
1660         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1661         /* FALLTHROUGH */                                                           \
1662     case trie_latin_utf8_fold:                                                      \
1663         if ( foldlen>0 ) {                                                          \
1664             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1665             foldlen -= len;                                                         \
1666             uscan += len;                                                           \
1667             len=0;                                                                  \
1668         } else {                                                                    \
1669             len = 1;                                                                \
1670             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1671             skiplen = UVCHR_SKIP( uvc );                                            \
1672             foldlen -= skiplen;                                                     \
1673             uscan = foldbuf + skiplen;                                              \
1674         }                                                                           \
1675         break;                                                                      \
1676     case trie_utf8l:                                                                \
1677         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1678         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1679             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1680         }                                                                           \
1681         /* FALLTHROUGH */                                                           \
1682     case trie_utf8:                                                                 \
1683         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1684         break;                                                                      \
1685     case trie_plain:                                                                \
1686         uvc = (UV)*uc;                                                              \
1687         len = 1;                                                                    \
1688     }                                                                               \
1689     if (uvc < 256) {                                                                \
1690         charid = trie->charmap[ uvc ];                                              \
1691     }                                                                               \
1692     else {                                                                          \
1693         charid = 0;                                                                 \
1694         if (widecharmap) {                                                          \
1695             SV** const svpp = hv_fetch(widecharmap,                                 \
1696                         (char*)&uvc, sizeof(UV), 0);                                \
1697             if (svpp)                                                               \
1698                 charid = (U16)SvIV(*svpp);                                          \
1699         }                                                                           \
1700     }                                                                               \
1701 } STMT_END
1702
1703 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1704     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1705                 startpos, doutf8, depth)
1706
1707 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1708 STMT_START {                                              \
1709     while (s <= e) {                                      \
1710         if ( (COND)                                       \
1711              && (ln == 1 || folder(s, pat_string, ln))    \
1712              && (reginfo->intuit || regtry(reginfo, &s)) )\
1713             goto got_it;                                  \
1714         s++;                                              \
1715     }                                                     \
1716 } STMT_END
1717
1718 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1719 STMT_START {                                          \
1720     while (s < strend) {                              \
1721         CODE                                          \
1722         s += UTF8SKIP(s);                             \
1723     }                                                 \
1724 } STMT_END
1725
1726 #define REXEC_FBC_SCAN(CODE)                          \
1727 STMT_START {                                          \
1728     while (s < strend) {                              \
1729         CODE                                          \
1730         s++;                                          \
1731     }                                                 \
1732 } STMT_END
1733
1734 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1735 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1736     if (COND) {                                                \
1737         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1738             goto got_it;                                       \
1739         else                                                   \
1740             tmp = doevery;                                     \
1741     }                                                          \
1742     else                                                       \
1743         tmp = 1;                                               \
1744 )
1745
1746 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1747 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1748     if (COND) {                                                \
1749         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1750             goto got_it;                                       \
1751         else                                                   \
1752             tmp = doevery;                                     \
1753     }                                                          \
1754     else                                                       \
1755         tmp = 1;                                               \
1756 )
1757
1758 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1759     if (utf8_target) {                                         \
1760         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1761     }                                                          \
1762     else {                                                     \
1763         REXEC_FBC_CLASS_SCAN(COND);                            \
1764     }
1765
1766 /* The three macros below are slightly different versions of the same logic.
1767  *
1768  * The first is for /a and /aa when the target string is UTF-8.  This can only
1769  * match ascii, but it must advance based on UTF-8.   The other two handle the
1770  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1771  * for the boundary (or non-boundary) between a word and non-word character.
1772  * The utf8 and non-utf8 cases have the same logic, but the details must be
1773  * different.  Find the "wordness" of the character just prior to this one, and
1774  * compare it with the wordness of this one.  If they differ, we have a
1775  * boundary.  At the beginning of the string, pretend that the previous
1776  * character was a new-line.
1777  *
1778  * All these macros uncleanly have side-effects with each other and outside
1779  * variables.  So far it's been too much trouble to clean-up
1780  *
1781  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1782  *               a word character or not.
1783  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1784  *               word/non-word
1785  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1786  *
1787  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1788  * are looking for a boundary or for a non-boundary.  If we are looking for a
1789  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1790  * see if this tentative match actually works, and if so, to quit the loop
1791  * here.  And vice-versa if we are looking for a non-boundary.
1792  *
1793  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1794  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1795  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1796  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1797  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1798  * complement.  But in that branch we complement tmp, meaning that at the
1799  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1800  * which means at the top of the loop in the next iteration, it is
1801  * TEST_NON_UTF8(s-1) */
1802 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1803     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1804     tmp = TEST_NON_UTF8(tmp);                                                  \
1805     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1806         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1807             tmp = !tmp;                                                        \
1808             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1809         }                                                                      \
1810         else {                                                                 \
1811             IF_FAIL;                                                           \
1812         }                                                                      \
1813     );                                                                         \
1814
1815 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1816  * TEST_UTF8 is a macro that for the same input code points returns identically
1817  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1818 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1819     if (s == reginfo->strbeg) {                                                \
1820         tmp = '\n';                                                            \
1821     }                                                                          \
1822     else { /* Back-up to the start of the previous character */                \
1823         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1824         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1825                                                        0, UTF8_ALLOW_DEFAULT); \
1826     }                                                                          \
1827     tmp = TEST_UV(tmp);                                                        \
1828     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1829     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1830         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1831             tmp = !tmp;                                                        \
1832             IF_SUCCESS;                                                        \
1833         }                                                                      \
1834         else {                                                                 \
1835             IF_FAIL;                                                           \
1836         }                                                                      \
1837     );
1838
1839 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1840  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1841  * macros below */
1842 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1843     if (utf8_target) {                                                         \
1844         UTF8_CODE                                                              \
1845     }                                                                          \
1846     else {  /* Not utf8 */                                                     \
1847         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1848         tmp = TEST_NON_UTF8(tmp);                                              \
1849         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1850             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1851                 IF_SUCCESS;                                                    \
1852                 tmp = !tmp;                                                    \
1853             }                                                                  \
1854             else {                                                             \
1855                 IF_FAIL;                                                       \
1856             }                                                                  \
1857         );                                                                     \
1858     }                                                                          \
1859     /* Here, things have been set up by the previous code so that tmp is the   \
1860      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1861      * utf8ness of the target).  We also have to check if this matches against \
1862      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1863      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1864      * string */                                                               \
1865     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1866         IF_SUCCESS;                                                            \
1867     }                                                                          \
1868     else {                                                                     \
1869         IF_FAIL;                                                               \
1870     }
1871
1872 /* This is the macro to use when we want to see if something that looks like it
1873  * could match, actually does, and if so exits the loop */
1874 #define REXEC_FBC_TRYIT                            \
1875     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1876         goto got_it
1877
1878 /* The only difference between the BOUND and NBOUND cases is that
1879  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1880  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1881  * with the other one being empty (PLACEHOLDER is defined as empty).
1882  *
1883  * The TEST_FOO parameters are for operating on different forms of input, but
1884  * all should be ones that return identically for the same underlying code
1885  * points */
1886 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1887     FBC_BOUND_COMMON(                                                          \
1888           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1889           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1890
1891 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1892     FBC_BOUND_COMMON(                                                          \
1893             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1894             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1895
1896 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1897     FBC_BOUND_COMMON(                                                          \
1898           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1899           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1900
1901 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1902     FBC_BOUND_COMMON(                                                          \
1903             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1904             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1905
1906 #ifdef DEBUGGING
1907 static IV
1908 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1909   IV cp_out = Perl__invlist_search(invlist, cp_in);
1910   assert(cp_out >= 0);
1911   return cp_out;
1912 }
1913 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1914         invmap[S_get_break_val_cp_checked(invlist, cp)]
1915 #else
1916 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1917         invmap[_invlist_search(invlist, cp)]
1918 #endif
1919
1920 /* Takes a pointer to an inversion list, a pointer to its corresponding
1921  * inversion map, and a code point, and returns the code point's value
1922  * according to the two arrays.  It assumes that all code points have a value.
1923  * This is used as the base macro for macros for particular properties */
1924 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1925         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1926
1927 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1928  * of a code point, returning the value for the first code point in the string.
1929  * And it takes the particular macro name that finds the desired value given a
1930  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1931 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1932              (__ASSERT_(pos < strend)                                          \
1933                  /* Note assumes is valid UTF-8 */                             \
1934              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1935
1936 /* Returns the GCB value for the input code point */
1937 #define getGCB_VAL_CP(cp)                                                      \
1938           _generic_GET_BREAK_VAL_CP(                                           \
1939                                     PL_GCB_invlist,                            \
1940                                     _Perl_GCB_invmap,                          \
1941                                     (cp))
1942
1943 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1944  * bounded by pos and strend */
1945 #define getGCB_VAL_UTF8(pos, strend)                                           \
1946     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1947
1948 /* Returns the LB value for the input code point */
1949 #define getLB_VAL_CP(cp)                                                       \
1950           _generic_GET_BREAK_VAL_CP(                                           \
1951                                     PL_LB_invlist,                             \
1952                                     _Perl_LB_invmap,                           \
1953                                     (cp))
1954
1955 /* Returns the LB value for the first code point in the UTF-8 encoded string
1956  * bounded by pos and strend */
1957 #define getLB_VAL_UTF8(pos, strend)                                            \
1958     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1959
1960
1961 /* Returns the SB value for the input code point */
1962 #define getSB_VAL_CP(cp)                                                       \
1963           _generic_GET_BREAK_VAL_CP(                                           \
1964                                     PL_SB_invlist,                             \
1965                                     _Perl_SB_invmap,                     \
1966                                     (cp))
1967
1968 /* Returns the SB value for the first code point in the UTF-8 encoded string
1969  * bounded by pos and strend */
1970 #define getSB_VAL_UTF8(pos, strend)                                            \
1971     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1972
1973 /* Returns the WB value for the input code point */
1974 #define getWB_VAL_CP(cp)                                                       \
1975           _generic_GET_BREAK_VAL_CP(                                           \
1976                                     PL_WB_invlist,                             \
1977                                     _Perl_WB_invmap,                         \
1978                                     (cp))
1979
1980 /* Returns the WB value for the first code point in the UTF-8 encoded string
1981  * bounded by pos and strend */
1982 #define getWB_VAL_UTF8(pos, strend)                                            \
1983     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1984
1985 /* We know what class REx starts with.  Try to find this position... */
1986 /* if reginfo->intuit, its a dryrun */
1987 /* annoyingly all the vars in this routine have different names from their counterparts
1988    in regmatch. /grrr */
1989 STATIC char *
1990 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1991     const char *strend, regmatch_info *reginfo)
1992 {
1993     dVAR;
1994     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1995     char *pat_string;   /* The pattern's exactish string */
1996     char *pat_end;          /* ptr to end char of pat_string */
1997     re_fold_t folder;   /* Function for computing non-utf8 folds */
1998     const U8 *fold_array;   /* array for folding ords < 256 */
1999     STRLEN ln;
2000     STRLEN lnc;
2001     U8 c1;
2002     U8 c2;
2003     char *e;
2004     I32 tmp = 1;        /* Scratch variable? */
2005     const bool utf8_target = reginfo->is_utf8_target;
2006     UV utf8_fold_flags = 0;
2007     const bool is_utf8_pat = reginfo->is_utf8_pat;
2008     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2009                                    with a result inverts that result, as 0^1 =
2010                                    1 and 1^1 = 0 */
2011     _char_class_number classnum;
2012
2013     RXi_GET_DECL(prog,progi);
2014
2015     PERL_ARGS_ASSERT_FIND_BYCLASS;
2016
2017     /* We know what class it must start with. */
2018     switch (OP(c)) {
2019     case ANYOFL:
2020         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2021
2022         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2023             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2024         }
2025
2026         /* FALLTHROUGH */
2027     case ANYOFD:
2028     case ANYOF:
2029         if (utf8_target) {
2030             REXEC_FBC_UTF8_CLASS_SCAN(
2031                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2032         }
2033         else if (ANYOF_FLAGS(c)) {
2034             REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2035         }
2036         else {
2037             REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2038         }
2039         break;
2040
2041     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
2042         assert(! is_utf8_pat);
2043         /* FALLTHROUGH */
2044     case EXACTFA:
2045         if (is_utf8_pat || utf8_target) {
2046             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2047             goto do_exactf_utf8;
2048         }
2049         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
2050         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
2051         goto do_exactf_non_utf8;        /* isn't dealt with by these */
2052
2053     case EXACTF:   /* This node only generated for non-utf8 patterns */
2054         assert(! is_utf8_pat);
2055         if (utf8_target) {
2056             utf8_fold_flags = 0;
2057             goto do_exactf_utf8;
2058         }
2059         fold_array = PL_fold;
2060         folder = foldEQ;
2061         goto do_exactf_non_utf8;
2062
2063     case EXACTFL:
2064         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2065         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2066             utf8_fold_flags = FOLDEQ_LOCALE;
2067             goto do_exactf_utf8;
2068         }
2069         fold_array = PL_fold_locale;
2070         folder = foldEQ_locale;
2071         goto do_exactf_non_utf8;
2072
2073     case EXACTFU_SS:
2074         if (is_utf8_pat) {
2075             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2076         }
2077         goto do_exactf_utf8;
2078
2079     case EXACTFLU8:
2080             if (! utf8_target) {    /* All code points in this node require
2081                                        UTF-8 to express.  */
2082                 break;
2083             }
2084             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2085                                              | FOLDEQ_S2_FOLDS_SANE;
2086             goto do_exactf_utf8;
2087
2088     case EXACTFU:
2089         if (is_utf8_pat || utf8_target) {
2090             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2091             goto do_exactf_utf8;
2092         }
2093
2094         /* Any 'ss' in the pattern should have been replaced by regcomp,
2095          * so we don't have to worry here about this single special case
2096          * in the Latin1 range */
2097         fold_array = PL_fold_latin1;
2098         folder = foldEQ_latin1;
2099
2100         /* FALLTHROUGH */
2101
2102       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2103                            are no glitches with fold-length differences
2104                            between the target string and pattern */
2105
2106         /* The idea in the non-utf8 EXACTF* cases is to first find the
2107          * first character of the EXACTF* node and then, if necessary,
2108          * case-insensitively compare the full text of the node.  c1 is the
2109          * first character.  c2 is its fold.  This logic will not work for
2110          * Unicode semantics and the german sharp ss, which hence should
2111          * not be compiled into a node that gets here. */
2112         pat_string = STRING(c);
2113         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2114
2115         /* We know that we have to match at least 'ln' bytes (which is the
2116          * same as characters, since not utf8).  If we have to match 3
2117          * characters, and there are only 2 availabe, we know without
2118          * trying that it will fail; so don't start a match past the
2119          * required minimum number from the far end */
2120         e = HOP3c(strend, -((SSize_t)ln), s);
2121         if (e < s)
2122             break;
2123
2124         c1 = *pat_string;
2125         c2 = fold_array[c1];
2126         if (c1 == c2) { /* If char and fold are the same */
2127             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
2128         }
2129         else {
2130             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
2131         }
2132         break;
2133
2134       do_exactf_utf8:
2135       {
2136         unsigned expansion;
2137
2138         /* If one of the operands is in utf8, we can't use the simpler folding
2139          * above, due to the fact that many different characters can have the
2140          * same fold, or portion of a fold, or different- length fold */
2141         pat_string = STRING(c);
2142         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2143         pat_end = pat_string + ln;
2144         lnc = is_utf8_pat       /* length to match in characters */
2145                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2146                 : ln;
2147
2148         /* We have 'lnc' characters to match in the pattern, but because of
2149          * multi-character folding, each character in the target can match
2150          * up to 3 characters (Unicode guarantees it will never exceed
2151          * this) if it is utf8-encoded; and up to 2 if not (based on the
2152          * fact that the Latin 1 folds are already determined, and the
2153          * only multi-char fold in that range is the sharp-s folding to
2154          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2155          * string character.  Adjust lnc accordingly, rounding up, so that
2156          * if we need to match at least 4+1/3 chars, that really is 5. */
2157         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2158         lnc = (lnc + expansion - 1) / expansion;
2159
2160         /* As in the non-UTF8 case, if we have to match 3 characters, and
2161          * only 2 are left, it's guaranteed to fail, so don't start a
2162          * match that would require us to go beyond the end of the string
2163          */
2164         e = HOP3c(strend, -((SSize_t)lnc), s);
2165
2166         /* XXX Note that we could recalculate e to stop the loop earlier,
2167          * as the worst case expansion above will rarely be met, and as we
2168          * go along we would usually find that e moves further to the left.
2169          * This would happen only after we reached the point in the loop
2170          * where if there were no expansion we should fail.  Unclear if
2171          * worth the expense */
2172
2173         while (s <= e) {
2174             char *my_strend= (char *)strend;
2175             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2176                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2177                 && (reginfo->intuit || regtry(reginfo, &s)) )
2178             {
2179                 goto got_it;
2180             }
2181             s += (utf8_target) ? UTF8SKIP(s) : 1;
2182         }
2183         break;
2184     }
2185
2186     case BOUNDL:
2187         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2188         if (FLAGS(c) != TRADITIONAL_BOUND) {
2189             if (! IN_UTF8_CTYPE_LOCALE) {
2190                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2191                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2192             }
2193             goto do_boundu;
2194         }
2195
2196         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2197         break;
2198
2199     case NBOUNDL:
2200         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2201         if (FLAGS(c) != TRADITIONAL_BOUND) {
2202             if (! IN_UTF8_CTYPE_LOCALE) {
2203                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2204                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2205             }
2206             goto do_nboundu;
2207         }
2208
2209         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2210         break;
2211
2212     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2213                    meaning */
2214         assert(FLAGS(c) == TRADITIONAL_BOUND);
2215
2216         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2217         break;
2218
2219     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2220                    meaning */
2221         assert(FLAGS(c) == TRADITIONAL_BOUND);
2222
2223         FBC_BOUND_A(isWORDCHAR_A);
2224         break;
2225
2226     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2227                    meaning */
2228         assert(FLAGS(c) == TRADITIONAL_BOUND);
2229
2230         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2231         break;
2232
2233     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2234                    meaning */
2235         assert(FLAGS(c) == TRADITIONAL_BOUND);
2236
2237         FBC_NBOUND_A(isWORDCHAR_A);
2238         break;
2239
2240     case NBOUNDU:
2241         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2242             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2243             break;
2244         }
2245
2246       do_nboundu:
2247
2248         to_complement = 1;
2249         /* FALLTHROUGH */
2250
2251     case BOUNDU:
2252       do_boundu:
2253         switch((bound_type) FLAGS(c)) {
2254             case TRADITIONAL_BOUND:
2255                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2256                 break;
2257             case GCB_BOUND:
2258                 if (s == reginfo->strbeg) {
2259                     if (reginfo->intuit || regtry(reginfo, &s))
2260                     {
2261                         goto got_it;
2262                     }
2263
2264                     /* Didn't match.  Try at the next position (if there is one) */
2265                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2266                     if (UNLIKELY(s >= reginfo->strend)) {
2267                         break;
2268                     }
2269                 }
2270
2271                 if (utf8_target) {
2272                     GCB_enum before = getGCB_VAL_UTF8(
2273                                                reghop3((U8*)s, -1,
2274                                                        (U8*)(reginfo->strbeg)),
2275                                                (U8*) reginfo->strend);
2276                     while (s < strend) {
2277                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2278                                                         (U8*) reginfo->strend);
2279                         if (   (to_complement ^ isGCB(before,
2280                                                       after,
2281                                                       (U8*) reginfo->strbeg,
2282                                                       (U8*) s,
2283                                                       utf8_target))
2284                             && (reginfo->intuit || regtry(reginfo, &s)))
2285                         {
2286                             goto got_it;
2287                         }
2288                         before = after;
2289                         s += UTF8SKIP(s);
2290                     }
2291                 }
2292                 else {  /* Not utf8.  Everything is a GCB except between CR and
2293                            LF */
2294                     while (s < strend) {
2295                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2296                                               || UCHARAT(s) != '\n'))
2297                             && (reginfo->intuit || regtry(reginfo, &s)))
2298                         {
2299                             goto got_it;
2300                         }
2301                         s++;
2302                     }
2303                 }
2304
2305                 /* And, since this is a bound, it can match after the final
2306                  * character in the string */
2307                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2308                     goto got_it;
2309                 }
2310                 break;
2311
2312             case LB_BOUND:
2313                 if (s == reginfo->strbeg) {
2314                     if (reginfo->intuit || regtry(reginfo, &s)) {
2315                         goto got_it;
2316                     }
2317                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2318                     if (UNLIKELY(s >= reginfo->strend)) {
2319                         break;
2320                     }
2321                 }
2322
2323                 if (utf8_target) {
2324                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2325                                                                -1,
2326                                                                (U8*)(reginfo->strbeg)),
2327                                                        (U8*) reginfo->strend);
2328                     while (s < strend) {
2329                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2330                         if (to_complement ^ isLB(before,
2331                                                  after,
2332                                                  (U8*) reginfo->strbeg,
2333                                                  (U8*) s,
2334                                                  (U8*) reginfo->strend,
2335                                                  utf8_target)
2336                             && (reginfo->intuit || regtry(reginfo, &s)))
2337                         {
2338                             goto got_it;
2339                         }
2340                         before = after;
2341                         s += UTF8SKIP(s);
2342                     }
2343                 }
2344                 else {  /* Not utf8. */
2345                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2346                     while (s < strend) {
2347                         LB_enum after = getLB_VAL_CP((U8) *s);
2348                         if (to_complement ^ isLB(before,
2349                                                  after,
2350                                                  (U8*) reginfo->strbeg,
2351                                                  (U8*) s,
2352                                                  (U8*) reginfo->strend,
2353                                                  utf8_target)
2354                             && (reginfo->intuit || regtry(reginfo, &s)))
2355                         {
2356                             goto got_it;
2357                         }
2358                         before = after;
2359                         s++;
2360                     }
2361                 }
2362
2363                 if (reginfo->intuit || regtry(reginfo, &s)) {
2364                     goto got_it;
2365                 }
2366
2367                 break;
2368
2369             case SB_BOUND:
2370                 if (s == reginfo->strbeg) {
2371                     if (reginfo->intuit || regtry(reginfo, &s)) {
2372                         goto got_it;
2373                     }
2374                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2375                     if (UNLIKELY(s >= reginfo->strend)) {
2376                         break;
2377                     }
2378                 }
2379
2380                 if (utf8_target) {
2381                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2382                                                         -1,
2383                                                         (U8*)(reginfo->strbeg)),
2384                                                       (U8*) reginfo->strend);
2385                     while (s < strend) {
2386                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2387                                                          (U8*) reginfo->strend);
2388                         if ((to_complement ^ isSB(before,
2389                                                   after,
2390                                                   (U8*) reginfo->strbeg,
2391                                                   (U8*) s,
2392                                                   (U8*) reginfo->strend,
2393                                                   utf8_target))
2394                             && (reginfo->intuit || regtry(reginfo, &s)))
2395                         {
2396                             goto got_it;
2397                         }
2398                         before = after;
2399                         s += UTF8SKIP(s);
2400                     }
2401                 }
2402                 else {  /* Not utf8. */
2403                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2404                     while (s < strend) {
2405                         SB_enum after = getSB_VAL_CP((U8) *s);
2406                         if ((to_complement ^ isSB(before,
2407                                                   after,
2408                                                   (U8*) reginfo->strbeg,
2409                                                   (U8*) s,
2410                                                   (U8*) reginfo->strend,
2411                                                   utf8_target))
2412                             && (reginfo->intuit || regtry(reginfo, &s)))
2413                         {
2414                             goto got_it;
2415                         }
2416                         before = after;
2417                         s++;
2418                     }
2419                 }
2420
2421                 /* Here are at the final position in the target string.  The SB
2422                  * value is always true here, so matches, depending on other
2423                  * constraints */
2424                 if (reginfo->intuit || regtry(reginfo, &s)) {
2425                     goto got_it;
2426                 }
2427
2428                 break;
2429
2430             case WB_BOUND:
2431                 if (s == reginfo->strbeg) {
2432                     if (reginfo->intuit || regtry(reginfo, &s)) {
2433                         goto got_it;
2434                     }
2435                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2436                     if (UNLIKELY(s >= reginfo->strend)) {
2437                         break;
2438                     }
2439                 }
2440
2441                 if (utf8_target) {
2442                     /* We are at a boundary between char_sub_0 and char_sub_1.
2443                      * We also keep track of the value for char_sub_-1 as we
2444                      * loop through the line.   Context may be needed to make a
2445                      * determination, and if so, this can save having to
2446                      * recalculate it */
2447                     WB_enum previous = WB_UNKNOWN;
2448                     WB_enum before = getWB_VAL_UTF8(
2449                                               reghop3((U8*)s,
2450                                                       -1,
2451                                                       (U8*)(reginfo->strbeg)),
2452                                               (U8*) reginfo->strend);
2453                     while (s < strend) {
2454                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2455                                                         (U8*) reginfo->strend);
2456                         if ((to_complement ^ isWB(previous,
2457                                                   before,
2458                                                   after,
2459                                                   (U8*) reginfo->strbeg,
2460                                                   (U8*) s,
2461                                                   (U8*) reginfo->strend,
2462                                                   utf8_target))
2463                             && (reginfo->intuit || regtry(reginfo, &s)))
2464                         {
2465                             goto got_it;
2466                         }
2467                         previous = before;
2468                         before = after;
2469                         s += UTF8SKIP(s);
2470                     }
2471                 }
2472                 else {  /* Not utf8. */
2473                     WB_enum previous = WB_UNKNOWN;
2474                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2475                     while (s < strend) {
2476                         WB_enum after = getWB_VAL_CP((U8) *s);
2477                         if ((to_complement ^ isWB(previous,
2478                                                   before,
2479                                                   after,
2480                                                   (U8*) reginfo->strbeg,
2481                                                   (U8*) s,
2482                                                   (U8*) reginfo->strend,
2483                                                   utf8_target))
2484                             && (reginfo->intuit || regtry(reginfo, &s)))
2485                         {
2486                             goto got_it;
2487                         }
2488                         previous = before;
2489                         before = after;
2490                         s++;
2491                     }
2492                 }
2493
2494                 if (reginfo->intuit || regtry(reginfo, &s)) {
2495                     goto got_it;
2496                 }
2497         }
2498         break;
2499
2500     case LNBREAK:
2501         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2502                         is_LNBREAK_latin1_safe(s, strend)
2503         );
2504         break;
2505
2506     case ASCII:
2507         s = find_next_ascii(s, strend, utf8_target);
2508         if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
2509             goto got_it;
2510         }
2511
2512         break;
2513
2514     case NASCII:
2515         s = find_next_non_ascii(s, strend, utf8_target);
2516         if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
2517             goto got_it;
2518         }
2519
2520         break;
2521
2522     /* The argument to all the POSIX node types is the class number to pass to
2523      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2524
2525     case NPOSIXL:
2526         to_complement = 1;
2527         /* FALLTHROUGH */
2528
2529     case POSIXL:
2530         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2531         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2532                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2533         break;
2534
2535     case NPOSIXD:
2536         to_complement = 1;
2537         /* FALLTHROUGH */
2538
2539     case POSIXD:
2540         if (utf8_target) {
2541             goto posix_utf8;
2542         }
2543         goto posixa;
2544
2545     case NPOSIXA:
2546         if (utf8_target) {
2547             /* The complement of something that matches only ASCII matches all
2548              * non-ASCII, plus everything in ASCII that isn't in the class. */
2549             REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
2550                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2551             break;
2552         }
2553
2554         to_complement = 1;
2555         goto posixa;
2556
2557     case POSIXA:
2558         /* Don't need to worry about utf8, as it can match only a single
2559          * byte invariant character.  But we do anyway for performance reasons,
2560          * as otherwise we would have to examine all the continuation
2561          * characters */
2562         if (utf8_target) {
2563             REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
2564             break;
2565         }
2566
2567       posixa:
2568         REXEC_FBC_CLASS_SCAN(
2569                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2570         break;
2571
2572     case NPOSIXU:
2573         to_complement = 1;
2574         /* FALLTHROUGH */
2575
2576     case POSIXU:
2577         if (! utf8_target) {
2578             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2579                                                                     FLAGS(c))));
2580         }
2581         else {
2582
2583           posix_utf8:
2584             classnum = (_char_class_number) FLAGS(c);
2585             if (classnum < _FIRST_NON_SWASH_CC) {
2586                 while (s < strend) {
2587
2588                     /* We avoid loading in the swash as long as possible, but
2589                      * should we have to, we jump to a separate loop.  This
2590                      * extra 'if' statement is what keeps this code from being
2591                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2592                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2593                         goto found_above_latin1;
2594                     }
2595                     if ((UTF8_IS_INVARIANT(*s)
2596                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2597                                                                 classnum)))
2598                         || (   UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
2599                             && to_complement ^ cBOOL(
2600                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2601                                                                       *(s + 1)),
2602                                               classnum))))
2603                     {
2604                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2605                             goto got_it;
2606                         else {
2607                             tmp = doevery;
2608                         }
2609                     }
2610                     else {
2611                         tmp = 1;
2612                     }
2613                     s += UTF8SKIP(s);
2614                 }
2615             }
2616             else switch (classnum) {    /* These classes are implemented as
2617                                            macros */
2618                 case _CC_ENUM_SPACE:
2619                     REXEC_FBC_UTF8_CLASS_SCAN(
2620                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2621                     break;
2622
2623                 case _CC_ENUM_BLANK:
2624                     REXEC_FBC_UTF8_CLASS_SCAN(
2625                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2626                     break;
2627
2628                 case _CC_ENUM_XDIGIT:
2629                     REXEC_FBC_UTF8_CLASS_SCAN(
2630                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2631                     break;
2632
2633                 case _CC_ENUM_VERTSPACE:
2634                     REXEC_FBC_UTF8_CLASS_SCAN(
2635                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2636                     break;
2637
2638                 case _CC_ENUM_CNTRL:
2639                     REXEC_FBC_UTF8_CLASS_SCAN(
2640                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2641                     break;
2642
2643                 default:
2644                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2645                     NOT_REACHED; /* NOTREACHED */
2646             }
2647         }
2648         break;
2649
2650       found_above_latin1:   /* Here we have to load a swash to get the result
2651                                for the current code point */
2652         if (! PL_utf8_swash_ptrs[classnum]) {
2653             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2654             PL_utf8_swash_ptrs[classnum] =
2655                     _core_swash_init("utf8",
2656                                      "",
2657                                      &PL_sv_undef, 1, 0,
2658                                      PL_XPosix_ptrs[classnum], &flags);
2659         }
2660
2661         /* This is a copy of the loop above for swash classes, though using the
2662          * FBC macro instead of being expanded out.  Since we've loaded the
2663          * swash, we don't have to check for that each time through the loop */
2664         REXEC_FBC_UTF8_CLASS_SCAN(
2665                 to_complement ^ cBOOL(_generic_utf8_safe(
2666                                       classnum,
2667                                       s,
2668                                       strend,
2669                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2670                                                   (U8 *) s, TRUE))));
2671         break;
2672
2673     case AHOCORASICKC:
2674     case AHOCORASICK:
2675         {
2676             DECL_TRIE_TYPE(c);
2677             /* what trie are we using right now */
2678             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2679             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2680             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2681
2682             const char *last_start = strend - trie->minlen;
2683 #ifdef DEBUGGING
2684             const char *real_start = s;
2685 #endif
2686             STRLEN maxlen = trie->maxlen;
2687             SV *sv_points;
2688             U8 **points; /* map of where we were in the input string
2689                             when reading a given char. For ASCII this
2690                             is unnecessary overhead as the relationship
2691                             is always 1:1, but for Unicode, especially
2692                             case folded Unicode this is not true. */
2693             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2694             U8 *bitmap=NULL;
2695
2696
2697             GET_RE_DEBUG_FLAGS_DECL;
2698
2699             /* We can't just allocate points here. We need to wrap it in
2700              * an SV so it gets freed properly if there is a croak while
2701              * running the match */
2702             ENTER;
2703             SAVETMPS;
2704             sv_points=newSV(maxlen * sizeof(U8 *));
2705             SvCUR_set(sv_points,
2706                 maxlen * sizeof(U8 *));
2707             SvPOK_on(sv_points);
2708             sv_2mortal(sv_points);
2709             points=(U8**)SvPV_nolen(sv_points );
2710             if ( trie_type != trie_utf8_fold
2711                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2712             {
2713                 if (trie->bitmap)
2714                     bitmap=(U8*)trie->bitmap;
2715                 else
2716                     bitmap=(U8*)ANYOF_BITMAP(c);
2717             }
2718             /* this is the Aho-Corasick algorithm modified a touch
2719                to include special handling for long "unknown char" sequences.
2720                The basic idea being that we use AC as long as we are dealing
2721                with a possible matching char, when we encounter an unknown char
2722                (and we have not encountered an accepting state) we scan forward
2723                until we find a legal starting char.
2724                AC matching is basically that of trie matching, except that when
2725                we encounter a failing transition, we fall back to the current
2726                states "fail state", and try the current char again, a process
2727                we repeat until we reach the root state, state 1, or a legal
2728                transition. If we fail on the root state then we can either
2729                terminate if we have reached an accepting state previously, or
2730                restart the entire process from the beginning if we have not.
2731
2732              */
2733             while (s <= last_start) {
2734                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2735                 U8 *uc = (U8*)s;
2736                 U16 charid = 0;
2737                 U32 base = 1;
2738                 U32 state = 1;
2739                 UV uvc = 0;
2740                 STRLEN len = 0;
2741                 STRLEN foldlen = 0;
2742                 U8 *uscan = (U8*)NULL;
2743                 U8 *leftmost = NULL;
2744 #ifdef DEBUGGING
2745                 U32 accepted_word= 0;
2746 #endif
2747                 U32 pointpos = 0;
2748
2749                 while ( state && uc <= (U8*)strend ) {
2750                     int failed=0;
2751                     U32 word = aho->states[ state ].wordnum;
2752
2753                     if( state==1 ) {
2754                         if ( bitmap ) {
2755                             DEBUG_TRIE_EXECUTE_r(
2756                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2757                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2758                                         (char *)uc, utf8_target, 0 );
2759                                     Perl_re_printf( aTHX_
2760                                         " Scanning for legal start char...\n");
2761                                 }
2762                             );
2763                             if (utf8_target) {
2764                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2765                                     uc += UTF8SKIP(uc);
2766                                 }
2767                             } else {
2768                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2769                                     uc++;
2770                                 }
2771                             }
2772                             s= (char *)uc;
2773                         }
2774                         if (uc >(U8*)last_start) break;
2775                     }
2776
2777                     if ( word ) {
2778                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2779                         if (!leftmost || lpos < leftmost) {
2780                             DEBUG_r(accepted_word=word);
2781                             leftmost= lpos;
2782                         }
2783                         if (base==0) break;
2784
2785                     }
2786                     points[pointpos++ % maxlen]= uc;
2787                     if (foldlen || uc < (U8*)strend) {
2788                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2789                                          widecharmap, uc,
2790                                          uscan, len, uvc, charid, foldlen,
2791                                          foldbuf, uniflags);
2792                         DEBUG_TRIE_EXECUTE_r({
2793                             dump_exec_pos( (char *)uc, c, strend,
2794                                         real_start, s, utf8_target, 0);
2795                             Perl_re_printf( aTHX_
2796                                 " Charid:%3u CP:%4" UVxf " ",
2797                                  charid, uvc);
2798                         });
2799                     }
2800                     else {
2801                         len = 0;
2802                         charid = 0;
2803                     }
2804
2805
2806                     do {
2807 #ifdef DEBUGGING
2808                         word = aho->states[ state ].wordnum;
2809 #endif
2810                         base = aho->states[ state ].trans.base;
2811
2812                         DEBUG_TRIE_EXECUTE_r({
2813                             if (failed)
2814                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2815                                     s,   utf8_target, 0 );
2816                             Perl_re_printf( aTHX_
2817                                 "%sState: %4" UVxf ", word=%" UVxf,
2818                                 failed ? " Fail transition to " : "",
2819                                 (UV)state, (UV)word);
2820                         });
2821                         if ( base ) {
2822                             U32 tmp;
2823                             I32 offset;
2824                             if (charid &&
2825                                  ( ((offset = base + charid
2826                                     - 1 - trie->uniquecharcount)) >= 0)
2827                                  && ((U32)offset < trie->lasttrans)
2828                                  && trie->trans[offset].check == state
2829                                  && (tmp=trie->trans[offset].next))
2830                             {
2831                                 DEBUG_TRIE_EXECUTE_r(
2832                                     Perl_re_printf( aTHX_ " - legal\n"));
2833                                 state = tmp;
2834                                 break;
2835                             }
2836                             else {
2837                                 DEBUG_TRIE_EXECUTE_r(
2838                                     Perl_re_printf( aTHX_ " - fail\n"));
2839                                 failed = 1;
2840                                 state = aho->fail[state];
2841                             }
2842                         }
2843                         else {
2844                             /* we must be accepting here */
2845                             DEBUG_TRIE_EXECUTE_r(
2846                                     Perl_re_printf( aTHX_ " - accepting\n"));
2847                             failed = 1;
2848                             break;
2849                         }
2850                     } while(state);
2851                     uc += len;
2852                     if (failed) {
2853                         if (leftmost)
2854                             break;
2855                         if (!state) state = 1;
2856                     }
2857                 }
2858                 if ( aho->states[ state ].wordnum ) {
2859                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2860                     if (!leftmost || lpos < leftmost) {
2861                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2862                         leftmost = lpos;
2863                     }
2864                 }
2865                 if (leftmost) {
2866                     s = (char*)leftmost;
2867                     DEBUG_TRIE_EXECUTE_r({
2868                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
2869                             (UV)accepted_word, (IV)(s - real_start)
2870                         );
2871                     });
2872                     if (reginfo->intuit || regtry(reginfo, &s)) {
2873                         FREETMPS;
2874                         LEAVE;
2875                         goto got_it;
2876                     }
2877                     s = HOPc(s,1);
2878                     DEBUG_TRIE_EXECUTE_r({
2879                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2880                     });
2881                 } else {
2882                     DEBUG_TRIE_EXECUTE_r(
2883                         Perl_re_printf( aTHX_ "No match.\n"));
2884                     break;
2885                 }
2886             }
2887             FREETMPS;
2888             LEAVE;
2889         }
2890         break;
2891     default:
2892         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2893     }
2894     return 0;
2895   got_it:
2896     return s;
2897 }
2898
2899 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2900  * flags have same meanings as with regexec_flags() */
2901
2902 static void
2903 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2904                             char *strbeg,
2905                             char *strend,
2906                             SV *sv,
2907                             U32 flags,
2908                             bool utf8_target)
2909 {
2910     struct regexp *const prog = ReANY(rx);
2911
2912     if (flags & REXEC_COPY_STR) {
2913 #ifdef PERL_ANY_COW
2914         if (SvCANCOW(sv)) {
2915             DEBUG_C(Perl_re_printf( aTHX_
2916                               "Copy on write: regexp capture, type %d\n",
2917                                     (int) SvTYPE(sv)));
2918             /* Create a new COW SV to share the match string and store
2919              * in saved_copy, unless the current COW SV in saved_copy
2920              * is valid and suitable for our purpose */
2921             if ((   prog->saved_copy
2922                  && SvIsCOW(prog->saved_copy)
2923                  && SvPOKp(prog->saved_copy)
2924                  && SvIsCOW(sv)
2925                  && SvPOKp(sv)
2926                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2927             {
2928                 /* just reuse saved_copy SV */
2929                 if (RXp_MATCH_COPIED(prog)) {
2930                     Safefree(prog->subbeg);
2931                     RXp_MATCH_COPIED_off(prog);
2932                 }
2933             }
2934             else {
2935                 /* create new COW SV to share string */
2936                 RXp_MATCH_COPY_FREE(prog);
2937                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2938             }
2939             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2940             assert (SvPOKp(prog->saved_copy));
2941             prog->sublen  = strend - strbeg;
2942             prog->suboffset = 0;
2943             prog->subcoffset = 0;
2944         } else
2945 #endif
2946         {
2947             SSize_t min = 0;
2948             SSize_t max = strend - strbeg;
2949             SSize_t sublen;
2950
2951             if (    (flags & REXEC_COPY_SKIP_POST)
2952                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2953                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2954             ) { /* don't copy $' part of string */
2955                 U32 n = 0;
2956                 max = -1;
2957                 /* calculate the right-most part of the string covered
2958                  * by a capture. Due to lookahead, this may be to
2959                  * the right of $&, so we have to scan all captures */
2960                 while (n <= prog->lastparen) {
2961                     if (prog->offs[n].end > max)
2962                         max = prog->offs[n].end;
2963                     n++;
2964                 }
2965                 if (max == -1)
2966                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2967                             ? prog->offs[0].start
2968                             : 0;
2969                 assert(max >= 0 && max <= strend - strbeg);
2970             }
2971
2972             if (    (flags & REXEC_COPY_SKIP_PRE)
2973                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2974                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2975             ) { /* don't copy $` part of string */
2976                 U32 n = 0;
2977                 min = max;
2978                 /* calculate the left-most part of the string covered
2979                  * by a capture. Due to lookbehind, this may be to
2980                  * the left of $&, so we have to scan all captures */
2981                 while (min && n <= prog->lastparen) {
2982                     if (   prog->offs[n].start != -1
2983                         && prog->offs[n].start < min)
2984                     {
2985                         min = prog->offs[n].start;
2986                     }
2987                     n++;
2988                 }
2989                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2990                     && min >  prog->offs[0].end
2991                 )
2992                     min = prog->offs[0].end;
2993
2994             }
2995
2996             assert(min >= 0 && min <= max && min <= strend - strbeg);
2997             sublen = max - min;
2998
2999             if (RXp_MATCH_COPIED(prog)) {
3000                 if (sublen > prog->sublen)
3001                     prog->subbeg =
3002                             (char*)saferealloc(prog->subbeg, sublen+1);
3003             }
3004             else
3005                 prog->subbeg = (char*)safemalloc(sublen+1);
3006             Copy(strbeg + min, prog->subbeg, sublen, char);
3007             prog->subbeg[sublen] = '\0';
3008             prog->suboffset = min;
3009             prog->sublen = sublen;
3010             RXp_MATCH_COPIED_on(prog);
3011         }
3012         prog->subcoffset = prog->suboffset;
3013         if (prog->suboffset && utf8_target) {
3014             /* Convert byte offset to chars.
3015              * XXX ideally should only compute this if @-/@+
3016              * has been seen, a la PL_sawampersand ??? */
3017
3018             /* If there's a direct correspondence between the
3019              * string which we're matching and the original SV,
3020              * then we can use the utf8 len cache associated with
3021              * the SV. In particular, it means that under //g,
3022              * sv_pos_b2u() will use the previously cached
3023              * position to speed up working out the new length of
3024              * subcoffset, rather than counting from the start of
3025              * the string each time. This stops
3026              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3027              * from going quadratic */
3028             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3029                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3030                                                 SV_GMAGIC|SV_CONST_RETURN);
3031             else
3032                 prog->subcoffset = utf8_length((U8*)strbeg,
3033                                     (U8*)(strbeg+prog->suboffset));
3034         }
3035     }
3036     else {
3037         RXp_MATCH_COPY_FREE(prog);
3038         prog->subbeg = strbeg;
3039         prog->suboffset = 0;
3040         prog->subcoffset = 0;
3041         prog->sublen = strend - strbeg;
3042     }
3043 }
3044
3045
3046
3047
3048 /*
3049  - regexec_flags - match a regexp against a string
3050  */
3051 I32
3052 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3053               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3054 /* stringarg: the point in the string at which to begin matching */
3055 /* strend:    pointer to null at end of string */
3056 /* strbeg:    real beginning of string */
3057 /* minend:    end of match must be >= minend bytes after stringarg. */
3058 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3059  *            itself is accessed via the pointers above */
3060 /* data:      May be used for some additional optimizations.
3061               Currently unused. */
3062 /* flags:     For optimizations. See REXEC_* in regexp.h */
3063
3064 {
3065     struct regexp *const prog = ReANY(rx);
3066     char *s;
3067     regnode *c;
3068     char *startpos;
3069     SSize_t minlen;             /* must match at least this many chars */
3070     SSize_t dontbother = 0;     /* how many characters not to try at end */
3071     const bool utf8_target = cBOOL(DO_UTF8(sv));
3072     I32 multiline;
3073     RXi_GET_DECL(prog,progi);
3074     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3075     regmatch_info *const reginfo = &reginfo_buf;
3076     regexp_paren_pair *swap = NULL;
3077     I32 oldsave;
3078     GET_RE_DEBUG_FLAGS_DECL;
3079
3080     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3081     PERL_UNUSED_ARG(data);
3082
3083     /* Be paranoid... */
3084     if (prog == NULL) {
3085         Perl_croak(aTHX_ "NULL regexp parameter");
3086     }
3087
3088     DEBUG_EXECUTE_r(
3089         debug_start_match(rx, utf8_target, stringarg, strend,
3090         "Matching");
3091     );
3092
3093     startpos = stringarg;
3094
3095     /* set these early as they may be used by the HOP macros below */
3096     reginfo->strbeg = strbeg;
3097     reginfo->strend = strend;
3098     reginfo->is_utf8_target = cBOOL(utf8_target);
3099
3100     if (prog->intflags & PREGf_GPOS_SEEN) {
3101         MAGIC *mg;
3102
3103         /* set reginfo->ganch, the position where \G can match */
3104
3105         reginfo->ganch =
3106             (flags & REXEC_IGNOREPOS)
3107             ? stringarg /* use start pos rather than pos() */
3108             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3109               /* Defined pos(): */
3110             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3111             : strbeg; /* pos() not defined; use start of string */
3112
3113         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3114             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3115
3116         /* in the presence of \G, we may need to start looking earlier in
3117          * the string than the suggested start point of stringarg:
3118          * if prog->gofs is set, then that's a known, fixed minimum
3119          * offset, such as
3120          * /..\G/:   gofs = 2
3121          * /ab|c\G/: gofs = 1
3122          * or if the minimum offset isn't known, then we have to go back
3123          * to the start of the string, e.g. /w+\G/
3124          */
3125
3126         if (prog->intflags & PREGf_ANCH_GPOS) {
3127             if (prog->gofs) {
3128                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3129                 if (!startpos ||
3130                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3131                 {
3132                     DEBUG_r(Perl_re_printf( aTHX_
3133                             "fail: ganch-gofs before earliest possible start\n"));
3134                     return 0;
3135                 }
3136             }
3137             else
3138                 startpos = reginfo->ganch;
3139         }
3140         else if (prog->gofs) {
3141             startpos = HOPBACKc(startpos, prog->gofs);
3142             if (!startpos)
3143                 startpos = strbeg;
3144         }
3145         else if (prog->intflags & PREGf_GPOS_FLOAT)
3146             startpos = strbeg;
3147     }
3148
3149     minlen = prog->minlen;
3150     if ((startpos + minlen) > strend || startpos < strbeg) {
3151         DEBUG_r(Perl_re_printf( aTHX_
3152                     "Regex match can't succeed, so not even tried\n"));
3153         return 0;
3154     }
3155
3156     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3157      * which will call destuctors to reset PL_regmatch_state, free higher
3158      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3159      * regmatch_info_aux_eval */
3160
3161     oldsave = PL_savestack_ix;
3162
3163     s = startpos;
3164
3165     if ((prog->extflags & RXf_USE_INTUIT)
3166         && !(flags & REXEC_CHECKED))
3167     {
3168         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3169                                     flags, NULL);
3170         if (!s)
3171             return 0;
3172
3173         if (prog->extflags & RXf_CHECK_ALL) {
3174             /* we can match based purely on the result of INTUIT.
3175              * Set up captures etc just for $& and $-[0]
3176              * (an intuit-only match wont have $1,$2,..) */
3177             assert(!prog->nparens);
3178
3179             /* s/// doesn't like it if $& is earlier than where we asked it to
3180              * start searching (which can happen on something like /.\G/) */
3181             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3182                     && (s < stringarg))
3183             {
3184                 /* this should only be possible under \G */
3185                 assert(prog->intflags & PREGf_GPOS_SEEN);
3186                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3187                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3188                 goto phooey;
3189             }
3190
3191             /* match via INTUIT shouldn't have any captures.
3192              * Let @-, @+, $^N know */
3193             prog->lastparen = prog->lastcloseparen = 0;
3194             RXp_MATCH_UTF8_set(prog, utf8_target);
3195             prog->offs[0].start = s - strbeg;
3196             prog->offs[0].end = utf8_target
3197                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3198                 : s - strbeg + prog->minlenret;
3199             if ( !(flags & REXEC_NOT_FIRST) )
3200                 S_reg_set_capture_string(aTHX_ rx,
3201                                         strbeg, strend,
3202                                         sv, flags, utf8_target);
3203
3204             return 1;
3205         }
3206     }
3207
3208     multiline = prog->extflags & RXf_PMf_MULTILINE;
3209
3210     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3211         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3212                               "String too short [regexec_flags]...\n"));
3213         goto phooey;
3214     }
3215
3216     /* Check validity of program. */
3217     if (UCHARAT(progi->program) != REG_MAGIC) {
3218         Perl_croak(aTHX_ "corrupted regexp program");
3219     }
3220
3221     RXp_MATCH_TAINTED_off(prog);
3222     RXp_MATCH_UTF8_set(prog, utf8_target);
3223
3224     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3225     reginfo->intuit = 0;
3226     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3227     reginfo->warned = FALSE;
3228     reginfo->sv = sv;
3229     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3230     /* see how far we have to get to not match where we matched before */
3231     reginfo->till = stringarg + minend;
3232
3233     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3234         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3235            S_cleanup_regmatch_info_aux has executed (registered by
3236            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3237            magic belonging to this SV.
3238            Not newSVsv, either, as it does not COW.
3239         */
3240         reginfo->sv = newSV(0);
3241         SvSetSV_nosteal(reginfo->sv, sv);
3242         SAVEFREESV(reginfo->sv);
3243     }
3244
3245     /* reserve next 2 or 3 slots in PL_regmatch_state:
3246      * slot N+0: may currently be in use: skip it
3247      * slot N+1: use for regmatch_info_aux struct
3248      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3249      * slot N+3: ready for use by regmatch()
3250      */
3251
3252     {
3253         regmatch_state *old_regmatch_state;
3254         regmatch_slab  *old_regmatch_slab;
3255         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3256
3257         /* on first ever match, allocate first slab */
3258         if (!PL_regmatch_slab) {
3259             Newx(PL_regmatch_slab, 1, regmatch_slab);
3260             PL_regmatch_slab->prev = NULL;
3261             PL_regmatch_slab->next = NULL;
3262             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3263         }
3264
3265         old_regmatch_state = PL_regmatch_state;
3266         old_regmatch_slab  = PL_regmatch_slab;
3267
3268         for (i=0; i <= max; i++) {
3269             if (i == 1)
3270                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3271             else if (i ==2)
3272                 reginfo->info_aux_eval =
3273                 reginfo->info_aux->info_aux_eval =
3274                             &(PL_regmatch_state->u.info_aux_eval);
3275
3276             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3277                 PL_regmatch_state = S_push_slab(aTHX);
3278         }
3279
3280         /* note initial PL_regmatch_state position; at end of match we'll
3281          * pop back to there and free any higher slabs */
3282
3283         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3284         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3285         reginfo->info_aux->poscache = NULL;
3286
3287         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3288
3289         if ((prog->extflags & RXf_EVAL_SEEN))
3290             S_setup_eval_state(aTHX_ reginfo);
3291         else
3292             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3293     }
3294
3295     /* If there is a "must appear" string, look for it. */
3296
3297     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3298         /* We have to be careful. If the previous successful match
3299            was from this regex we don't want a subsequent partially
3300            successful match to clobber the old results.
3301            So when we detect this possibility we add a swap buffer
3302            to the re, and switch the buffer each match. If we fail,
3303            we switch it back; otherwise we leave it swapped.
3304         */
3305         swap = prog->offs;
3306         /* do we need a save destructor here for eval dies? */
3307         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3308         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3309             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3310             0,
3311             PTR2UV(prog),
3312             PTR2UV(swap),
3313             PTR2UV(prog->offs)
3314         ));
3315     }
3316
3317     if (prog->recurse_locinput)
3318         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3319
3320     /* Simplest case: anchored match need be tried only once, or with
3321      * MBOL, only at the beginning of each line.
3322      *
3323      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3324      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3325      * match at the start of the string then it won't match anywhere else
3326      * either; while with /.*.../, if it doesn't match at the beginning,
3327      * the earliest it could match is at the start of the next line */
3328
3329     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3330         char *end;
3331
3332         if (regtry(reginfo, &s))
3333             goto got_it;
3334
3335         if (!(prog->intflags & PREGf_ANCH_MBOL))
3336             goto phooey;
3337
3338         /* didn't match at start, try at other newline positions */
3339
3340         if (minlen)
3341             dontbother = minlen - 1;
3342         end = HOP3c(strend, -dontbother, strbeg) - 1;
3343
3344         /* skip to next newline */
3345
3346         while (s <= end) { /* note it could be possible to match at the end of the string */
3347             /* NB: newlines are the same in unicode as they are in latin */
3348             if (*s++ != '\n')
3349                 continue;
3350             if (prog->check_substr || prog->check_utf8) {
3351             /* note that with PREGf_IMPLICIT, intuit can only fail
3352              * or return the start position, so it's of limited utility.
3353              * Nevertheless, I made the decision that the potential for
3354              * quick fail was still worth it - DAPM */
3355                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3356                 if (!s)
3357                     goto phooey;
3358             }
3359             if (regtry(reginfo, &s))
3360                 goto got_it;
3361         }
3362         goto phooey;
3363     } /* end anchored search */
3364
3365     if (prog->intflags & PREGf_ANCH_GPOS)
3366     {
3367         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3368         assert(prog->intflags & PREGf_GPOS_SEEN);
3369         /* For anchored \G, the only position it can match from is
3370          * (ganch-gofs); we already set startpos to this above; if intuit
3371          * moved us on from there, we can't possibly succeed */
3372         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3373         if (s == startpos && regtry(reginfo, &s))
3374             goto got_it;
3375         goto phooey;
3376     }
3377
3378     /* Messy cases:  unanchored match. */
3379     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3380         /* we have /x+whatever/ */
3381         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3382         char ch;
3383 #ifdef DEBUGGING
3384         int did_match = 0;
3385 #endif
3386         if (utf8_target) {
3387             if (! prog->anchored_utf8) {
3388                 to_utf8_substr(prog);
3389             }
3390             ch = SvPVX_const(prog->anchored_utf8)[0];
3391             REXEC_FBC_SCAN(
3392                 if (*s == ch) {
3393                     DEBUG_EXECUTE_r( did_match = 1 );
3394                     if (regtry(reginfo, &s)) goto got_it;
3395                     s += UTF8SKIP(s);
3396                     while (s < strend && *s == ch)
3397                         s += UTF8SKIP(s);
3398                 }
3399             );
3400
3401         }
3402         else {
3403             if (! prog->anchored_substr) {
3404                 if (! to_byte_substr(prog)) {
3405                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3406                 }
3407             }
3408             ch = SvPVX_const(prog->anchored_substr)[0];
3409             REXEC_FBC_SCAN(
3410                 if (*s == ch) {
3411                     DEBUG_EXECUTE_r( did_match = 1 );
3412                     if (regtry(reginfo, &s)) goto got_it;
3413                     s++;
3414                     while (s < strend && *s == ch)
3415                         s++;
3416                 }
3417             );
3418         }
3419         DEBUG_EXECUTE_r(if (!did_match)
3420                 Perl_re_printf( aTHX_
3421                                   "Did not find anchored character...\n")
3422                );
3423     }
3424     else if (prog->anchored_substr != NULL
3425               || prog->anchored_utf8 != NULL
3426               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3427                   && prog->float_max_offset < strend - s)) {
3428         SV *must;
3429         SSize_t back_max;
3430         SSize_t back_min;
3431         char *last;
3432         char *last1;            /* Last position checked before */
3433 #ifdef DEBUGGING
3434         int did_match = 0;
3435 #endif
3436         if (prog->anchored_substr || prog->anchored_utf8) {
3437             if (utf8_target) {
3438                 if (! prog->anchored_utf8) {
3439                     to_utf8_substr(prog);
3440                 }
3441                 must = prog->anchored_utf8;
3442             }
3443             else {
3444                 if (! prog->anchored_substr) {
3445                     if (! to_byte_substr(prog)) {
3446                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3447                     }
3448                 }
3449                 must = prog->anchored_substr;
3450             }
3451             back_max = back_min = prog->anchored_offset;
3452         } else {
3453             if (utf8_target) {
3454                 if (! prog->float_utf8) {
3455                     to_utf8_substr(prog);
3456                 }
3457                 must = prog->float_utf8;
3458             }
3459             else {
3460                 if (! prog->float_substr) {
3461                     if (! to_byte_substr(prog)) {
3462                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3463                     }
3464                 }
3465                 must = prog->float_substr;
3466             }
3467             back_max = prog->float_max_offset;
3468             back_min = prog->float_min_offset;
3469         }
3470
3471         if (back_min<0) {
3472             last = strend;
3473         } else {
3474             last = HOP3c(strend,        /* Cannot start after this */
3475                   -(SSize_t)(CHR_SVLEN(must)
3476                          - (SvTAIL(must) != 0) + back_min), strbeg);
3477         }
3478         if (s > reginfo->strbeg)
3479             last1 = HOPc(s, -1);
3480         else
3481             last1 = s - 1;      /* bogus */
3482
3483         /* XXXX check_substr already used to find "s", can optimize if
3484            check_substr==must. */
3485         dontbother = 0;
3486         strend = HOPc(strend, -dontbother);
3487         while ( (s <= last) &&
3488                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3489                                   (unsigned char*)strend, must,
3490                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3491             DEBUG_EXECUTE_r( did_match = 1 );
3492             if (HOPc(s, -back_max) > last1) {
3493                 last1 = HOPc(s, -back_min);
3494                 s = HOPc(s, -back_max);
3495             }
3496             else {
3497                 char * const t = (last1 >= reginfo->strbeg)
3498                                     ? HOPc(last1, 1) : last1 + 1;
3499
3500                 last1 = HOPc(s, -back_min);
3501                 s = t;
3502             }
3503             if (utf8_target) {
3504                 while (s <= last1) {
3505                     if (regtry(reginfo, &s))
3506                         goto got_it;
3507                     if (s >= last1) {
3508                         s++; /* to break out of outer loop */
3509                         break;
3510                     }
3511                     s += UTF8SKIP(s);
3512                 }
3513             }
3514             else {
3515                 while (s <= last1) {
3516                     if (regtry(reginfo, &s))
3517                         goto got_it;
3518                     s++;
3519                 }
3520             }
3521         }
3522         DEBUG_EXECUTE_r(if (!did_match) {
3523             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3524                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3525             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3526                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3527                                ? "anchored" : "floating"),
3528                 quoted, RE_SV_TAIL(must));
3529         });
3530         goto phooey;
3531     }
3532     else if ( (c = progi->regstclass) ) {
3533         if (minlen) {
3534             const OPCODE op = OP(progi->regstclass);
3535             /* don't bother with what can't match */
3536             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3537                 strend = HOPc(strend, -(minlen - 1));
3538         }
3539         DEBUG_EXECUTE_r({
3540             SV * const prop = sv_newmortal();
3541             regprop(prog, prop, c, reginfo, NULL);
3542             {
3543                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3544                     s,strend-s,PL_dump_re_max_len);
3545                 Perl_re_printf( aTHX_
3546                     "Matching stclass %.*s against %s (%d bytes)\n",
3547                     (int)SvCUR(prop), SvPVX_const(prop),
3548                      quoted, (int)(strend - s));
3549             }
3550         });
3551         if (find_byclass(prog, c, s, strend, reginfo))
3552             goto got_it;
3553         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3554     }
3555     else {
3556         dontbother = 0;
3557         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3558             /* Trim the end. */
3559             char *last= NULL;
3560             SV* float_real;
3561             STRLEN len;
3562             const char *little;
3563
3564             if (utf8_target) {
3565                 if (! prog->float_utf8) {
3566                     to_utf8_substr(prog);
3567                 }
3568                 float_real = prog->float_utf8;
3569             }
3570             else {
3571                 if (! prog->float_substr) {
3572                     if (! to_byte_substr(prog)) {
3573                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3574                     }
3575                 }
3576                 float_real = prog->float_substr;
3577             }
3578
3579             little = SvPV_const(float_real, len);
3580             if (SvTAIL(float_real)) {
3581                     /* This means that float_real contains an artificial \n on
3582                      * the end due to the presence of something like this:
3583                      * /foo$/ where we can match both "foo" and "foo\n" at the
3584                      * end of the string.  So we have to compare the end of the
3585                      * string first against the float_real without the \n and
3586                      * then against the full float_real with the string.  We
3587                      * have to watch out for cases where the string might be
3588                      * smaller than the float_real or the float_real without
3589                      * the \n. */
3590                     char *checkpos= strend - len;
3591                     DEBUG_OPTIMISE_r(
3592                         Perl_re_printf( aTHX_
3593                             "%sChecking for float_real.%s\n",
3594                             PL_colors[4], PL_colors[5]));
3595                     if (checkpos + 1 < strbeg) {
3596                         /* can't match, even if we remove the trailing \n
3597                          * string is too short to match */
3598                         DEBUG_EXECUTE_r(
3599                             Perl_re_printf( aTHX_
3600                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3601                                 PL_colors[4], PL_colors[5]));
3602                         goto phooey;
3603                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3604                         /* can match, the end of the string matches without the
3605                          * "\n" */
3606                         last = checkpos + 1;
3607                     } else if (checkpos < strbeg) {
3608                         /* cant match, string is too short when the "\n" is
3609                          * included */
3610                         DEBUG_EXECUTE_r(
3611                             Perl_re_printf( aTHX_
3612                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3613                                 PL_colors[4], PL_colors[5]));
3614                         goto phooey;
3615                     } else if (!multiline) {
3616                         /* non multiline match, so compare with the "\n" at the
3617                          * end of the string */
3618                         if (memEQ(checkpos, little, len)) {
3619                             last= checkpos;
3620                         } else {
3621                             DEBUG_EXECUTE_r(
3622                                 Perl_re_printf( aTHX_
3623                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3624                                     PL_colors[4], PL_colors[5]));
3625                             goto phooey;
3626                         }
3627                     } else {
3628                         /* multiline match, so we have to search for a place
3629                          * where the full string is located */
3630                         goto find_last;
3631                     }
3632             } else {
3633                   find_last:
3634                     if (len)
3635                         last = rninstr(s, strend, little, little + len);
3636                     else
3637                         last = strend;  /* matching "$" */
3638             }
3639             if (!last) {
3640                 /* at one point this block contained a comment which was
3641                  * probably incorrect, which said that this was a "should not
3642                  * happen" case.  Even if it was true when it was written I am
3643                  * pretty sure it is not anymore, so I have removed the comment
3644                  * and replaced it with this one. Yves */
3645                 DEBUG_EXECUTE_r(
3646                     Perl_re_printf( aTHX_
3647                         "%sString does not contain required substring, cannot match.%s\n",
3648                         PL_colors[4], PL_colors[5]
3649                     ));
3650                 goto phooey;
3651             }
3652             dontbother = strend - last + prog->float_min_offset;
3653         }
3654         if (minlen && (dontbother < minlen))
3655             dontbother = minlen - 1;
3656         strend -= dontbother;              /* this one's always in bytes! */
3657         /* We don't know much -- general case. */
3658         if (utf8_target) {
3659             for (;;) {
3660                 if (regtry(reginfo, &s))
3661                     goto got_it;
3662                 if (s >= strend)
3663                     break;
3664                 s += UTF8SKIP(s);
3665             };
3666         }
3667         else {
3668             do {
3669                 if (regtry(reginfo, &s))
3670                     goto got_it;
3671             } while (s++ < strend);
3672         }
3673     }
3674
3675     /* Failure. */
3676     goto phooey;
3677
3678   got_it:
3679     /* s/// doesn't like it if $& is earlier than where we asked it to
3680      * start searching (which can happen on something like /.\G/) */
3681     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3682             && (prog->offs[0].start < stringarg - strbeg))
3683     {
3684         /* this should only be possible under \G */
3685         assert(prog->intflags & PREGf_GPOS_SEEN);
3686         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3687             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3688         goto phooey;
3689     }
3690
3691     DEBUG_BUFFERS_r(
3692         if (swap)
3693             Perl_re_exec_indentf( aTHX_
3694                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3695                 0,
3696                 PTR2UV(prog),
3697                 PTR2UV(swap)
3698             );
3699     );
3700     Safefree(swap);
3701
3702     /* clean up; this will trigger destructors that will free all slabs
3703      * above the current one, and cleanup the regmatch_info_aux
3704      * and regmatch_info_aux_eval sructs */
3705
3706     LEAVE_SCOPE(oldsave);
3707
3708     if (RXp_PAREN_NAMES(prog))
3709         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3710
3711     /* make sure $`, $&, $', and $digit will work later */
3712     if ( !(flags & REXEC_NOT_FIRST) )
3713         S_reg_set_capture_string(aTHX_ rx,
3714                                     strbeg, reginfo->strend,
3715                                     sv, flags, utf8_target);
3716
3717     return 1;
3718
3719   phooey:
3720     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3721                           PL_colors[4], PL_colors[5]));
3722
3723     /* clean up; this will trigger destructors that will free all slabs
3724      * above the current one, and cleanup the regmatch_info_aux
3725      * and regmatch_info_aux_eval sructs */
3726
3727     LEAVE_SCOPE(oldsave);
3728
3729     if (swap) {
3730         /* we failed :-( roll it back */
3731         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3732             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3733             0,
3734             PTR2UV(prog),
3735             PTR2UV(prog->offs),
3736             PTR2UV(swap)
3737         ));
3738         Safefree(prog->offs);
3739         prog->offs = swap;
3740     }
3741     return 0;
3742 }
3743
3744
3745 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3746  * Do inc before dec, in case old and new rex are the same */
3747 #define SET_reg_curpm(Re2)                          \
3748     if (reginfo->info_aux_eval) {                   \
3749         (void)ReREFCNT_inc(Re2);                    \
3750         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3751         PM_SETRE((PL_reg_curpm), (Re2));            \
3752     }
3753
3754
3755 /*
3756  - regtry - try match at specific point
3757  */
3758 STATIC bool                     /* 0 failure, 1 success */
3759 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3760 {
3761     CHECKPOINT lastcp;
3762     REGEXP *const rx = reginfo->prog;
3763     regexp *const prog = ReANY(rx);
3764     SSize_t result;
3765 #ifdef DEBUGGING
3766     U32 depth = 0; /* used by REGCP_SET */
3767 #endif
3768     RXi_GET_DECL(prog,progi);
3769     GET_RE_DEBUG_FLAGS_DECL;
3770
3771     PERL_ARGS_ASSERT_REGTRY;
3772
3773     reginfo->cutpoint=NULL;
3774
3775     prog->offs[0].start = *startposp - reginfo->strbeg;
3776     prog->lastparen = 0;
3777     prog->lastcloseparen = 0;
3778
3779     /* XXXX What this code is doing here?!!!  There should be no need
3780        to do this again and again, prog->lastparen should take care of
3781        this!  --ilya*/
3782
3783     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3784      * Actually, the code in regcppop() (which Ilya may be meaning by
3785      * prog->lastparen), is not needed at all by the test suite
3786      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3787      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3788      * Meanwhile, this code *is* needed for the
3789      * above-mentioned test suite tests to succeed.  The common theme
3790      * on those tests seems to be returning null fields from matches.
3791      * --jhi updated by dapm */
3792
3793     /* After encountering a variant of the issue mentioned above I think
3794      * the point Ilya was making is that if we properly unwind whenever
3795      * we set lastparen to a smaller value then we should not need to do
3796      * this every time, only when needed. So if we have tests that fail if
3797      * we remove this, then it suggests somewhere else we are improperly
3798      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3799      * places it is called, and related regcp() routines. - Yves */
3800 #if 1
3801     if (prog->nparens) {
3802         regexp_paren_pair *pp = prog->offs;
3803         I32 i;
3804         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3805             ++pp;
3806             pp->start = -1;
3807             pp->end = -1;
3808         }
3809     }
3810 #endif
3811     REGCP_SET(lastcp);
3812     result = regmatch(reginfo, *startposp, progi->program + 1);
3813     if (result != -1) {
3814         prog->offs[0].end = result;
3815         return 1;
3816     }
3817     if (reginfo->cutpoint)
3818         *startposp= reginfo->cutpoint;
3819     REGCP_UNWIND(lastcp);
3820     return 0;
3821 }
3822
3823
3824 #define sayYES goto yes
3825 #define sayNO goto no
3826 #define sayNO_SILENT goto no_silent
3827
3828 /* we dont use STMT_START/END here because it leads to
3829    "unreachable code" warnings, which are bogus, but distracting. */
3830 #define CACHEsayNO \
3831     if (ST.cache_mask) \
3832        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3833     sayNO
3834
3835 /* this is used to determine how far from the left messages like
3836    'failed...' are printed in regexec.c. It should be set such that
3837    messages are inline with the regop output that created them.
3838 */
3839 #define REPORT_CODE_OFF 29
3840 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3841 #ifdef DEBUGGING
3842 int
3843 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3844 {
3845     va_list ap;
3846     int result;
3847     PerlIO *f= Perl_debug_log;
3848     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3849     va_start(ap, depth);
3850     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3851     result = PerlIO_vprintf(f, fmt, ap);
3852     va_end(ap);
3853     return result;
3854 }
3855 #endif /* DEBUGGING */
3856
3857
3858 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3859 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3860 #define CHRTEST_NOT_A_CP_1 -999
3861 #define CHRTEST_NOT_A_CP_2 -998
3862
3863 /* grab a new slab and return the first slot in it */
3864
3865 STATIC regmatch_state *
3866 S_push_slab(pTHX)
3867 {
3868     regmatch_slab *s = PL_regmatch_slab->next;
3869     if (!s) {
3870         Newx(s, 1, regmatch_slab);
3871         s->prev = PL_regmatch_slab;
3872         s->next = NULL;
3873         PL_regmatch_slab->next = s;
3874     }
3875     PL_regmatch_slab = s;
3876     return SLAB_FIRST(s);
3877 }
3878
3879
3880 /* push a new state then goto it */
3881
3882 #define PUSH_STATE_GOTO(state, node, input) \
3883     pushinput = input; \
3884     scan = node; \
3885     st->resume_state = state; \
3886     goto push_state;
3887
3888 /* push a new state with success backtracking, then goto it */
3889
3890 #define PUSH_YES_STATE_GOTO(state, node, input) \
3891     pushinput = input; \
3892     scan = node; \
3893     st->resume_state = state; \
3894     goto push_yes_state;
3895
3896
3897
3898
3899 /*
3900
3901 regmatch() - main matching routine
3902
3903 This is basically one big switch statement in a loop. We execute an op,
3904 set 'next' to point the next op, and continue. If we come to a point which
3905 we may need to backtrack to on failure such as (A|B|C), we push a
3906 backtrack state onto the backtrack stack. On failure, we pop the top
3907 state, and re-enter the loop at the state indicated. If there are no more
3908 states to pop, we return failure.
3909
3910 Sometimes we also need to backtrack on success; for example /A+/, where
3911 after successfully matching one A, we need to go back and try to
3912 match another one; similarly for lookahead assertions: if the assertion
3913 completes successfully, we backtrack to the state just before the assertion
3914 and then carry on.  In these cases, the pushed state is marked as
3915 'backtrack on success too'. This marking is in fact done by a chain of
3916 pointers, each pointing to the previous 'yes' state. On success, we pop to
3917 the nearest yes state, discarding any intermediate failure-only states.
3918 Sometimes a yes state is pushed just to force some cleanup code to be
3919 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3920 it to free the inner regex.
3921
3922 Note that failure backtracking rewinds the cursor position, while
3923 success backtracking leaves it alone.
3924
3925 A pattern is complete when the END op is executed, while a subpattern
3926 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3927 ops trigger the "pop to last yes state if any, otherwise return true"
3928 behaviour.
3929
3930 A common convention in this function is to use A and B to refer to the two
3931 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3932 the subpattern to be matched possibly multiple times, while B is the entire
3933 rest of the pattern. Variable and state names reflect this convention.
3934
3935 The states in the main switch are the union of ops and failure/success of
3936 substates associated with with that op.  For example, IFMATCH is the op
3937 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3938 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3939 successfully matched A and IFMATCH_A_fail is a state saying that we have
3940 just failed to match A. Resume states always come in pairs. The backtrack
3941 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3942 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3943 on success or failure.
3944
3945 The struct that holds a backtracking state is actually a big union, with
3946 one variant for each major type of op. The variable st points to the
3947 top-most backtrack struct. To make the code clearer, within each
3948 block of code we #define ST to alias the relevant union.
3949
3950 Here's a concrete example of a (vastly oversimplified) IFMATCH
3951 implementation:
3952
3953     switch (state) {
3954     ....
3955
3956 #define ST st->u.ifmatch
3957
3958     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3959         ST.foo = ...; // some state we wish to save
3960         ...
3961         // push a yes backtrack state with a resume value of
3962         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3963         // first node of A:
3964         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3965         // NOTREACHED
3966
3967     case IFMATCH_A: // we have successfully executed A; now continue with B
3968         next = B;
3969         bar = ST.foo; // do something with the preserved value
3970         break;
3971
3972     case IFMATCH_A_fail: // A failed, so the assertion failed
3973         ...;   // do some housekeeping, then ...
3974         sayNO; // propagate the failure
3975
3976 #undef ST
3977
3978     ...
3979     }
3980
3981 For any old-timers reading this who are familiar with the old recursive
3982 approach, the code above is equivalent to:
3983
3984     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3985     {
3986         int foo = ...
3987         ...
3988         if (regmatch(A)) {
3989             next = B;
3990             bar = foo;
3991             break;
3992         }
3993         ...;   // do some housekeeping, then ...
3994         sayNO; // propagate the failure
3995     }
3996
3997 The topmost backtrack state, pointed to by st, is usually free. If you
3998 want to claim it, populate any ST.foo fields in it with values you wish to
3999 save, then do one of
4000
4001         PUSH_STATE_GOTO(resume_state, node, newinput);
4002         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
4003
4004 which sets that backtrack state's resume value to 'resume_state', pushes a
4005 new free entry to the top of the backtrack stack, then goes to 'node'.
4006 On backtracking, the free slot is popped, and the saved state becomes the
4007 new free state. An ST.foo field in this new top state can be temporarily
4008 accessed to retrieve values, but once the main loop is re-entered, it
4009 becomes available for reuse.
4010
4011 Note that the depth of the backtrack stack constantly increases during the
4012 left-to-right execution of the pattern, rather than going up and down with
4013 the pattern nesting. For example the stack is at its maximum at Z at the
4014 end of the pattern, rather than at X in the following:
4015
4016     /(((X)+)+)+....(Y)+....Z/
4017
4018 The only exceptions to this are lookahead/behind assertions and the cut,
4019 (?>A), which pop all the backtrack states associated with A before
4020 continuing.
4021
4022 Backtrack state structs are allocated in slabs of about 4K in size.
4023 PL_regmatch_state and st always point to the currently active state,
4024 and PL_regmatch_slab points to the slab currently containing
4025 PL_regmatch_state.  The first time regmatch() is called, the first slab is
4026 allocated, and is never freed until interpreter destruction. When the slab
4027 is full, a new one is allocated and chained to the end. At exit from
4028 regmatch(), slabs allocated since entry are freed.
4029
4030 */
4031
4032
4033 #define DEBUG_STATE_pp(pp)                                  \
4034     DEBUG_STATE_r({                                         \
4035         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
4036         Perl_re_printf( aTHX_                                           \
4037             "%*s" pp " %s%s%s%s%s\n",                       \
4038             INDENT_CHARS(depth), "",                        \
4039             PL_reg_name[st->resume_state],                  \
4040             ((st==yes_state||st==mark_state) ? "[" : ""),   \
4041             ((st==yes_state) ? "Y" : ""),                   \
4042             ((st==mark_state) ? "M" : ""),                  \
4043             ((st==yes_state||st==mark_state) ? "]" : "")    \
4044         );                                                  \
4045     });
4046
4047
4048 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4049
4050 #ifdef DEBUGGING
4051
4052 STATIC void
4053 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4054     const char *start, const char *end, const char *blurb)
4055 {
4056     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4057
4058     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4059
4060     if (!PL_colorset)
4061             reginitcolors();
4062     {
4063         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4064             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4065
4066         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4067             start, end - start, PL_dump_re_max_len);
4068
4069         Perl_re_printf( aTHX_
4070             "%s%s REx%s %s against %s\n",
4071                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4072
4073         if (utf8_target||utf8_pat)
4074             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4075                 utf8_pat ? "pattern" : "",
4076                 utf8_pat && utf8_target ? " and " : "",
4077                 utf8_target ? "string" : ""
4078             );
4079     }
4080 }
4081
4082 STATIC void
4083 S_dump_exec_pos(pTHX_ const char *locinput,
4084                       const regnode *scan,
4085                       const char *loc_regeol,
4086                       const char *loc_bostr,
4087                       const char *loc_reg_starttry,
4088                       const bool utf8_target,
4089                       const U32 depth
4090                 )
4091 {
4092     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4093     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4094     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4095     /* The part of the string before starttry has one color
4096        (pref0_len chars), between starttry and current
4097        position another one (pref_len - pref0_len chars),
4098        after the current position the third one.
4099        We assume that pref0_len <= pref_len, otherwise we
4100        decrease pref0_len.  */
4101     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4102         ? (5 + taill) - l : locinput - loc_bostr;
4103     int pref0_len;
4104
4105     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4106
4107     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4108         pref_len++;
4109     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4110     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4111         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4112               ? (5 + taill) - pref_len : loc_regeol - locinput);
4113     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4114         l--;
4115     if (pref0_len < 0)
4116         pref0_len = 0;
4117     if (pref0_len > pref_len)
4118         pref0_len = pref_len;
4119     {
4120         const int is_uni = utf8_target ? 1 : 0;
4121
4122         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4123             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4124
4125         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4126                     (locinput - pref_len + pref0_len),
4127                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4128
4129         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4130                     locinput, loc_regeol - locinput, 10, 0, 1);
4131
4132         const STRLEN tlen=len0+len1+len2;
4133         Perl_re_printf( aTHX_
4134                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4135                     (IV)(locinput - loc_bostr),
4136                     len0, s0,
4137                     len1, s1,
4138                     (docolor ? "" : "> <"),
4139                     len2, s2,
4140                     (int)(tlen > 19 ? 0 :  19 - tlen),
4141                     "",
4142                     depth);
4143     }
4144 }
4145
4146 #endif
4147
4148 /* reg_check_named_buff_matched()
4149  * Checks to see if a named buffer has matched. The data array of
4150  * buffer numbers corresponding to the buffer is expected to reside
4151  * in the regexp->data->data array in the slot stored in the ARG() of
4152  * node involved. Note that this routine doesn't actually care about the
4153  * name, that information is not preserved from compilation to execution.
4154  * Returns the index of the leftmost defined buffer with the given name
4155  * or 0 if non of the buffers matched.
4156  */
4157 STATIC I32
4158 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4159 {
4160     I32 n;
4161     RXi_GET_DECL(rex,rexi);
4162     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4163     I32 *nums=(I32*)SvPVX(sv_dat);
4164
4165     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4166
4167     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4168         if ((I32)rex->lastparen >= nums[n] &&
4169             rex->offs[nums[n]].end != -1)
4170         {
4171             return nums[n];
4172         }
4173     }
4174     return 0;
4175 }
4176
4177
4178 static bool
4179 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4180         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4181 {
4182     /* This function determines if there are one or two characters that match
4183      * the first character of the passed-in EXACTish node <text_node>, and if
4184      * so, returns them in the passed-in pointers.
4185      *
4186      * If it determines that no possible character in the target string can
4187      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4188      * the first character in <text_node> requires UTF-8 to represent, and the
4189      * target string isn't in UTF-8.)
4190      *
4191      * If there are more than two characters that could match the beginning of
4192      * <text_node>, or if more context is required to determine a match or not,
4193      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4194      *
4195      * The motiviation behind this function is to allow the caller to set up
4196      * tight loops for matching.  If <text_node> is of type EXACT, there is
4197      * only one possible character that can match its first character, and so
4198      * the situation is quite simple.  But things get much more complicated if
4199      * folding is involved.  It may be that the first character of an EXACTFish
4200      * node doesn't participate in any possible fold, e.g., punctuation, so it
4201      * can be matched only by itself.  The vast majority of characters that are
4202      * in folds match just two things, their lower and upper-case equivalents.
4203      * But not all are like that; some have multiple possible matches, or match
4204      * sequences of more than one character.  This function sorts all that out.
4205      *
4206      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4207      * loop of trying to match A*, we know we can't exit where the thing
4208      * following it isn't a B.  And something can't be a B unless it is the
4209      * beginning of B.  By putting a quick test for that beginning in a tight
4210      * loop, we can rule out things that can't possibly be B without having to
4211      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4212      * character, we can make a tight loop matching A*, using the outputs of
4213      * this function.
4214      *
4215      * If the target string to match isn't in UTF-8, and there aren't
4216      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4217      * the one or two possible octets (which are characters in this situation)
4218      * that can match.  In all cases, if there is only one character that can
4219      * match, *<c1p> and *<c2p> will be identical.
4220      *
4221      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4222      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4223      * can match the beginning of <text_node>.  They should be declared with at
4224      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4225      * undefined what these contain.)  If one or both of the buffers are
4226      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4227      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4228      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4229      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4230      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4231
4232     const bool utf8_target = reginfo->is_utf8_target;
4233
4234     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4235     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4236     bool use_chrtest_void = FALSE;
4237     const bool is_utf8_pat = reginfo->is_utf8_pat;
4238
4239     /* Used when we have both utf8 input and utf8 output, to avoid converting
4240      * to/from code points */
4241     bool utf8_has_been_setup = FALSE;
4242
4243     dVAR;
4244
4245     U8 *pat = (U8*)STRING(text_node);
4246     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4247
4248     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4249
4250         /* In an exact node, only one thing can be matched, that first
4251          * character.  If both the pat and the target are UTF-8, we can just
4252          * copy the input to the output, avoiding finding the code point of
4253          * that character */
4254         if (!is_utf8_pat) {
4255             c2 = c1 = *pat;
4256         }
4257         else if (utf8_target) {
4258             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4259             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4260             utf8_has_been_setup = TRUE;
4261         }
4262         else {
4263             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4264         }
4265     }
4266     else { /* an EXACTFish node */
4267         U8 *pat_end = pat + STR_LEN(text_node);
4268
4269         /* An EXACTFL node has at least some characters unfolded, because what
4270          * they match is not known until now.  So, now is the time to fold
4271          * the first few of them, as many as are needed to determine 'c1' and
4272          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4273          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4274          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4275          * need to fold as many characters as a single character can fold to,
4276          * so that later we can check if the first ones are such a multi-char
4277          * fold.  But, in such a pattern only locale-problematic characters
4278          * aren't folded, so we can skip this completely if the first character
4279          * in the node isn't one of the tricky ones */
4280         if (OP(text_node) == EXACTFL) {
4281
4282             if (! is_utf8_pat) {
4283                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4284                 {
4285                     folded[0] = folded[1] = 's';
4286                     pat = folded;
4287                     pat_end = folded + 2;
4288                 }
4289             }
4290             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4291                 U8 *s = pat;
4292                 U8 *d = folded;
4293                 int i;
4294
4295                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4296                     if (isASCII(*s)) {
4297                         *(d++) = (U8) toFOLD_LC(*s);
4298                         s++;
4299                     }
4300                     else {
4301                         STRLEN len;
4302                         _toFOLD_utf8_flags(s,
4303                                            pat_end,
4304                                            d,
4305                                            &len,
4306                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4307                         d += len;
4308                         s += UTF8SKIP(s);
4309                     }
4310                 }
4311
4312                 pat = folded;
4313                 pat_end = d;
4314             }
4315         }
4316
4317         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4318              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4319         {
4320             /* Multi-character folds require more context to sort out.  Also
4321              * PL_utf8_foldclosures used below doesn't handle them, so have to
4322              * be handled outside this routine */
4323             use_chrtest_void = TRUE;
4324         }
4325         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4326             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4327             if (c1 > 255) {
4328                 /* Load the folds hash, if not already done */
4329                 SV** listp;
4330                 if (! PL_utf8_foldclosures) {
4331                     _load_PL_utf8_foldclosures();
4332                 }
4333
4334                 /* The fold closures data structure is a hash with the keys
4335                  * being the UTF-8 of every character that is folded to, like
4336                  * 'k', and the values each an array of all code points that
4337                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4338                  * Multi-character folds are not included */
4339                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4340                                         (char *) pat,
4341                                         UTF8SKIP(pat),
4342                                         FALSE))))
4343                 {
4344                     /* Not found in the hash, therefore there are no folds
4345                     * containing it, so there is only a single character that
4346                     * could match */
4347                     c2 = c1;
4348                 }
4349                 else {  /* Does participate in folds */
4350                     AV* list = (AV*) *listp;
4351                     if (av_tindex_skip_len_mg(list) != 1) {
4352
4353                         /* If there aren't exactly two folds to this, it is
4354                          * outside the scope of this function */
4355                         use_chrtest_void = TRUE;
4356                     }
4357                     else {  /* There are two.  Get them */
4358                         SV** c_p = av_fetch(list, 0, FALSE);
4359                         if (c_p == NULL) {
4360                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4361                         }
4362                         c1 = SvUV(*c_p);
4363
4364                         c_p = av_fetch(list, 1, FALSE);
4365                         if (c_p == NULL) {
4366                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4367                         }
4368                         c2 = SvUV(*c_p);
4369
4370                         /* Folds that cross the 255/256 boundary are forbidden
4371                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4372                          * one is ASCIII.  Since the pattern character is above
4373                          * 255, and its only other match is below 256, the only
4374                          * legal match will be to itself.  We have thrown away
4375                          * the original, so have to compute which is the one
4376                          * above 255. */
4377                         if ((c1 < 256) != (c2 < 256)) {
4378                             if ((OP(text_node) == EXACTFL
4379                                  && ! IN_UTF8_CTYPE_LOCALE)
4380                                 || ((OP(text_node) == EXACTFA
4381                                     || OP(text_node) == EXACTFA_NO_TRIE)
4382                                     && (isASCII(c1) || isASCII(c2))))
4383                             {
4384                                 if (c1 < 256) {
4385                                     c1 = c2;
4386                                 }
4387                                 else {
4388                                     c2 = c1;
4389                                 }
4390                             }
4391                         }
4392                     }
4393                 }
4394             }
4395             else /* Here, c1 is <= 255 */
4396                 if (utf8_target
4397                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4398                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4399                     && ((OP(text_node) != EXACTFA
4400                         && OP(text_node) != EXACTFA_NO_TRIE)
4401                         || ! isASCII(c1)))
4402             {
4403                 /* Here, there could be something above Latin1 in the target
4404                  * which folds to this character in the pattern.  All such
4405                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4406                  * than two characters involved in their folds, so are outside
4407                  * the scope of this function */
4408                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4409                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4410                 }
4411                 else {
4412                     use_chrtest_void = TRUE;
4413                 }
4414             }
4415             else { /* Here nothing above Latin1 can fold to the pattern
4416                       character */
4417                 switch (OP(text_node)) {
4418
4419                     case EXACTFL:   /* /l rules */
4420                         c2 = PL_fold_locale[c1];
4421                         break;
4422
4423                     case EXACTF:   /* This node only generated for non-utf8
4424                                     patterns */
4425                         assert(! is_utf8_pat);
4426                         if (! utf8_target) {    /* /d rules */
4427                             c2 = PL_fold[c1];
4428                             break;
4429                         }
4430                         /* FALLTHROUGH */
4431                         /* /u rules for all these.  This happens to work for
4432                         * EXACTFA as nothing in Latin1 folds to ASCII */
4433                     case EXACTFA_NO_TRIE:   /* This node only generated for
4434                                             non-utf8 patterns */
4435                         assert(! is_utf8_pat);
4436                         /* FALLTHROUGH */
4437                     case EXACTFA:
4438                     case EXACTFU_SS:
4439                     case EXACTFU:
4440                         c2 = PL_fold_latin1[c1];
4441                         break;
4442
4443                     default:
4444                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4445                         NOT_REACHED; /* NOTREACHED */
4446                 }
4447             }
4448         }
4449     }
4450
4451     /* Here have figured things out.  Set up the returns */
4452     if (use_chrtest_void) {
4453         *c2p = *c1p = CHRTEST_VOID;
4454     }
4455     else if (utf8_target) {
4456         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4457             uvchr_to_utf8(c1_utf8, c1);
4458             uvchr_to_utf8(c2_utf8, c2);
4459         }
4460
4461         /* Invariants are stored in both the utf8 and byte outputs; Use
4462          * negative numbers otherwise for the byte ones.  Make sure that the
4463          * byte ones are the same iff the utf8 ones are the same */
4464         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4465         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4466                 ? *c2_utf8
4467                 : (c1 == c2)
4468                   ? CHRTEST_NOT_A_CP_1
4469                   : CHRTEST_NOT_A_CP_2;
4470     }
4471     else if (c1 > 255) {
4472        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4473                            can represent */
4474            return FALSE;
4475        }
4476
4477        *c1p = *c2p = c2;    /* c2 is the only representable value */
4478     }
4479     else {  /* c1 is representable; see about c2 */
4480        *c1p = c1;
4481        *c2p = (c2 < 256) ? c2 : c1;
4482     }
4483
4484     return TRUE;
4485 }
4486
4487 STATIC bool
4488 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4489 {
4490     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4491      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4492
4493     PERL_ARGS_ASSERT_ISGCB;
4494
4495     switch (GCB_table[before][after]) {
4496         case GCB_BREAKABLE:
4497             return TRUE;
4498
4499         case GCB_NOBREAK:
4500             return FALSE;
4501
4502         case GCB_RI_then_RI:
4503             {
4504                 int RI_count = 1;
4505                 U8 * temp_pos = (U8 *) curpos;
4506
4507                 /* Do not break within emoji flag sequences. That is, do not
4508                  * break between regional indicator (RI) symbols if there is an
4509                  * odd number of RI characters before the break point.
4510                  *  GB12   sot (RI RI)* RI × RI
4511                  *  GB13 [^RI] (RI RI)* RI × RI */
4512
4513                 while (backup_one_GCB(strbeg,
4514                                     &temp_pos,
4515                                     utf8_target) == GCB_Regional_Indicator)
4516                 {
4517                     RI_count++;
4518                 }
4519
4520                 return RI_count % 2 != 1;
4521             }
4522
4523         case GCB_EX_then_EM:
4524
4525             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4526             {
4527                 U8 * temp_pos = (U8 *) curpos;
4528                 GCB_enum prev;
4529
4530                 do {
4531                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4532                 }
4533                 while (prev == GCB_Extend);
4534
4535                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4536             }
4537
4538         default:
4539             break;
4540     }
4541
4542 #ifdef DEBUGGING
4543     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4544                                   before, after, GCB_table[before][after]);
4545     assert(0);
4546 #endif
4547     return TRUE;
4548 }
4549
4550 STATIC GCB_enum
4551 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4552 {
4553     GCB_enum gcb;
4554
4555     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4556
4557     if (*curpos < strbeg) {
4558         return GCB_EDGE;
4559     }
4560
4561     if (utf8_target) {
4562         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4563         U8 * prev_prev_char_pos;
4564
4565         if (! prev_char_pos) {
4566             return GCB_EDGE;
4567         }
4568
4569         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4570             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4571             *curpos = prev_char_pos;
4572             prev_char_pos = prev_prev_char_pos;
4573         }
4574         else {
4575             *curpos = (U8 *) strbeg;
4576             return GCB_EDGE;
4577         }
4578     }
4579     else {
4580         if (*curpos - 2 < strbeg) {
4581             *curpos = (U8 *) strbeg;
4582             return GCB_EDGE;
4583         }
4584         (*curpos)--;
4585         gcb = getGCB_VAL_CP(*(*curpos - 1));
4586     }
4587
4588     return gcb;
4589 }
4590
4591 /* Combining marks attach to most classes that precede them, but this defines
4592  * the exceptions (from TR14) */
4593 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4594                                      || prev == LB_Mandatory_Break      \
4595                                      || prev == LB_Carriage_Return      \
4596                                      || prev == LB_Line_Feed            \
4597                                      || prev == LB_Next_Line            \
4598                                      || prev == LB_Space                \
4599                                      || prev == LB_ZWSpace))
4600
4601 STATIC bool
4602 S_isLB(pTHX_ LB_enum before,
4603              LB_enum after,
4604              const U8 * const strbeg,
4605              const U8 * const curpos,
4606              const U8 * const strend,
4607              const bool utf8_target)
4608 {
4609     U8 * temp_pos = (U8 *) curpos;
4610     LB_enum prev = before;
4611
4612     /* Is the boundary between 'before' and 'after' line-breakable?
4613      * Most of this is just a table lookup of a generated table from Unicode
4614      * rules.  But some rules require context to decide, and so have to be
4615      * implemented in code */
4616
4617     PERL_ARGS_ASSERT_ISLB;
4618
4619     /* Rule numbers in the comments below are as of Unicode 9.0 */
4620
4621   redo:
4622     before = prev;
4623     switch (LB_table[before][after]) {
4624         case LB_BREAKABLE:
4625             return TRUE;
4626
4627         case LB_NOBREAK:
4628         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4629             return FALSE;
4630
4631         case LB_SP_foo + LB_BREAKABLE:
4632         case LB_SP_foo + LB_NOBREAK:
4633         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4634
4635             /* When we have something following a SP, we have to look at the
4636              * context in order to know what to do.
4637              *
4638              * SP SP should not reach here because LB7: Do not break before
4639              * spaces.  (For two spaces in a row there is nothing that
4640              * overrides that) */
4641             assert(after != LB_Space);
4642
4643             /* Here we have a space followed by a non-space.  Mostly this is a
4644              * case of LB18: "Break after spaces".  But there are complications
4645              * as the handling of spaces is somewhat tricky.  They are in a
4646              * number of rules, which have to be applied in priority order, but
4647              * something earlier in the string can cause a rule to be skipped
4648              * and a lower priority rule invoked.  A prime example is LB7 which
4649              * says don't break before a space.  But rule LB8 (lower priority)
4650              * says that the first break opportunity after a ZW is after any
4651              * span of spaces immediately after it.  If a ZW comes before a SP
4652              * in the input, rule LB8 applies, and not LB7.  Other such rules
4653              * involve combining marks which are rules 9 and 10, but they may
4654              * override higher priority rules if they come earlier in the
4655              * string.  Since we're doing random access into the middle of the
4656              * string, we have to look for rules that should get applied based
4657              * on both string position and priority.  Combining marks do not
4658              * attach to either ZW nor SP, so we don't have to consider them
4659              * until later.
4660              *
4661              * To check for LB8, we have to find the first non-space character
4662              * before this span of spaces */
4663             do {
4664                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4665             }
4666             while (prev == LB_Space);
4667
4668             /* LB8 Break before any character following a zero-width space,
4669              * even if one or more spaces intervene.
4670              *      ZW SP* ÷
4671              * So if we have a ZW just before this span, and to get here this
4672              * is the final space in the span. */
4673             if (prev == LB_ZWSpace) {
4674                 return TRUE;
4675             }
4676
4677             /* Here, not ZW SP+.  There are several rules that have higher
4678              * priority than LB18 and can be resolved now, as they don't depend
4679              * on anything earlier in the string (except ZW, which we have
4680              * already handled).  One of these rules is LB11 Do not break
4681              * before Word joiner, but we have specially encoded that in the
4682              * lookup table so it is caught by the single test below which
4683              * catches the other ones. */
4684             if (LB_table[LB_Space][after] - LB_SP_foo
4685                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4686             {
4687                 return FALSE;
4688             }
4689
4690             /* If we get here, we have to XXX consider combining marks. */
4691             if (prev == LB_Combining_Mark) {
4692
4693                 /* What happens with these depends on the character they
4694                  * follow.  */
4695                 do {
4696                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4697                 }
4698                 while (prev == LB_Combining_Mark);
4699
4700                 /* Most times these attach to and inherit the characteristics
4701                  * of that character, but not always, and when not, they are to
4702                  * be treated as AL by rule LB10. */
4703                 if (! LB_CM_ATTACHES_TO(prev)) {
4704                     prev = LB_Alphabetic;
4705                 }
4706             }
4707
4708             /* Here, we have the character preceding the span of spaces all set
4709              * up.  We follow LB18: "Break after spaces" unless the table shows
4710              * that is overriden */
4711             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4712
4713         case LB_CM_ZWJ_foo:
4714
4715             /* We don't know how to treat the CM except by looking at the first
4716              * non-CM character preceding it.  ZWJ is treated as CM */
4717             do {
4718                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4719             }
4720             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4721
4722             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4723              * attatches to it, then it inherits the behavior of 'prev'.  If it
4724              * doesn't attach, it is to be treated as an AL */
4725             if (! LB_CM_ATTACHES_TO(prev)) {
4726                 prev = LB_Alphabetic;
4727             }
4728
4729             goto redo;
4730
4731         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4732         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4733
4734             /* LB21a Don't break after Hebrew + Hyphen.
4735              * HL (HY | BA) × */
4736
4737             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4738                                                           == LB_Hebrew_Letter)
4739             {
4740                 return FALSE;
4741             }
4742
4743             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4744
4745         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4746         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4747
4748             /* LB25a (PR | PO) × ( OP | HY )? NU */
4749             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4750                 return FALSE;
4751             }
4752
4753             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4754                                                                 == LB_BREAKABLE;
4755
4756         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4757         case LB_SY_or_IS_then_various + LB_NOBREAK:
4758         {
4759             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4760
4761             LB_enum temp = prev;
4762             do {
4763                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4764             }
4765             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4766             if (temp == LB_Numeric) {
4767                 return FALSE;
4768             }
4769
4770             return LB_table[prev][after] - LB_SY_or_IS_then_various
4771                                                                == LB_BREAKABLE;
4772         }
4773
4774         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4775         case LB_various_then_PO_or_PR + LB_NOBREAK:
4776         {
4777             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4778
4779             LB_enum temp = prev;
4780             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4781             {
4782                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4783             }
4784             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4785                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4786             }
4787             if (temp == LB_Numeric) {
4788                 return FALSE;
4789             }
4790             return LB_various_then_PO_or_PR;
4791         }
4792
4793         case LB_RI_then_RI + LB_NOBREAK:
4794         case LB_RI_then_RI + LB_BREAKABLE:
4795             {
4796                 int RI_count = 1;
4797
4798                 /* LB30a Break between two regional indicator symbols if and
4799                  * only if there are an even number of regional indicators
4800                  * preceding the position of the break.
4801                  *
4802                  *    sot (RI RI)* RI × RI
4803                  *  [^RI] (RI RI)* RI × RI */
4804
4805                 while (backup_one_LB(strbeg,
4806                                      &temp_pos,
4807                                      utf8_target) == LB_Regional_Indicator)
4808                 {
4809                     RI_count++;
4810                 }
4811
4812                 return RI_count % 2 == 0;
4813             }
4814
4815         default:
4816             break;
4817     }
4818
4819 #ifdef DEBUGGING
4820     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4821                                   before, after, LB_table[before][after]);
4822     assert(0);
4823 #endif
4824     return TRUE;
4825 }
4826
4827 STATIC LB_enum
4828 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4829 {
4830     LB_enum lb;
4831
4832     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4833
4834     if (*curpos >= strend) {
4835         return LB_EDGE;
4836     }
4837
4838     if (utf8_target) {
4839         *curpos += UTF8SKIP(*curpos);
4840         if (*curpos >= strend) {
4841             return LB_EDGE;
4842         }
4843         lb = getLB_VAL_UTF8(*curpos, strend);
4844     }
4845     else {
4846         (*curpos)++;
4847         if (*curpos >= strend) {
4848             return LB_EDGE;
4849         }
4850         lb = getLB_VAL_CP(**curpos);
4851     }
4852
4853     return lb;
4854 }
4855
4856 STATIC LB_enum
4857 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4858 {
4859     LB_enum lb;
4860
4861     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4862
4863     if (*curpos < strbeg) {
4864         return LB_EDGE;
4865     }
4866
4867     if (utf8_target) {
4868         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4869         U8 * prev_prev_char_pos;
4870
4871         if (! prev_char_pos) {
4872             return LB_EDGE;
4873         }
4874
4875         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4876             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4877             *curpos = prev_char_pos;
4878             prev_char_pos = prev_prev_char_pos;
4879         }
4880         else {
4881             *curpos = (U8 *) strbeg;
4882             return LB_EDGE;
4883         }
4884     }
4885     else {
4886         if (*curpos - 2 < strbeg) {
4887             *curpos = (U8 *) strbeg;
4888             return LB_EDGE;
4889         }
4890         (*curpos)--;
4891         lb = getLB_VAL_CP(*(*curpos - 1));
4892     }
4893
4894     return lb;
4895 }
4896
4897 STATIC bool
4898 S_isSB(pTHX_ SB_enum before,
4899              SB_enum after,
4900              const U8 * const strbeg,
4901              const U8 * const curpos,
4902              const U8 * const strend,
4903              const bool utf8_target)
4904 {
4905     /* returns a boolean indicating if there is a Sentence Boundary Break
4906      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4907
4908     U8 * lpos = (U8 *) curpos;
4909     bool has_para_sep = FALSE;
4910     bool has_sp = FALSE;
4911
4912     PERL_ARGS_ASSERT_ISSB;
4913
4914     /* Break at the start and end of text.
4915         SB1.  sot  ÷
4916         SB2.  ÷  eot
4917       But unstated in Unicode is don't break if the text is empty */
4918     if (before == SB_EDGE || after == SB_EDGE) {
4919         return before != after;
4920     }
4921
4922     /* SB 3: Do not break within CRLF. */
4923     if (before == SB_CR && after == SB_LF) {
4924         return FALSE;
4925     }
4926
4927     /* Break after paragraph separators.  CR and LF are considered
4928      * so because Unicode views text as like word processing text where there
4929      * are no newlines except between paragraphs, and the word processor takes
4930      * care of wrapping without there being hard line-breaks in the text *./
4931        SB4.  Sep | CR | LF  ÷ */
4932     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4933         return TRUE;
4934     }
4935
4936     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4937      * (See Section 6.2, Replacing Ignore Rules.)
4938         SB5.  X (Extend | Format)*  →  X */
4939     if (after == SB_Extend || after == SB_Format) {
4940
4941         /* Implied is that the these characters attach to everything
4942          * immediately prior to them except for those separator-type
4943          * characters.  And the rules earlier have already handled the case
4944          * when one of those immediately precedes the extend char */
4945         return FALSE;
4946     }
4947
4948     if (before == SB_Extend || before == SB_Format) {
4949         U8 * temp_pos = lpos;
4950         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4951         if (   backup != SB_EDGE
4952             && backup != SB_Sep
4953             && backup != SB_CR
4954             && backup != SB_LF)
4955         {
4956             before = backup;
4957             lpos = temp_pos;
4958         }
4959
4960         /* Here, both 'before' and 'backup' are these types; implied is that we
4961          * don't break between them */
4962         if (backup == SB_Extend || backup == SB_Format) {
4963             return FALSE;
4964         }
4965     }
4966
4967     /* Do not break after ambiguous terminators like period, if they are
4968      * immediately followed by a number or lowercase letter, if they are
4969      * between uppercase letters, if the first following letter (optionally
4970      * after certain punctuation) is lowercase, or if they are followed by
4971      * "continuation" punctuation such as comma, colon, or semicolon. For
4972      * example, a period may be an abbreviation or numeric period, and thus may
4973      * not mark the end of a sentence.
4974
4975      * SB6. ATerm  ×  Numeric */
4976     if (before == SB_ATerm && after == SB_Numeric) {
4977         return FALSE;
4978     }
4979
4980     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4981     if (before == SB_ATerm && after == SB_Upper) {
4982         U8 * temp_pos = lpos;
4983         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4984         if (backup == SB_Upper || backup == SB_Lower) {
4985             return FALSE;
4986         }
4987     }
4988
4989     /* The remaining rules that aren't the final one, all require an STerm or
4990      * an ATerm after having backed up over some Close* Sp*, and in one case an
4991      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4992      * So do that backup now, setting flags if either Sp or a paragraph
4993      * separator are found */
4994
4995     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4996         has_para_sep = TRUE;
4997         before = backup_one_SB(strbeg, &lpos, utf8_target);
4998     }
4999
5000     if (before == SB_Sp) {
5001         has_sp = TRUE;
5002         do {
5003             before = backup_one_SB(strbeg, &lpos, utf8_target);
5004         }
5005         while (before == SB_Sp);
5006     }
5007
5008     while (before == SB_Close) {
5009         before = backup_one_SB(strbeg, &lpos, utf8_target);
5010     }
5011
5012     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5013      * most cases an STerm */
5014     if (before == SB_STerm || before == SB_ATerm) {
5015
5016         /* So, here the lhs matches
5017          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5018          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5019          * The rules that apply here are:
5020          *
5021          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5022                                            | LF | STerm | ATerm) )* Lower
5023            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5024            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5025            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5026            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5027          */
5028
5029         /* And all but SB11 forbid having seen a paragraph separator */
5030         if (! has_para_sep) {
5031             if (before == SB_ATerm) {          /* SB8 */
5032                 U8 * rpos = (U8 *) curpos;
5033                 SB_enum later = after;
5034
5035                 while (    later != SB_OLetter
5036                         && later != SB_Upper
5037                         && later != SB_Lower
5038                         && later != SB_Sep
5039                         && later != SB_CR
5040                         && later != SB_LF
5041                         && later != SB_STerm
5042                         && later != SB_ATerm
5043                         && later != SB_EDGE)
5044                 {
5045                     later = advance_one_SB(&rpos, strend, utf8_target);
5046                 }
5047                 if (later == SB_Lower) {
5048                     return FALSE;
5049                 }
5050             }
5051
5052             if (   after == SB_SContinue    /* SB8a */
5053                 || after == SB_STerm
5054                 || after == SB_ATerm)
5055             {
5056                 return FALSE;
5057             }
5058
5059             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5060                 if (   after == SB_Close
5061                     || after == SB_Sp
5062                     || after == SB_Sep
5063                     || after == SB_CR
5064                     || after == SB_LF)
5065                 {
5066                     return FALSE;
5067                 }
5068             }
5069
5070             /* SB10.  This and SB9 could probably be combined some way, but khw
5071              * has decided to follow the Unicode rule book precisely for
5072              * simplified maintenance */
5073             if (   after == SB_Sp
5074                 || after == SB_Sep
5075                 || after == SB_CR
5076                 || after == SB_LF)
5077             {
5078                 return FALSE;
5079             }
5080         }
5081
5082         /* SB11.  */
5083         return TRUE;
5084     }
5085
5086     /* Otherwise, do not break.
5087     SB12.  Any  ×  Any */
5088
5089     return FALSE;
5090 }
5091
5092 STATIC SB_enum
5093 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5094 {
5095     SB_enum sb;
5096
5097     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5098
5099     if (*curpos >= strend) {
5100         return SB_EDGE;
5101     }
5102
5103     if (utf8_target) {
5104         do {
5105             *curpos += UTF8SKIP(*curpos);
5106             if (*curpos >= strend) {
5107                 return SB_EDGE;
5108             }
5109             sb = getSB_VAL_UTF8(*curpos, strend);
5110         } while (sb == SB_Extend || sb == SB_Format);
5111     }
5112     else {
5113         do {
5114             (*curpos)++;
5115             if (*curpos >= strend) {
5116                 return SB_EDGE;
5117             }
5118             sb = getSB_VAL_CP(**curpos);
5119         } while (sb == SB_Extend || sb == SB_Format);
5120     }
5121
5122     return sb;
5123 }
5124
5125 STATIC SB_enum
5126 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5127 {
5128     SB_enum sb;
5129
5130     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5131
5132     if (*curpos < strbeg) {
5133         return SB_EDGE;
5134     }
5135
5136     if (utf8_target) {
5137         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5138         if (! prev_char_pos) {
5139             return SB_EDGE;
5140         }
5141
5142         /* Back up over Extend and Format.  curpos is always just to the right
5143          * of the characater whose value we are getting */
5144         do {
5145             U8 * prev_prev_char_pos;
5146             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5147                                                                       strbeg)))
5148             {
5149                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5150                 *curpos = prev_char_pos;
5151                 prev_char_pos = prev_prev_char_pos;
5152             }
5153             else {
5154                 *curpos = (U8 *) strbeg;
5155                 return SB_EDGE;
5156             }
5157         } while (sb == SB_Extend || sb == SB_Format);
5158     }
5159     else {
5160         do {
5161             if (*curpos - 2 < strbeg) {
5162                 *curpos = (U8 *) strbeg;
5163                 return SB_EDGE;
5164             }
5165             (*curpos)--;
5166             sb = getSB_VAL_CP(*(*curpos - 1));
5167         } while (sb == SB_Extend || sb == SB_Format);
5168     }
5169
5170     return sb;
5171 }
5172
5173 STATIC bool
5174 S_isWB(pTHX_ WB_enum previous,
5175              WB_enum before,
5176              WB_enum after,
5177              const U8 * const strbeg,
5178              const U8 * const curpos,
5179              const U8 * const strend,
5180              const bool utf8_target)
5181 {
5182     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5183      *  a Unicode word break, using their published algorithm, but tailored for
5184      *  Perl by treating spans of white space as one unit.  Context may be
5185      *  needed to make this determination.  If the value for the character
5186      *  before 'before' is known, it is passed as 'previous'; otherwise that
5187      *  should be set to WB_UNKNOWN.  The other input parameters give the
5188      *  boundaries and current position in the matching of the string.  That
5189      *  is, 'curpos' marks the position where the character whose wb value is
5190      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5191
5192     U8 * before_pos = (U8 *) curpos;
5193     U8 * after_pos = (U8 *) curpos;
5194     WB_enum prev = before;
5195     WB_enum next;
5196
5197     PERL_ARGS_ASSERT_ISWB;
5198
5199     /* Rule numbers in the comments below are as of Unicode 9.0 */
5200
5201   redo:
5202     before = prev;
5203     switch (WB_table[before][after]) {
5204         case WB_BREAKABLE:
5205             return TRUE;
5206
5207         case WB_NOBREAK:
5208             return FALSE;
5209
5210         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5211             next = advance_one_WB(&after_pos, strend, utf8_target,
5212                                  FALSE /* Don't skip Extend nor Format */ );
5213             /* A space immediately preceeding an Extend or Format is attached
5214              * to by them, and hence gets separated from previous spaces.
5215              * Otherwise don't break between horizontal white space */
5216             return next == WB_Extend || next == WB_Format;
5217
5218         /* WB4 Ignore Format and Extend characters, except when they appear at
5219          * the beginning of a region of text.  This code currently isn't
5220          * general purpose, but it works as the rules are currently and likely
5221          * to be laid out.  The reason it works is that when 'they appear at
5222          * the beginning of a region of text', the rule is to break before
5223          * them, just like any other character.  Therefore, the default rule
5224          * applies and we don't have to look in more depth.  Should this ever
5225          * change, we would have to have 2 'case' statements, like in the rules
5226          * below, and backup a single character (not spacing over the extend
5227          * ones) and then see if that is one of the region-end characters and
5228          * go from there */
5229         case WB_Ex_or_FO_or_ZWJ_then_foo:
5230             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5231             goto redo;
5232
5233         case WB_DQ_then_HL + WB_BREAKABLE:
5234         case WB_DQ_then_HL + WB_NOBREAK:
5235
5236             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5237
5238             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5239                                                             == WB_Hebrew_Letter)
5240             {
5241                 return FALSE;
5242             }
5243
5244              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5245
5246         case WB_HL_then_DQ + WB_BREAKABLE:
5247         case WB_HL_then_DQ + WB_NOBREAK:
5248
5249             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5250
5251             if (advance_one_WB(&after_pos, strend, utf8_target,
5252                                        TRUE /* Do skip Extend and Format */ )
5253                                                             == WB_Hebrew_Letter)
5254             {
5255                 return FALSE;
5256             }
5257
5258             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5259
5260         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5261         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5262
5263             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5264              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5265
5266             next = advance_one_WB(&after_pos, strend, utf8_target,
5267                                        TRUE /* Do skip Extend and Format */ );
5268
5269             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5270             {
5271                 return FALSE;
5272             }
5273
5274             return WB_table[before][after]
5275                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5276
5277         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5278         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5279
5280             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5281              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5282
5283             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5284             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5285             {
5286                 return FALSE;
5287             }
5288
5289             return WB_table[before][after]
5290                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5291
5292         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5293         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5294
5295             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5296              * */
5297
5298             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5299                                                             == WB_Numeric)
5300             {
5301                 return FALSE;
5302             }
5303
5304             return WB_table[before][after]
5305                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5306
5307         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5308         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5309
5310             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5311
5312             if (advance_one_WB(&after_pos, strend, utf8_target,
5313                                        TRUE /* Do skip Extend and Format */ )
5314                                                             == WB_Numeric)
5315             {
5316                 return FALSE;
5317             }
5318
5319             return WB_table[before][after]
5320                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5321
5322         case WB_RI_then_RI + WB_NOBREAK:
5323         case WB_RI_then_RI + WB_BREAKABLE:
5324             {
5325                 int RI_count = 1;
5326
5327                 /* Do not break within emoji flag sequences. That is, do not
5328                  * break between regional indicator (RI) symbols if there is an
5329                  * odd number of RI characters before the potential break
5330                  * point.
5331                  *
5332                  * WB15   sot (RI RI)* RI × RI
5333                  * WB16 [^RI] (RI RI)* RI × RI */
5334
5335                 while (backup_one_WB(&previous,
5336                                      strbeg,
5337                                      &before_pos,
5338                                      utf8_target) == WB_Regional_Indicator)
5339                 {
5340                     RI_count++;
5341                 }
5342
5343                 return RI_count % 2 != 1;
5344             }
5345
5346         default:
5347             break;
5348     }
5349
5350 #ifdef DEBUGGING
5351     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5352                                   before, after, WB_table[before][after]);
5353     assert(0);
5354 #endif
5355     return TRUE;
5356 }
5357
5358 STATIC WB_enum
5359 S_advance_one_WB(pTHX_ U8 ** curpos,
5360                        const U8 * const strend,
5361                        const bool utf8_target,
5362                        const bool skip_Extend_Format)
5363 {
5364     WB_enum wb;
5365
5366     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5367
5368     if (*curpos >= strend) {
5369         return WB_EDGE;
5370     }
5371
5372     if (utf8_target) {
5373
5374         /* Advance over Extend and Format */
5375         do {
5376             *curpos += UTF8SKIP(*curpos);
5377             if (*curpos >= strend) {
5378                 return WB_EDGE;
5379             }
5380             wb = getWB_VAL_UTF8(*curpos, strend);
5381         } while (    skip_Extend_Format
5382                  && (wb == WB_Extend || wb == WB_Format));
5383     }
5384     else {
5385         do {
5386             (*curpos)++;
5387             if (*curpos >= strend) {
5388                 return WB_EDGE;
5389             }
5390             wb = getWB_VAL_CP(**curpos);
5391         } while (    skip_Extend_Format
5392                  && (wb == WB_Extend || wb == WB_Format));
5393     }
5394
5395     return wb;
5396 }
5397
5398 STATIC WB_enum
5399 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5400 {
5401     WB_enum wb;
5402
5403     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5404
5405     /* If we know what the previous character's break value is, don't have
5406         * to look it up */
5407     if (*previous != WB_UNKNOWN) {
5408         wb = *previous;
5409
5410         /* But we need to move backwards by one */
5411         if (utf8_target) {
5412             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5413             if (! *curpos) {
5414                 *previous = WB_EDGE;
5415                 *curpos = (U8 *) strbeg;
5416             }
5417             else {
5418                 *previous = WB_UNKNOWN;
5419             }
5420         }
5421         else {
5422             (*curpos)--;
5423             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5424         }
5425
5426         /* And we always back up over these three types */
5427         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5428             return wb;
5429         }
5430     }
5431
5432     if (*curpos < strbeg) {
5433         return WB_EDGE;
5434     }
5435
5436     if (utf8_target) {
5437         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5438         if (! prev_char_pos) {
5439             return WB_EDGE;
5440         }
5441
5442         /* Back up over Extend and Format.  curpos is always just to the right
5443          * of the characater whose value we are getting */
5444         do {
5445             U8 * prev_prev_char_pos;
5446             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5447                                                    -1,
5448                                                    strbeg)))
5449             {
5450                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5451                 *curpos = prev_char_pos;
5452                 prev_char_pos = prev_prev_char_pos;
5453             }
5454             else {
5455                 *curpos = (U8 *) strbeg;
5456                 return WB_EDGE;
5457             }
5458         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5459     }
5460     else {
5461         do {
5462             if (*curpos - 2 < strbeg) {
5463                 *curpos = (U8 *) strbeg;
5464                 return WB_EDGE;
5465             }
5466             (*curpos)--;
5467             wb = getWB_VAL_CP(*(*curpos - 1));
5468         } while (wb == WB_Extend || wb == WB_Format);
5469     }
5470
5471     return wb;
5472 }
5473
5474 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5475 (                                                           \
5476     (   ( st )                                         ) && \
5477     (   ( st )->u.eval.close_paren                     ) && \
5478     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5479 )
5480
5481 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5482 (                                                           \
5483     (   ( st )                                         ) && \
5484     (   ( st )->u.eval.close_paren                     ) && \
5485     (   ( expr )                                       ) && \
5486     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5487 )
5488
5489
5490 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5491     (st)->u.eval.close_paren = ( (expr) + 1 )
5492
5493 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5494     (st)->u.eval.close_paren = 0
5495
5496 /* returns -1 on failure, $+[0] on success */
5497 STATIC SSize_t
5498 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5499 {
5500     dVAR;
5501     const bool utf8_target = reginfo->is_utf8_target;
5502     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5503     REGEXP *rex_sv = reginfo->prog;
5504     regexp *rex = ReANY(rex_sv);
5505     RXi_GET_DECL(rex,rexi);
5506     /* the current state. This is a cached copy of PL_regmatch_state */
5507     regmatch_state *st;
5508     /* cache heavy used fields of st in registers */
5509     regnode *scan;
5510     regnode *next;
5511     U32 n = 0;  /* general value; init to avoid compiler warning */
5512     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5513     SSize_t endref = 0; /* offset of end of backref when ln is start */
5514     char *locinput = startpos;
5515     char *pushinput; /* where to continue after a PUSH */
5516     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5517
5518     bool result = 0;        /* return value of S_regmatch */
5519     U32 depth = 0;            /* depth of backtrack stack */
5520     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5521     const U32 max_nochange_depth =
5522         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5523         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5524     regmatch_state *yes_state = NULL; /* state to pop to on success of
5525                                                             subpattern */
5526     /* mark_state piggy backs on the yes_state logic so that when we unwind
5527        the stack on success we can update the mark_state as we go */
5528     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5529     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5530     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5531     U32 state_num;
5532     bool no_final = 0;      /* prevent failure from backtracking? */
5533     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5534     char *startpoint = locinput;
5535     SV *popmark = NULL;     /* are we looking for a mark? */
5536     SV *sv_commit = NULL;   /* last mark name seen in failure */
5537     SV *sv_yes_mark = NULL; /* last mark name we have seen
5538                                during a successful match */
5539     U32 lastopen = 0;       /* last open we saw */
5540     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5541     SV* const oreplsv = GvSVn(PL_replgv);
5542     /* these three flags are set by various ops to signal information to
5543      * the very next op. They have a useful lifetime of exactly one loop
5544      * iteration, and are not preserved or restored by state pushes/pops
5545      */
5546     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5547     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5548     int logical = 0;        /* the following EVAL is:
5549                                 0: (?{...})
5550                                 1: (?(?{...})X|Y)
5551                                 2: (??{...})
5552                                or the following IFMATCH/UNLESSM is:
5553                                 false: plain (?=foo)
5554                                 true:  used as a condition: (?(?=foo))
5555                             */
5556     PAD* last_pad = NULL;
5557     dMULTICALL;
5558     U8 gimme = G_SCALAR;
5559     CV *caller_cv = NULL;       /* who called us */
5560     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5561     U32 maxopenparen = 0;       /* max '(' index seen so far */
5562     int to_complement;  /* Invert the result? */
5563     _char_class_number classnum;
5564     bool is_utf8_pat = reginfo->is_utf8_pat;
5565     bool match = FALSE;
5566     I32 orig_savestack_ix = PL_savestack_ix;
5567     U8 * script_run_begin = NULL;
5568
5569 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5570 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5571 #  define SOLARIS_BAD_OPTIMIZER
5572     const U32 *pl_charclass_dup = PL_charclass;
5573 #  define PL_charclass pl_charclass_dup
5574 #endif
5575
5576 #ifdef DEBUGGING
5577     GET_RE_DEBUG_FLAGS_DECL;
5578 #endif
5579
5580     /* protect against undef(*^R) */
5581     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5582
5583     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5584     multicall_oldcatch = 0;
5585     PERL_UNUSED_VAR(multicall_cop);
5586
5587     PERL_ARGS_ASSERT_REGMATCH;
5588
5589     st = PL_regmatch_state;
5590
5591     /* Note that nextchr is a byte even in UTF */
5592     SET_nextchr;
5593     scan = prog;
5594
5595     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5596             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5597             Perl_re_printf( aTHX_ "regmatch start\n" );
5598     }));
5599
5600     while (scan != NULL) {
5601         next = scan + NEXT_OFF(scan);
5602         if (next == scan)
5603             next = NULL;
5604         state_num = OP(scan);
5605
5606       reenter_switch:
5607         DEBUG_EXECUTE_r(
5608             if (state_num <= REGNODE_MAX) {
5609                 SV * const prop = sv_newmortal();
5610                 regnode *rnext = regnext(scan);
5611
5612                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5613                 regprop(rex, prop, scan, reginfo, NULL);
5614                 Perl_re_printf( aTHX_
5615                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5616                     INDENT_CHARS(depth), "",
5617                     (IV)(scan - rexi->program),
5618                     SvPVX_const(prop),
5619                     (PL_regkind[OP(scan)] == END || !rnext) ?
5620                         0 : (IV)(rnext - rexi->program));
5621             }
5622         );
5623
5624         to_complement = 0;
5625
5626         SET_nextchr;
5627         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5628
5629         switch (state_num) {
5630         case SBOL: /*  /^../ and /\A../  */
5631             if (locinput == reginfo->strbeg)
5632                 break;
5633             sayNO;
5634
5635         case MBOL: /*  /^../m  */
5636             if (locinput == reginfo->strbeg ||
5637                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5638             {
5639                 break;
5640             }
5641             sayNO;
5642
5643         case GPOS: /*  \G  */
5644             if (locinput == reginfo->ganch)
5645                 break;
5646             sayNO;
5647
5648         case KEEPS: /*   \K  */
5649             /* update the startpoint */
5650             st->u.keeper.val = rex->offs[0].start;
5651             rex->offs[0].start = locinput - reginfo->strbeg;
5652             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5653             NOT_REACHED; /* NOTREACHED */
5654
5655         case KEEPS_next_fail:
5656             /* rollback the start point change */
5657             rex->offs[0].start = st->u.keeper.val;
5658             sayNO_SILENT;
5659             NOT_REACHED; /* NOTREACHED */
5660
5661         case MEOL: /* /..$/m  */
5662             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5663                 sayNO;
5664             break;
5665
5666         case SEOL: /* /..$/  */
5667             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5668                 sayNO;
5669             if (reginfo->strend - locinput > 1)
5670                 sayNO;
5671             break;
5672
5673         case EOS: /*  \z  */
5674             if (!NEXTCHR_IS_EOS)
5675                 sayNO;
5676             break;
5677
5678         case SANY: /*  /./s  */
5679             if (NEXTCHR_IS_EOS)
5680                 sayNO;
5681             goto increment_locinput;
5682
5683         case REG_ANY: /*  /./  */
5684             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5685                 sayNO;
5686             goto increment_locinput;
5687
5688
5689 #undef  ST
5690 #define ST st->u.trie
5691         case TRIEC: /* (ab|cd) with known charclass */
5692             /* In this case the charclass data is available inline so
5693                we can fail fast without a lot of extra overhead.
5694              */
5695             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5696                 DEBUG_EXECUTE_r(
5697                     Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5698                               depth, PL_colors[4], PL_colors[5])
5699                 );
5700                 sayNO_SILENT;
5701                 NOT_REACHED; /* NOTREACHED */
5702             }
5703             /* FALLTHROUGH */
5704         case TRIE:  /* (ab|cd)  */
5705             /* the basic plan of execution of the trie is:
5706              * At the beginning, run though all the states, and
5707              * find the longest-matching word. Also remember the position
5708              * of the shortest matching word. For example, this pattern:
5709              *    1  2 3 4    5
5710              *    ab|a|x|abcd|abc
5711              * when matched against the string "abcde", will generate
5712              * accept states for all words except 3, with the longest
5713              * matching word being 4, and the shortest being 2 (with
5714              * the position being after char 1 of the string).
5715              *
5716              * Then for each matching word, in word order (i.e. 1,2,4,5),
5717              * we run the remainder of the pattern; on each try setting
5718              * the current position to the character following the word,
5719              * returning to try the next word on failure.
5720              *
5721              * We avoid having to build a list of words at runtime by
5722              * using a compile-time structure, wordinfo[].prev, which
5723              * gives, for each word, the previous accepting word (if any).
5724              * In the case above it would contain the mappings 1->2, 2->0,
5725              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5726              * the longest word (4 above), a list of all words, by
5727              * following the list of prev pointers; this gives us the
5728              * unordered list 4,5,1,2. Then given the current word we have
5729              * just tried, we can go through the list and find the
5730              * next-biggest word to try (so if we just failed on word 2,
5731              * the next in the list is 4).
5732              *
5733              * Since at runtime we don't record the matching position in
5734              * the string for each word, we have to work that out for
5735              * each word we're about to process. The wordinfo table holds
5736              * the character length of each word; given that we recorded
5737              * at the start: the position of the shortest word and its
5738              * length in chars, we just need to move the pointer the
5739              * difference between the two char lengths. Depending on
5740              * Unicode status and folding, that's cheap or expensive.
5741              *
5742              * This algorithm is optimised for the case where are only a
5743              * small number of accept states, i.e. 0,1, or maybe 2.
5744              * With lots of accepts states, and having to try all of them,
5745              * it becomes quadratic on number of accept states to find all
5746              * the next words.
5747              */
5748
5749             {
5750                 /* what type of TRIE am I? (utf8 makes this contextual) */
5751                 DECL_TRIE_TYPE(scan);
5752
5753                 /* what trie are we using right now */
5754                 reg_trie_data * const trie
5755                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5756                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5757                 U32 state = trie->startstate;
5758
5759                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5760                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5761                     if (utf8_target
5762                         && nextchr >= 0 /* guard against negative EOS value in nextchr */
5763                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5764                         && scan->flags == EXACTL)
5765                     {
5766                         /* We only output for EXACTL, as we let the folder
5767                          * output this message for EXACTFLU8 to avoid
5768                          * duplication */
5769                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5770                                                                reginfo->strend);
5771                     }
5772                 }
5773                 if (   trie->bitmap
5774                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5775                 {
5776                     if (trie->states[ state ].wordnum) {
5777                          DEBUG_EXECUTE_r(
5778                             Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
5779                                           depth, PL_colors[4], PL_colors[5])
5780                         );
5781                         if (!trie->jump)
5782                             break;
5783                     } else {
5784                         DEBUG_EXECUTE_r(
5785                             Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5786                                           depth, PL_colors[4], PL_colors[5])
5787                         );
5788                         sayNO_SILENT;
5789                    }
5790                 }
5791
5792             {
5793                 U8 *uc = ( U8* )locinput;
5794
5795                 STRLEN len = 0;
5796                 STRLEN foldlen = 0;
5797                 U8 *uscan = (U8*)NULL;
5798                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5799                 U32 charcount = 0; /* how many input chars we have matched */
5800                 U32 accepted = 0; /* have we seen any accepting states? */
5801
5802                 ST.jump = trie->jump;
5803                 ST.me = scan;
5804                 ST.firstpos = NULL;
5805                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5806                 ST.nextword = 0;
5807
5808                 /* fully traverse the TRIE; note the position of the
5809                    shortest accept state and the wordnum of the longest
5810                    accept state */
5811
5812                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5813                     U32 base = trie->states[ state ].trans.base;
5814                     UV uvc = 0;
5815                     U16 charid = 0;
5816                     U16 wordnum;
5817                     wordnum = trie->states[ state ].wordnum;
5818
5819                     if (wordnum) { /* it's an accept state */
5820                         if (!accepted) {
5821                             accepted = 1;
5822                             /* record first match position */
5823                             if (ST.longfold) {
5824                                 ST.firstpos = (U8*)locinput;
5825                                 ST.firstchars = 0;
5826                             }
5827                             else {
5828                                 ST.firstpos = uc;
5829                                 ST.firstchars = charcount;
5830                             }
5831                         }
5832                         if (!ST.nextword || wordnum < ST.nextword)
5833                             ST.nextword = wordnum;
5834                         ST.topword = wordnum;
5835                     }
5836
5837                     DEBUG_TRIE_EXECUTE_r({
5838                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5839                                 /* HERE */
5840                                 PerlIO_printf( Perl_debug_log,
5841                                     "%*s%sState: %4" UVxf " Accepted: %c ",
5842                                     INDENT_CHARS(depth), "", PL_colors[4],
5843                                     (UV)state, (accepted ? 'Y' : 'N'));
5844                     });
5845
5846                     /* read a char and goto next state */
5847                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5848                         I32 offset;
5849                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5850                                              uscan, len, uvc, charid, foldlen,
5851                                              foldbuf, uniflags);
5852                         charcount++;
5853                         if (foldlen>0)
5854                             ST.longfold = TRUE;
5855                         if (charid &&
5856                              ( ((offset =
5857                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5858
5859                              && ((U32)offset < trie->lasttrans)
5860                              && trie->trans[offset].check == state)
5861                         {
5862                             state = trie->trans[offset].next;
5863                         }
5864                         else {
5865                             state = 0;
5866                         }
5867                         uc += len;
5868
5869                     }
5870                     else {
5871                         state = 0;
5872                     }
5873                     DEBUG_TRIE_EXECUTE_r(
5874                         Perl_re_printf( aTHX_
5875                             "Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
5876                             charid, uvc, (UV)state, PL_colors[5] );
5877                     );
5878                 }
5879                 if (!accepted)
5880                    sayNO;
5881
5882                 /* calculate total number of accept states */
5883                 {
5884                     U16 w = ST.topword;
5885                     accepted = 0;
5886                     while (w) {
5887                         w = trie->wordinfo[w].prev;
5888                         accepted++;
5889                     }
5890                     ST.accepted = accepted;
5891                 }
5892
5893                 DEBUG_EXECUTE_r(
5894                     Perl_re_exec_indentf( aTHX_  "%sgot %" IVdf " possible matches%s\n",
5895                         depth,
5896                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5897                 );
5898                 goto trie_first_try; /* jump into the fail handler */
5899             }}
5900             NOT_REACHED; /* NOTREACHED */
5901
5902         case TRIE_next_fail: /* we failed - try next alternative */
5903         {
5904             U8 *uc;
5905             if ( ST.jump ) {
5906                 /* undo any captures done in the tail part of a branch,
5907                  * e.g.
5908                  *    /(?:X(.)(.)|Y(.)).../
5909                  * where the trie just matches X then calls out to do the
5910                  * rest of the branch */
5911                 REGCP_UNWIND(ST.cp);
5912                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5913             }
5914             if (!--ST.accepted) {
5915                 DEBUG_EXECUTE_r({
5916                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
5917                         depth,
5918                         PL_colors[4],
5919                         PL_colors[5] );
5920                 });
5921                 sayNO_SILENT;
5922             }
5923             {
5924                 /* Find next-highest word to process.  Note that this code
5925                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5926                 U16 min = 0;
5927                 U16 word;
5928                 U16 const nextword = ST.nextword;
5929                 reg_trie_wordinfo * const wordinfo
5930                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5931                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5932                     if (word > nextword && (!min || word < min))
5933                         min = word;
5934                 }
5935                 ST.nextword = min;
5936             }
5937
5938           trie_first_try:
5939             if (do_cutgroup) {
5940                 do_cutgroup = 0;
5941                 no_final = 0;
5942             }
5943
5944             if ( ST.jump ) {
5945                 ST.lastparen = rex->lastparen;
5946                 ST.lastcloseparen = rex->lastcloseparen;
5947                 REGCP_SET(ST.cp);
5948             }
5949
5950             /* find start char of end of current word */
5951             {
5952                 U32 chars; /* how many chars to skip */
5953                 reg_trie_data * const trie
5954                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5955
5956                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5957                             >=  ST.firstchars);
5958                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5959                             - ST.firstchars;
5960                 uc = ST.firstpos;
5961
5962                 if (ST.longfold) {
5963                     /* the hard option - fold each char in turn and find
5964                      * its folded length (which may be different */
5965                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5966                     STRLEN foldlen;
5967                     STRLEN len;
5968                     UV uvc;
5969                     U8 *uscan;
5970
5971                     while (chars) {
5972                         if (utf8_target) {
5973                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5974                                                     uniflags);
5975                             uc += len;
5976                         }
5977                         else {
5978                             uvc = *uc;
5979                             uc++;
5980                         }
5981                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5982                         uscan = foldbuf;
5983                         while (foldlen) {
5984                             if (!--chars)
5985                                 break;
5986                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5987                                             uniflags);
5988                             uscan += len;
5989                             foldlen -= len;
5990                         }
5991                     }
5992                 }
5993                 else {
5994                     if (utf8_target)
5995                         while (chars--)
5996                             uc += UTF8SKIP(uc);
5997                     else
5998                         uc += chars;
5999                 }
6000             }
6001
6002             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6003                             ? ST.jump[ST.nextword]
6004                             : NEXT_OFF(ST.me));
6005
6006             DEBUG_EXECUTE_r({
6007                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6008                     depth,
6009                     PL_colors[4],
6010                     ST.nextword,
6011                     PL_colors[5]
6012                     );
6013             });
6014
6015             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6016                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6017                 NOT_REACHED; /* NOTREACHED */
6018             }
6019             /* only one choice left - just continue */
6020             DEBUG_EXECUTE_r({
6021                 AV *const trie_words
6022                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6023                 SV ** const tmp = trie_words
6024                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6025                 SV *sv= tmp ? sv_newmortal() : NULL;
6026
6027                 Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
6028                     depth, PL_colors[4],
6029                     ST.nextword,
6030                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6031                             PL_colors[0], PL_colors[1],
6032                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6033                         )
6034                     : "not compiled under -Dr",
6035                     PL_colors[5] );
6036             });
6037
6038             locinput = (char*)uc;
6039             continue; /* execute rest of RE */
6040             /* NOTREACHED */
6041         }
6042 #undef  ST
6043
6044         case EXACTL:             /*  /abc/l       */
6045             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6046
6047             /* Complete checking would involve going through every character
6048              * matched by the string to see if any is above latin1.  But the
6049              * comparision otherwise might very well be a fast assembly
6050              * language routine, and I (khw) don't think slowing things down
6051              * just to check for this warning is worth it.  So this just checks
6052              * the first character */
6053             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6054                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6055             }
6056             /* FALLTHROUGH */
6057         case EXACT: {            /*  /abc/        */
6058             char *s = STRING(scan);
6059             ln = STR_LEN(scan);
6060             if (utf8_target != is_utf8_pat) {
6061                 /* The target and the pattern have differing utf8ness. */
6062                 char *l = locinput;
6063                 const char * const e = s + ln;
6064
6065                 if (utf8_target) {
6066                     /* The target is utf8, the pattern is not utf8.
6067                      * Above-Latin1 code points can't match the pattern;
6068                      * invariants match exactly, and the other Latin1 ones need
6069                      * to be downgraded to a single byte in order to do the
6070                      * comparison.  (If we could be confident that the target
6071                      * is not malformed, this could be refactored to have fewer
6072                      * tests by just assuming that if the first bytes match, it
6073                      * is an invariant, but there are tests in the test suite
6074                      * dealing with (??{...}) which violate this) */
6075                     while (s < e) {
6076                         if (l >= reginfo->strend
6077                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6078                         {
6079                             sayNO;
6080                         }
6081                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6082                             if (*l != *s) {
6083                                 sayNO;
6084                             }
6085                             l++;
6086                         }
6087                         else {
6088                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6089                             {
6090                                 sayNO;
6091                             }
6092                             l += 2;
6093                         }
6094                         s++;
6095                     }
6096                 }
6097                 else {
6098                     /* The target is not utf8, the pattern is utf8. */
6099                     while (s < e) {
6100                         if (l >= reginfo->strend
6101                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6102                         {
6103                             sayNO;
6104                         }
6105                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6106                             if (*s != *l) {
6107                                 sayNO;
6108                             }
6109                             s++;
6110                         }
6111                         else {
6112                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6113                             {
6114                                 sayNO;
6115                             }
6116                             s += 2;
6117                         }
6118                         l++;
6119                     }
6120                 }
6121                 locinput = l;
6122             }
6123             else {
6124                 /* The target and the pattern have the same utf8ness. */
6125                 /* Inline the first character, for speed. */
6126                 if (reginfo->strend - locinput < ln
6127                     || UCHARAT(s) != nextchr
6128                     || (ln > 1 && memNE(s, locinput, ln)))
6129                 {
6130                     sayNO;
6131                 }
6132                 locinput += ln;
6133             }
6134             break;
6135             }
6136
6137         case EXACTFL: {          /*  /abc/il      */
6138             re_fold_t folder;
6139             const U8 * fold_array;
6140             const char * s;
6141             U32 fold_utf8_flags;
6142
6143             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6144             folder = foldEQ_locale;
6145             fold_array = PL_fold_locale;
6146             fold_utf8_flags = FOLDEQ_LOCALE;
6147             goto do_exactf;
6148
6149         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6150                                       is effectively /u; hence to match, target
6151                                       must be UTF-8. */
6152             if (! utf8_target) {
6153                 sayNO;
6154             }
6155             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6156                                              | FOLDEQ_S1_FOLDS_SANE;
6157             folder = foldEQ_latin1;
6158             fold_array = PL_fold_latin1;
6159             goto do_exactf;
6160
6161         case EXACTFU_SS:         /*  /\x{df}/iu   */
6162         case EXACTFU:            /*  /abc/iu      */
6163             folder = foldEQ_latin1;
6164             fold_array = PL_fold_latin1;
6165             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6166             goto do_exactf;
6167
6168         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
6169                                    patterns */
6170             assert(! is_utf8_pat);
6171             /* FALLTHROUGH */
6172         case EXACTFA:            /*  /abc/iaa     */
6173             folder = foldEQ_latin1;
6174             fold_array = PL_fold_latin1;
6175             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6176             goto do_exactf;
6177
6178         case EXACTF:             /*  /abc/i    This node only generated for
6179                                                non-utf8 patterns */
6180             assert(! is_utf8_pat);
6181             folder = foldEQ;
6182             fold_array = PL_fold;
6183             fold_utf8_flags = 0;
6184
6185           do_exactf:
6186             s = STRING(scan);
6187             ln = STR_LEN(scan);
6188
6189             if (utf8_target
6190                 || is_utf8_pat
6191                 || state_num == EXACTFU_SS
6192                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6193             {
6194               /* Either target or the pattern are utf8, or has the issue where
6195                * the fold lengths may differ. */
6196                 const char * const l = locinput;
6197                 char *e = reginfo->strend;
6198
6199                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6200                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6201                 {
6202                     sayNO;
6203                 }
6204                 locinput = e;
6205                 break;
6206             }
6207
6208             /* Neither the target nor the pattern are utf8 */
6209             if (UCHARAT(s) != nextchr
6210                 && !NEXTCHR_IS_EOS
6211                 && UCHARAT(s) != fold_array[nextchr])
6212             {
6213                 sayNO;
6214             }
6215             if (reginfo->strend - locinput < ln)
6216                 sayNO;
6217             if (ln > 1 && ! folder(s, locinput, ln))
6218                 sayNO;
6219             locinput += ln;
6220             break;
6221         }
6222
6223         case NBOUNDL: /*  /\B/l  */
6224             to_complement = 1;
6225             /* FALLTHROUGH */
6226
6227         case BOUNDL:  /*  /\b/l  */
6228         {
6229             bool b1, b2;
6230             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6231
6232             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6233                 if (! IN_UTF8_CTYPE_LOCALE) {
6234                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6235                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6236                 }
6237                 goto boundu;
6238             }
6239
6240             if (utf8_target) {
6241                 if (locinput == reginfo->strbeg)
6242                     b1 = isWORDCHAR_LC('\n');
6243                 else {
6244                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6245                                                         (U8*)(reginfo->strbeg)),
6246                                                  (U8*)(reginfo->strend));
6247                 }
6248                 b2 = (NEXTCHR_IS_EOS)
6249                     ? isWORDCHAR_LC('\n')
6250                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6251                                               (U8*) reginfo->strend);
6252             }
6253             else { /* Here the string isn't utf8 */
6254                 b1 = (locinput == reginfo->strbeg)
6255                      ? isWORDCHAR_LC('\n')
6256                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6257                 b2 = (NEXTCHR_IS_EOS)
6258                     ? isWORDCHAR_LC('\n')
6259                     : isWORDCHAR_LC(nextchr);
6260             }
6261             if (to_complement ^ (b1 == b2)) {
6262                 sayNO;
6263             }
6264             break;
6265         }
6266
6267         case NBOUND:  /*  /\B/   */
6268             to_complement = 1;
6269             /* FALLTHROUGH */
6270
6271         case BOUND:   /*  /\b/   */
6272             if (utf8_target) {
6273                 goto bound_utf8;
6274             }
6275             goto bound_ascii_match_only;
6276
6277         case NBOUNDA: /*  /\B/a  */
6278             to_complement = 1;
6279             /* FALLTHROUGH */
6280
6281         case BOUNDA:  /*  /\b/a  */
6282         {
6283             bool b1, b2;
6284
6285           bound_ascii_match_only:
6286             /* Here the string isn't utf8, or is utf8 and only ascii characters
6287              * are to match \w.  In the latter case looking at the byte just
6288              * prior to the current one may be just the final byte of a
6289              * multi-byte character.  This is ok.  There are two cases:
6290              * 1) it is a single byte character, and then the test is doing
6291              *    just what it's supposed to.
6292              * 2) it is a multi-byte character, in which case the final byte is
6293              *    never mistakable for ASCII, and so the test will say it is
6294              *    not a word character, which is the correct answer. */
6295             b1 = (locinput == reginfo->strbeg)
6296                  ? isWORDCHAR_A('\n')
6297                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6298             b2 = (NEXTCHR_IS_EOS)
6299                 ? isWORDCHAR_A('\n')
6300                 : isWORDCHAR_A(nextchr);
6301             if (to_complement ^ (b1 == b2)) {
6302                 sayNO;
6303             }
6304             break;
6305         }
6306
6307         case NBOUNDU: /*  /\B/u  */
6308             to_complement = 1;
6309             /* FALLTHROUGH */
6310
6311         case BOUNDU:  /*  /\b/u  */
6312
6313           boundu:
6314             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6315                 match = FALSE;
6316             }
6317             else if (utf8_target) {
6318               bound_utf8:
6319                 switch((bound_type) FLAGS(scan)) {
6320                     case TRADITIONAL_BOUND:
6321                     {
6322                         bool b1, b2;
6323                         b1 = (locinput == reginfo->strbeg)
6324                              ? 0 /* isWORDCHAR_L1('\n') */
6325                              : isWORDCHAR_utf8_safe(
6326                                                reghop3((U8*)locinput,
6327                                                        -1,
6328                                                        (U8*)(reginfo->strbeg)),
6329                                                     (U8*) reginfo->strend);
6330                         b2 = (NEXTCHR_IS_EOS)
6331                             ? 0 /* isWORDCHAR_L1('\n') */
6332                             : isWORDCHAR_utf8_safe((U8*)locinput,
6333                                                    (U8*) reginfo->strend);
6334                         match = cBOOL(b1 != b2);
6335                         break;
6336                     }
6337                     case GCB_BOUND:
6338                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6339                             match = TRUE; /* GCB always matches at begin and
6340                                              end */
6341                         }
6342                         else {
6343                             /* Find the gcb values of previous and current
6344                              * chars, then see if is a break point */
6345                             match = isGCB(getGCB_VAL_UTF8(
6346                                                 reghop3((U8*)locinput,
6347                                                         -1,
6348                                                         (U8*)(reginfo->strbeg)),
6349                                                 (U8*) reginfo->strend),
6350                                           getGCB_VAL_UTF8((U8*) locinput,
6351                                                         (U8*) reginfo->strend),
6352                                           (U8*) reginfo->strbeg,
6353                                           (U8*) locinput,
6354                                           utf8_target);
6355                         }
6356                         break;
6357
6358                     case LB_BOUND:
6359                         if (locinput == reginfo->strbeg) {
6360                             match = FALSE;
6361                         }
6362                         else if (NEXTCHR_IS_EOS) {
6363                             match = TRUE;
6364                         }
6365                         else {
6366                             match = isLB(getLB_VAL_UTF8(
6367                                                 reghop3((U8*)locinput,
6368                                                         -1,
6369                                                         (U8*)(reginfo->strbeg)),
6370                                                 (U8*) reginfo->strend),
6371                                           getLB_VAL_UTF8((U8*) locinput,
6372                                                         (U8*) reginfo->strend),
6373                                           (U8*) reginfo->strbeg,
6374                                           (U8*) locinput,
6375                                           (U8*) reginfo->strend,
6376                                           utf8_target);
6377                         }
6378                         break;
6379
6380                     case SB_BOUND: /* Always matches at begin and end */
6381                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6382                             match = TRUE;
6383                         }
6384                         else {
6385                             match = isSB(getSB_VAL_UTF8(
6386                                                 reghop3((U8*)locinput,
6387                                                         -1,
6388                                                         (U8*)(reginfo->strbeg)),
6389                                                 (U8*) reginfo->strend),
6390                                           getSB_VAL_UTF8((U8*) locinput,
6391                                                         (U8*) reginfo->strend),
6392                                           (U8*) reginfo->strbeg,
6393                                           (U8*) locinput,
6394                                           (U8*) reginfo->strend,
6395                                           utf8_target);
6396                         }
6397                         break;
6398
6399                     case WB_BOUND:
6400                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6401                             match = TRUE;
6402                         }
6403                         else {
6404                             match = isWB(WB_UNKNOWN,
6405                                          getWB_VAL_UTF8(
6406                                                 reghop3((U8*)locinput,
6407                                                         -1,
6408                                                         (U8*)(reginfo->strbeg)),
6409                                                 (U8*) reginfo->strend),
6410                                           getWB_VAL_UTF8((U8*) locinput,
6411                                                         (U8*) reginfo->strend),
6412                                           (U8*) reginfo->strbeg,
6413                                           (U8*) locinput,
6414                                           (U8*) reginfo->strend,
6415                                           utf8_target);
6416                         }
6417                         break;
6418                 }
6419             }
6420             else {  /* Not utf8 target */
6421                 switch((bound_type) FLAGS(scan)) {
6422                     case TRADITIONAL_BOUND:
6423                     {
6424                         bool b1, b2;
6425                         b1 = (locinput == reginfo->strbeg)
6426                             ? 0 /* isWORDCHAR_L1('\n') */
6427                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6428                         b2 = (NEXTCHR_IS_EOS)
6429                             ? 0 /* isWORDCHAR_L1('\n') */
6430                             : isWORDCHAR_L1(nextchr);
6431                         match = cBOOL(b1 != b2);
6432                         break;
6433                     }
6434
6435                     case GCB_BOUND:
6436                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6437                             match = TRUE; /* GCB always matches at begin and
6438                                              end */
6439                         }
6440                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6441                                    range */
6442                             match =    UCHARAT(locinput - 1) != '\r'
6443                                     || UCHARAT(locinput) != '\n';
6444                         }
6445                         break;
6446
6447                     case LB_BOUND:
6448                         if (locinput == reginfo->strbeg) {
6449                             match = FALSE;
6450                         }
6451                         else if (NEXTCHR_IS_EOS) {
6452                             match = TRUE;
6453                         }
6454                         else {
6455                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6456                                          getLB_VAL_CP(UCHARAT(locinput)),
6457                                          (U8*) reginfo->strbeg,
6458                                          (U8*) locinput,
6459                                          (U8*) reginfo->strend,
6460                                          utf8_target);
6461                         }
6462                         break;
6463
6464                     case SB_BOUND: /* Always matches at begin and end */
6465                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6466                             match = TRUE;
6467                         }
6468                         else {
6469                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6470                                          getSB_VAL_CP(UCHARAT(locinput)),
6471                                          (U8*) reginfo->strbeg,
6472                                          (U8*) locinput,
6473                                          (U8*) reginfo->strend,
6474                                          utf8_target);
6475                         }
6476                         break;
6477
6478                     case WB_BOUND:
6479                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6480                             match = TRUE;
6481                         }
6482                         else {
6483                             match = isWB(WB_UNKNOWN,
6484                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6485                                          getWB_VAL_CP(UCHARAT(locinput)),
6486                                          (U8*) reginfo->strbeg,
6487                                          (U8*) locinput,
6488                                          (U8*) reginfo->strend,
6489                                          utf8_target);
6490                         }
6491                         break;
6492                 }
6493             }
6494
6495             if (to_complement ^ ! match) {
6496                 sayNO;
6497             }
6498             break;
6499
6500         case ANYOFL:  /*  /[abc]/l      */
6501             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6502
6503             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6504             {
6505               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6506             }
6507             /* FALLTHROUGH */
6508         case ANYOFD:  /*   /[abc]/d       */
6509         case ANYOF:  /*   /[abc]/       */
6510             if (NEXTCHR_IS_EOS)
6511                 sayNO;
6512             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6513                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6514                                                                    utf8_target))
6515                     sayNO;
6516                 locinput += UTF8SKIP(locinput);
6517             }
6518             else {
6519                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6520                     sayNO;
6521                 locinput++;
6522             }
6523             break;
6524
6525         case ASCII:
6526             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6527                 sayNO;
6528             }
6529
6530             locinput++;     /* ASCII is always single byte */
6531             break;
6532
6533         case NASCII:
6534             if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6535                 sayNO;
6536             }
6537
6538             goto increment_locinput;
6539             break;
6540
6541         /* The argument (FLAGS) to all the POSIX node types is the class number
6542          * */
6543
6544         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6545             to_complement = 1;
6546             /* FALLTHROUGH */
6547
6548         case POSIXL:    /* \w or [:punct:] etc. under /l */
6549             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6550             if (NEXTCHR_IS_EOS)
6551                 sayNO;
6552
6553             /* Use isFOO_lc() for characters within Latin1.  (Note that
6554              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6555              * wouldn't be invariant) */
6556             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6557                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6558                     sayNO;
6559                 }
6560
6561                 locinput++;
6562                 break;
6563             }
6564
6565             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6566                 /* An above Latin-1 code point, or malformed */
6567                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6568                                                        reginfo->strend);
6569                 goto utf8_posix_above_latin1;
6570             }
6571
6572             /* Here is a UTF-8 variant code point below 256 and the target is
6573              * UTF-8 */
6574             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6575                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6576                                             *(locinput + 1))))))
6577             {
6578                 sayNO;
6579             }
6580
6581             goto increment_locinput;
6582
6583         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6584             to_complement = 1;
6585             /* FALLTHROUGH */
6586
6587         case POSIXD:    /* \w or [:punct:] etc. under /d */
6588             if (utf8_target) {
6589                 goto utf8_posix;
6590             }
6591             goto posixa;
6592
6593         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6594
6595             if (NEXTCHR_IS_EOS) {
6596                 sayNO;
6597             }
6598
6599             /* All UTF-8 variants match */
6600             if (! UTF8_IS_INVARIANT(nextchr)) {
6601                 goto increment_locinput;
6602             }
6603
6604             to_complement = 1;
6605             goto join_nposixa;
6606
6607         case POSIXA:    /* \w or [:punct:] etc. under /a */
6608
6609           posixa:
6610             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6611              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6612              * character is a single byte */
6613
6614             if (NEXTCHR_IS_EOS) {
6615                 sayNO;
6616             }
6617
6618           join_nposixa:
6619
6620             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6621                                                                 FLAGS(scan)))))
6622             {
6623                 sayNO;
6624             }
6625
6626             /* Here we are either not in utf8, or we matched a utf8-invariant,
6627              * so the next char is the next byte */
6628             locinput++;
6629             break;
6630
6631         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6632             to_complement = 1;
6633             /* FALLTHROUGH */
6634
6635         case POSIXU:    /* \w or [:punct:] etc. under /u */
6636           utf8_posix:
6637             if (NEXTCHR_IS_EOS) {
6638                 sayNO;
6639             }
6640
6641             /* Use _generic_isCC() for characters within Latin1.  (Note that
6642              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6643              * wouldn't be invariant) */
6644             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6645                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6646                                                            FLAGS(scan)))))
6647                 {
6648                     sayNO;
6649                 }
6650                 locinput++;
6651             }
6652             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6653                 if (! (to_complement
6654                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6655                                                                *(locinput + 1)),
6656                                              FLAGS(scan)))))
6657                 {
6658                     sayNO;
6659                 }
6660                 locinput += 2;
6661             }
6662             else {  /* Handle above Latin-1 code points */
6663               utf8_posix_above_latin1:
6664                 classnum = (_char_class_number) FLAGS(scan);
6665                 if (classnum < _FIRST_NON_SWASH_CC) {
6666
6667                     /* Here, uses a swash to find such code points.  Load if if
6668                      * not done already */
6669                     if (! PL_utf8_swash_ptrs[classnum]) {
6670                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6671                         PL_utf8_swash_ptrs[classnum]
6672                                 = _core_swash_init("utf8",
6673                                         "",
6674                                         &PL_sv_undef, 1, 0,
6675                                         PL_XPosix_ptrs[classnum], &flags);
6676                     }
6677                     if (! (to_complement
6678                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6679                                                (U8 *) locinput, TRUE))))
6680                     {
6681                         sayNO;
6682                     }
6683                 }
6684                 else {  /* Here, uses macros to find above Latin-1 code points */
6685                     switch (classnum) {
6686                         case _CC_ENUM_SPACE:
6687                             if (! (to_complement
6688                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
6689                             {
6690                                 sayNO;
6691                             }
6692                             break;
6693                         case _CC_ENUM_BLANK:
6694                             if (! (to_complement
6695                                             ^ cBOOL(is_HORIZWS_high(locinput))))
6696                             {
6697                                 sayNO;
6698                             }
6699                             break;
6700                         case _CC_ENUM_XDIGIT:
6701                             if (! (to_complement
6702                                             ^ cBOOL(is_XDIGIT_high(locinput))))
6703                             {
6704                                 sayNO;
6705                             }
6706                             break;
6707                         case _CC_ENUM_VERTSPACE:
6708                             if (! (to_complement
6709                                             ^ cBOOL(is_VERTWS_high(locinput))))
6710                             {
6711                                 sayNO;
6712                             }
6713                             break;
6714                         default:    /* The rest, e.g. [:cntrl:], can't match
6715                                        above Latin1 */
6716                             if (! to_complement) {
6717                                 sayNO;
6718                             }
6719                             break;
6720                     }
6721                 }
6722                 locinput += UTF8SKIP(locinput);
6723             }
6724             break;
6725
6726         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6727                        a Unicode extended Grapheme Cluster */
6728             if (NEXTCHR_IS_EOS)
6729                 sayNO;
6730             if  (! utf8_target) {
6731
6732                 /* Match either CR LF  or '.', as all the other possibilities
6733                  * require utf8 */
6734                 locinput++;         /* Match the . or CR */
6735                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6736                                        match the LF */
6737                     && locinput < reginfo->strend
6738                     && UCHARAT(locinput) == '\n')
6739                 {
6740                     locinput++;
6741                 }
6742             }
6743             else {
6744
6745                 /* Get the gcb type for the current character */
6746                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6747                                                        (U8*) reginfo->strend);
6748
6749                 /* Then scan through the input until we get to the first
6750                  * character whose type is supposed to be a gcb with the
6751                  * current character.  (There is always a break at the
6752                  * end-of-input) */
6753                 locinput += UTF8SKIP(locinput);
6754                 while (locinput < reginfo->strend) {
6755                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6756                                                          (U8*) reginfo->strend);
6757                     if (isGCB(prev_gcb, cur_gcb,
6758                               (U8*) reginfo->strbeg, (U8*) locinput,
6759                               utf8_target))
6760                     {
6761                         break;
6762                     }
6763
6764                     prev_gcb = cur_gcb;
6765                     locinput += UTF8SKIP(locinput);
6766                 }
6767
6768
6769             }
6770             break;
6771
6772         case NREFFL:  /*  /\g{name}/il  */
6773         {   /* The capture buffer cases.  The ones beginning with N for the
6774                named buffers just convert to the equivalent numbered and
6775                pretend they were called as the corresponding numbered buffer
6776                op.  */
6777             /* don't initialize these in the declaration, it makes C++
6778                unhappy */
6779             const char *s;
6780             char type;
6781             re_fold_t folder;
6782             const U8 *fold_array;
6783             UV utf8_fold_flags;
6784
6785             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6786             folder = foldEQ_locale;
6787             fold_array = PL_fold_locale;
6788             type = REFFL;
6789             utf8_fold_flags = FOLDEQ_LOCALE;
6790             goto do_nref;
6791
6792         case NREFFA:  /*  /\g{name}/iaa  */
6793             folder = foldEQ_latin1;
6794             fold_array = PL_fold_latin1;
6795             type = REFFA;
6796             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6797             goto do_nref;
6798
6799         case NREFFU:  /*  /\g{name}/iu  */
6800             folder = foldEQ_latin1;
6801             fold_array = PL_fold_latin1;
6802             type = REFFU;
6803             utf8_fold_flags = 0;
6804             goto do_nref;
6805
6806         case NREFF:  /*  /\g{name}/i  */
6807             folder = foldEQ;
6808             fold_array = PL_fold;
6809             type = REFF;
6810             utf8_fold_flags = 0;
6811             goto do_nref;
6812
6813         case NREF:  /*  /\g{name}/   */
6814             type = REF;
6815             folder = NULL;
6816             fold_array = NULL;
6817             utf8_fold_flags = 0;
6818           do_nref:
6819
6820             /* For the named back references, find the corresponding buffer
6821              * number */
6822             n = reg_check_named_buff_matched(rex,scan);
6823
6824             if ( ! n ) {
6825                 sayNO;
6826             }
6827             goto do_nref_ref_common;
6828
6829         case REFFL:  /*  /\1/il  */
6830             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6831             folder = foldEQ_locale;
6832             fold_array = PL_fold_locale;
6833             utf8_fold_flags = FOLDEQ_LOCALE;
6834             goto do_ref;
6835
6836         case REFFA:  /*  /\1/iaa  */
6837             folder = foldEQ_latin1;
6838             fold_array = PL_fold_latin1;
6839             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6840             goto do_ref;
6841
6842         case REFFU:  /*  /\1/iu  */
6843             folder = foldEQ_latin1;
6844             fold_array = PL_fold_latin1;
6845             utf8_fold_flags = 0;
6846             goto do_ref;
6847
6848         case REFF:  /*  /\1/i  */
6849             folder = foldEQ;
6850             fold_array = PL_fold;
6851             utf8_fold_flags = 0;
6852             goto do_ref;
6853
6854         case REF:  /*  /\1/    */
6855             folder = NULL;
6856             fold_array = NULL;
6857             utf8_fold_flags = 0;
6858
6859           do_ref:
6860             type = OP(scan);
6861             n = ARG(scan);  /* which paren pair */
6862
6863           do_nref_ref_common:
6864             ln = rex->offs[n].start;
6865             endref = rex->offs[n].end;
6866             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6867             if (rex->lastparen < n || ln == -1 || endref == -1)
6868                 sayNO;                  /* Do not match unless seen CLOSEn. */
6869             if (ln == endref)
6870                 break;
6871
6872             s = reginfo->strbeg + ln;
6873             if (type != REF     /* REF can do byte comparison */
6874                 && (utf8_target || type == REFFU || type == REFFL))
6875             {
6876                 char * limit = reginfo->strend;
6877
6878                 /* This call case insensitively compares the entire buffer
6879                     * at s, with the current input starting at locinput, but
6880                     * not going off the end given by reginfo->strend, and
6881                     * returns in <limit> upon success, how much of the
6882                     * current input was matched */
6883                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
6884                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6885                 {
6886                     sayNO;
6887                 }
6888                 locinput = limit;
6889                 break;
6890             }
6891
6892             /* Not utf8:  Inline the first character, for speed. */
6893             if (!NEXTCHR_IS_EOS &&
6894                 UCHARAT(s) != nextchr &&
6895                 (type == REF ||
6896                  UCHARAT(s) != fold_array[nextchr]))
6897                 sayNO;
6898             ln = endref - ln;
6899             if (locinput + ln > reginfo->strend)
6900                 sayNO;
6901             if (ln > 1 && (type == REF
6902                            ? memNE(s, locinput, ln)
6903                            : ! folder(s, locinput, ln)))
6904                 sayNO;
6905             locinput += ln;
6906             break;
6907         }
6908
6909         case NOTHING: /* null op; e.g. the 'nothing' following
6910                        * the '*' in m{(a+|b)*}' */
6911             break;
6912         case TAIL: /* placeholder while compiling (A|B|C) */
6913             break;
6914
6915 #undef  ST
6916 #define ST st->u.eval
6917 #define CUR_EVAL cur_eval->u.eval
6918
6919         {
6920             SV *ret;
6921             REGEXP *re_sv;
6922             regexp *re;
6923             regexp_internal *rei;
6924             regnode *startpoint;
6925             U32 arg;
6926
6927         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6928             arg= (U32)ARG(scan);
6929             if (cur_eval && cur_eval->locinput == locinput) {
6930                 if ( ++nochange_depth > max_nochange_depth )
6931                     Perl_croak(aTHX_
6932                         "Pattern subroutine nesting without pos change"
6933                         " exceeded limit in regex");
6934             } else {
6935                 nochange_depth = 0;
6936             }
6937             re_sv = rex_sv;
6938             re = rex;
6939             rei = rexi;
6940             startpoint = scan + ARG2L(scan);
6941             EVAL_CLOSE_PAREN_SET( st, arg );
6942             /* Detect infinite recursion
6943              *
6944              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6945              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6946              * So we track the position in the string we are at each time
6947              * we recurse and if we try to enter the same routine twice from
6948              * the same position we throw an error.
6949              */
6950             if ( rex->recurse_locinput[arg] == locinput ) {
6951                 /* FIXME: we should show the regop that is failing as part
6952                  * of the error message. */
6953                 Perl_croak(aTHX_ "Infinite recursion in regex");
6954             } else {
6955                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6956                 rex->recurse_locinput[arg]= locinput;
6957
6958                 DEBUG_r({
6959                     GET_RE_DEBUG_FLAGS_DECL;
6960                     DEBUG_STACK_r({
6961                         Perl_re_exec_indentf( aTHX_
6962                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6963                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6964                         );
6965                     });
6966                 });
6967             }
6968
6969             /* Save all the positions seen so far. */
6970             ST.cp = regcppush(rex, 0, maxopenparen);
6971             REGCP_SET(ST.lastcp);
6972
6973             /* and then jump to the code we share with EVAL */
6974             goto eval_recurse_doit;
6975             /* NOTREACHED */
6976
6977         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
6978             if (cur_eval && cur_eval->locinput==locinput) {
6979                 if ( ++nochange_depth > max_nochange_depth )
6980                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6981             } else {
6982                 nochange_depth = 0;
6983             }
6984             {
6985                 /* execute the code in the {...} */
6986
6987                 dSP;
6988                 IV before;
6989                 OP * const oop = PL_op;
6990                 COP * const ocurcop = PL_curcop;
6991                 OP *nop;
6992                 CV *newcv;
6993
6994                 /* save *all* paren positions */
6995                 regcppush(rex, 0, maxopenparen);
6996                 REGCP_SET(ST.lastcp);
6997
6998                 if (!caller_cv)
6999                     caller_cv = find_runcv(NULL);
7000
7001                 n = ARG(scan);
7002
7003                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7004                     newcv = (ReANY(
7005                                     (REGEXP*)(rexi->data->data[n])
7006                             ))->qr_anoncv;
7007                     nop = (OP*)rexi->data->data[n+1];
7008                 }
7009                 else if (rexi->data->what[n] == 'l') { /* literal code */
7010                     newcv = caller_cv;
7011                     nop = (OP*)rexi->data->data[n];
7012                     assert(CvDEPTH(newcv));
7013                 }
7014                 else {
7015                     /* literal with own CV */
7016                     assert(rexi->data->what[n] == 'L');
7017                     newcv = rex->qr_anoncv;
7018                     nop = (OP*)rexi->data->data[n];
7019                 }
7020
7021                 /* Some notes about MULTICALL and the context and save stacks.
7022                  *
7023                  * In something like
7024                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7025                  * since codeblocks don't introduce a new scope (so that
7026                  * local() etc accumulate), at the end of a successful
7027                  * match there will be a SAVEt_CLEARSV on the savestack
7028                  * for each of $x, $y, $z. If the three code blocks above
7029                  * happen to have come from different CVs (e.g. via
7030                  * embedded qr//s), then we must ensure that during any
7031                  * savestack unwinding, PL_comppad always points to the
7032                  * right pad at each moment. We achieve this by
7033                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7034                  * there is a change of pad.
7035                  * In theory whenever we call a code block, we should
7036                  * push a CXt_SUB context, then pop it on return from
7037                  * that code block. This causes a bit of an issue in that
7038                  * normally popping a context also clears the savestack
7039                  * back to cx->blk_oldsaveix, but here we specifically
7040                  * don't want to clear the save stack on exit from the
7041                  * code block.
7042                  * Also for efficiency we don't want to keep pushing and
7043                  * popping the single SUB context as we backtrack etc.
7044                  * So instead, we push a single context the first time
7045                  * we need, it, then hang onto it until the end of this
7046                  * function. Whenever we encounter a new code block, we
7047                  * update the CV etc if that's changed. During the times
7048                  * in this function where we're not executing a code
7049                  * block, having the SUB context still there is a bit
7050                  * naughty - but we hope that no-one notices.
7051                  * When the SUB context is initially pushed, we fake up
7052                  * cx->blk_oldsaveix to be as if we'd pushed this context
7053                  * on first entry to S_regmatch rather than at some random
7054                  * point during the regexe execution. That way if we
7055                  * croak, popping the context stack will ensure that
7056                  * *everything* SAVEd by this function is undone and then
7057                  * the context popped, rather than e.g., popping the
7058                  * context (and restoring the original PL_comppad) then
7059                  * popping more of the savestack and restoring a bad
7060                  * PL_comppad.
7061                  */
7062
7063                 /* If this is the first EVAL, push a MULTICALL. On
7064                  * subsequent calls, if we're executing a different CV, or
7065                  * if PL_comppad has got messed up from backtracking
7066                  * through SAVECOMPPADs, then refresh the context.
7067                  */
7068                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7069                 {
7070                     U8 flags = (CXp_SUB_RE |
7071                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7072                     SAVECOMPPAD();
7073                     if (last_pushed_cv) {
7074                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7075                     }
7076                     else {
7077                         PUSH_MULTICALL_FLAGS(newcv, flags);
7078                     }
7079                     /* see notes above */
7080                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7081
7082                     last_pushed_cv = newcv;
7083                 }
7084                 else {
7085                     /* these assignments are just to silence compiler
7086                      * warnings */
7087                     multicall_cop = NULL;
7088                 }
7089                 last_pad = PL_comppad;
7090
7091                 /* the initial nextstate you would normally execute
7092                  * at the start of an eval (which would cause error
7093                  * messages to come from the eval), may be optimised
7094                  * away from the execution path in the regex code blocks;
7095                  * so manually set PL_curcop to it initially */
7096                 {
7097                     OP *o = cUNOPx(nop)->op_first;
7098                     assert(o->op_type == OP_NULL);
7099                     if (o->op_targ == OP_SCOPE) {
7100                         o = cUNOPo->op_first;
7101                     }
7102                     else {
7103                         assert(o->op_targ == OP_LEAVE);
7104                         o = cUNOPo->op_first;
7105                         assert(o->op_type == OP_ENTER);
7106                         o = OpSIBLING(o);
7107                     }
7108
7109                     if (o->op_type != OP_STUB) {
7110                         assert(    o->op_type == OP_NEXTSTATE
7111                                 || o->op_type == OP_DBSTATE
7112                                 || (o->op_type == OP_NULL
7113                                     &&  (  o->op_targ == OP_NEXTSTATE
7114                                         || o->op_targ == OP_DBSTATE
7115                                         )
7116                                     )
7117                         );
7118                         PL_curcop = (COP*)o;
7119                     }
7120                 }
7121                 nop = nop->op_next;
7122
7123                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7124                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7125
7126                 rex->offs[0].end = locinput - reginfo->strbeg;
7127                 if (reginfo->info_aux_eval->pos_magic)
7128                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7129                                   reginfo->sv, reginfo->strbeg,
7130                                   locinput - reginfo->strbeg);
7131
7132                 if (sv_yes_mark) {
7133                     SV *sv_mrk = get_sv("REGMARK", 1);
7134                     sv_setsv(sv_mrk, sv_yes_mark);
7135                 }
7136
7137                 /* we don't use MULTICALL here as we want to call the
7138                  * first op of the block of interest, rather than the
7139                  * first op of the sub. Also, we don't want to free
7140                  * the savestack frame */
7141                 before = (IV)(SP-PL_stack_base);
7142                 PL_op = nop;
7143                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7144                 SPAGAIN;
7145                 if ((IV)(SP-PL_stack_base) == before)
7146                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7147                 else {
7148                     ret = POPs;
7149                     PUTBACK;
7150                 }
7151
7152                 /* before restoring everything, evaluate the returned
7153                  * value, so that 'uninit' warnings don't use the wrong
7154                  * PL_op or pad. Also need to process any magic vars
7155                  * (e.g. $1) *before* parentheses are restored */
7156
7157                 PL_op = NULL;
7158
7159                 re_sv = NULL;
7160                 if (logical == 0)        /*   (?{})/   */
7161                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7162                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7163                     sw = cBOOL(SvTRUE_NN(ret));
7164                     logical = 0;
7165                 }
7166                 else {                   /*  /(??{})  */
7167                     /*  if its overloaded, let the regex compiler handle
7168                      *  it; otherwise extract regex, or stringify  */
7169                     if (SvGMAGICAL(ret))
7170                         ret = sv_mortalcopy(ret);
7171                     if (!SvAMAGIC(ret)) {
7172                         SV *sv = ret;
7173                         if (SvROK(sv))
7174                             sv = SvRV(sv);
7175                         if (SvTYPE(sv) == SVt_REGEXP)
7176                             re_sv = (REGEXP*) sv;
7177                         else if (SvSMAGICAL(ret)) {
7178                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7179                             if (mg)
7180                                 re_sv = (REGEXP *) mg->mg_obj;
7181                         }
7182
7183                         /* force any undef warnings here */
7184                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7185                             ret = sv_mortalcopy(ret);
7186                             (void) SvPV_force_nolen(ret);
7187                         }
7188                     }
7189
7190                 }
7191
7192                 /* *** Note that at this point we don't restore
7193                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7194                  * be used again soon. This is safe as long as nothing
7195                  * in the regexp code uses the pad ! */
7196                 PL_op = oop;
7197                 PL_curcop = ocurcop;
7198                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7199                 PL_curpm_under = PL_curpm;
7200                 PL_curpm = PL_reg_curpm;
7201
7202                 if (logical != 2) {
7203                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7204                     /* NOTREACHED */
7205                 }
7206             }
7207
7208                 /* only /(??{})/  from now on */
7209                 logical = 0;
7210                 {
7211                     /* extract RE object from returned value; compiling if
7212                      * necessary */
7213
7214                     if (re_sv) {
7215                         re_sv = reg_temp_copy(NULL, re_sv);
7216                     }
7217                     else {
7218                         U32 pm_flags = 0;
7219
7220                         if (SvUTF8(ret) && IN_BYTES) {
7221                             /* In use 'bytes': make a copy of the octet
7222                              * sequence, but without the flag on */
7223                             STRLEN len;
7224                             const char *const p = SvPV(ret, len);
7225                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7226                         }
7227                         if (rex->intflags & PREGf_USE_RE_EVAL)
7228                             pm_flags |= PMf_USE_RE_EVAL;
7229
7230                         /* if we got here, it should be an engine which
7231                          * supports compiling code blocks and stuff */
7232                         assert(rex->engine && rex->engine->op_comp);
7233                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7234                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7235                                     rex->engine, NULL, NULL,
7236                                     /* copy /msixn etc to inner pattern */
7237                                     ARG2L(scan),
7238                                     pm_flags);
7239
7240                         if (!(SvFLAGS(ret)
7241                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7242                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7243                             /* This isn't a first class regexp. Instead, it's
7244                                caching a regexp onto an existing, Perl visible
7245                                scalar.  */
7246                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7247                         }
7248                     }
7249                     SAVEFREESV(re_sv);
7250                     re = ReANY(re_sv);
7251                 }
7252                 RXp_MATCH_COPIED_off(re);
7253                 re->subbeg = rex->subbeg;
7254                 re->sublen = rex->sublen;
7255                 re->suboffset = rex->suboffset;
7256                 re->subcoffset = rex->subcoffset;
7257                 re->lastparen = 0;
7258                 re->lastcloseparen = 0;
7259                 rei = RXi_GET(re);
7260                 DEBUG_EXECUTE_r(
7261                     debug_start_match(re_sv, utf8_target, locinput,
7262                                     reginfo->strend, "Matching embedded");
7263                 );
7264                 startpoint = rei->program + 1;
7265                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7266                                              * close_paren only for GOSUB */
7267                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7268                 /* Save all the seen positions so far. */
7269                 ST.cp = regcppush(rex, 0, maxopenparen);
7270                 REGCP_SET(ST.lastcp);
7271                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7272                 maxopenparen = 0;
7273                 /* run the pattern returned from (??{...}) */
7274
7275               eval_recurse_doit: /* Share code with GOSUB below this line
7276                             * At this point we expect the stack context to be
7277                             * set up correctly */
7278
7279                 /* invalidate the S-L poscache. We're now executing a
7280                  * different set of WHILEM ops (and their associated
7281                  * indexes) against the same string, so the bits in the
7282                  * cache are meaningless. Setting maxiter to zero forces
7283                  * the cache to be invalidated and zeroed before reuse.
7284                  * XXX This is too dramatic a measure. Ideally we should
7285                  * save the old cache and restore when running the outer
7286                  * pattern again */
7287                 reginfo->poscache_maxiter = 0;
7288
7289                 /* the new regexp might have a different is_utf8_pat than we do */
7290                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7291
7292                 ST.prev_rex = rex_sv;
7293                 ST.prev_curlyx = cur_curlyx;
7294                 rex_sv = re_sv;
7295                 SET_reg_curpm(rex_sv);
7296                 rex = re;
7297                 rexi = rei;
7298                 cur_curlyx = NULL;
7299                 ST.B = next;
7300                 ST.prev_eval = cur_eval;
7301                 cur_eval = st;
7302                 /* now continue from first node in postoned RE */
7303                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7304                 NOT_REACHED; /* NOTREACHED */
7305         }
7306
7307         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7308             /* note: this is called twice; first after popping B, then A */
7309             DEBUG_STACK_r({
7310                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7311                     depth, cur_eval, ST.prev_eval);
7312             });
7313
7314 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7315             if ( cur_eval && CUR_EVAL.close_paren ) {\
7316                 DEBUG_STACK_r({ \
7317                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7318                         depth,    \
7319                         CUR_EVAL.close_paren - 1,\
7320                         cur_eval, \
7321                         VAL);     \
7322                 });               \
7323                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7324             }
7325
7326             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7327
7328             rex_sv = ST.prev_rex;
7329             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7330             SET_reg_curpm(rex_sv);
7331             rex = ReANY(rex_sv);
7332             rexi = RXi_GET(rex);
7333             {
7334                 /* preserve $^R across LEAVE's. See Bug 121070. */
7335                 SV *save_sv= GvSV(PL_replgv);
7336                 SvREFCNT_inc(save_sv);
7337                 regcpblow(ST.cp); /* LEAVE in disguise */
7338                 sv_setsv(GvSV(PL_replgv), save_sv);
7339                 SvREFCNT_dec(save_sv);
7340             }
7341             cur_eval = ST.prev_eval;
7342             cur_curlyx = ST.prev_curlyx;
7343
7344             /* Invalidate cache. See "invalidate" comment above. */
7345             reginfo->poscache_maxiter = 0;
7346             if ( nochange_depth )
7347                 nochange_depth--;
7348
7349             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7350             sayYES;
7351
7352
7353         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7354             REGCP_UNWIND(ST.lastcp);
7355             sayNO;
7356
7357         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7358             /* note: this is called twice; first after popping B, then A */
7359             DEBUG_STACK_r({
7360                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7361                     depth, cur_eval, ST.prev_eval);
7362             });
7363
7364             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7365
7366             rex_sv = ST.prev_rex;
7367             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7368             SET_reg_curpm(rex_sv);
7369             rex = ReANY(rex_sv);
7370             rexi = RXi_GET(rex);
7371
7372             REGCP_UNWIND(ST.lastcp);
7373             regcppop(rex, &maxopenparen);
7374             cur_eval = ST.prev_eval;
7375             cur_curlyx = ST.prev_curlyx;
7376
7377             /* Invalidate cache. See "invalidate" comment above. */
7378             reginfo->poscache_maxiter = 0;
7379             if ( nochange_depth )
7380                 nochange_depth--;
7381
7382             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7383             sayNO_SILENT;
7384 #undef ST
7385
7386         case OPEN: /*  (  */
7387             n = ARG(scan);  /* which paren pair */
7388             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7389             if (n > maxopenparen)
7390                 maxopenparen = n;
7391             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7392                 "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7393                 depth,
7394                 PTR2UV(rex),
7395                 PTR2UV(rex->offs),
7396                 (UV)n,
7397                 (IV)rex->offs[n].start_tmp,
7398                 (UV)maxopenparen
7399             ));
7400             lastopen = n;
7401             break;
7402
7403         case SROPEN: /*  (*SCRIPT_RUN:  */
7404             script_run_begin = (U8 *) locinput;
7405             break;
7406
7407 /* XXX really need to log other places start/end are set too */
7408 #define CLOSE_CAPTURE                                                      \
7409     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7410     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7411     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
7412         "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
7413         depth,                                                             \
7414         PTR2UV(rex),                                                       \
7415         PTR2UV(rex->offs),                                                 \
7416         (UV)n,                                                             \
7417         (IV)rex->offs[n].start,                                            \
7418         (IV)rex->offs[n].end                                               \
7419     ))
7420
7421         case CLOSE:  /*  )  */
7422             n = ARG(scan);  /* which paren pair */
7423             CLOSE_CAPTURE;
7424             if (n > rex->lastparen)
7425                 rex->lastparen = n;
7426             rex->lastcloseparen = n;
7427             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7428                 goto fake_end;
7429
7430             break;
7431
7432         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7433
7434             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7435             {
7436                 sayNO;
7437             }
7438
7439             break;
7440
7441
7442         case ACCEPT:  /*  (*ACCEPT)  */
7443             if (scan->flags)
7444                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7445             if (ARG2L(scan)){
7446                 regnode *cursor;
7447                 for (cursor=scan;
7448                      cursor && OP(cursor)!=END;
7449                      cursor=regnext(cursor))
7450                 {
7451                     if ( OP(cursor)==CLOSE ){
7452                         n = ARG(cursor);
7453                         if ( n <= lastopen ) {
7454                             CLOSE_CAPTURE;
7455                             if (n > rex->lastparen)
7456                                 rex->lastparen = n;
7457                             rex->lastcloseparen = n;
7458                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7459                                 break;
7460                         }
7461                     }
7462                 }
7463             }
7464             goto fake_end;
7465             /* NOTREACHED */
7466
7467         case GROUPP:  /*  (?(1))  */
7468             n = ARG(scan);  /* which paren pair */
7469             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7470             break;
7471
7472         case NGROUPP:  /*  (?(<name>))  */
7473             /* reg_check_named_buff_matched returns 0 for no match */
7474             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7475             break;
7476
7477         case INSUBP:   /*  (?(R))  */
7478             n = ARG(scan);
7479             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7480              * of SCAN is already set up as matches a eval.close_paren */
7481             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7482             break;
7483
7484         case DEFINEP:  /*  (?(DEFINE))  */
7485             sw = 0;
7486             break;
7487
7488         case IFTHEN:   /*  (?(cond)A|B)  */
7489             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7490             if (sw)
7491                 next = NEXTOPER(NEXTOPER(scan));
7492             else {
7493                 next = scan + ARG(scan);
7494                 if (OP(next) == IFTHEN) /* Fake one. */
7495                     next = NEXTOPER(NEXTOPER(next));
7496             }
7497             break;
7498
7499         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7500             logical = scan->flags;
7501             break;
7502
7503 /*******************************************************************
7504
7505 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7506 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7507 STAR/PLUS/CURLY/CURLYN are used instead.)
7508
7509 A*B is compiled as <CURLYX><A><WHILEM><B>
7510
7511 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7512 state, which contains the current count, initialised to -1. It also sets
7513 cur_curlyx to point to this state, with any previous value saved in the
7514 state block.
7515
7516 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7517 since the pattern may possibly match zero times (i.e. it's a while {} loop
7518 rather than a do {} while loop).
7519
7520 Each entry to WHILEM represents a successful match of A. The count in the
7521 CURLYX block is incremented, another WHILEM state is pushed, and execution
7522 passes to A or B depending on greediness and the current count.
7523
7524 For example, if matching against the string a1a2a3b (where the aN are
7525 substrings that match /A/), then the match progresses as follows: (the
7526 pushed states are interspersed with the bits of strings matched so far):
7527
7528     <CURLYX cnt=-1>
7529     <CURLYX cnt=0><WHILEM>
7530     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7531     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7532     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7533     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7534
7535 (Contrast this with something like CURLYM, which maintains only a single
7536 backtrack state:
7537
7538     <CURLYM cnt=0> a1
7539     a1 <CURLYM cnt=1> a2
7540     a1 a2 <CURLYM cnt=2> a3
7541     a1 a2 a3 <CURLYM cnt=3> b
7542 )
7543
7544 Each WHILEM state block marks a point to backtrack to upon partial failure
7545 of A or B, and also contains some minor state data related to that
7546 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7547 overall state, such as the count, and pointers to the A and B ops.
7548
7549 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7550 must always point to the *current* CURLYX block, the rules are:
7551
7552 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7553 and set cur_curlyx to point the new block.
7554
7555 When popping the CURLYX block after a successful or unsuccessful match,
7556 restore the previous cur_curlyx.
7557
7558 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7559 to the outer one saved in the CURLYX block.
7560
7561 When popping the WHILEM block after a successful or unsuccessful B match,
7562 restore the previous cur_curlyx.
7563
7564 Here's an example for the pattern (AI* BI)*BO
7565 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7566
7567 cur_
7568 curlyx backtrack stack
7569 ------ ---------------
7570 NULL
7571 CO     <CO prev=NULL> <WO>
7572 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7573 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7574 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7575
7576 At this point the pattern succeeds, and we work back down the stack to
7577 clean up, restoring as we go:
7578
7579 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7580 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7581 CO     <CO prev=NULL> <WO>
7582 NULL
7583
7584 *******************************************************************/
7585
7586 #define ST st->u.curlyx
7587
7588         case CURLYX:    /* start of /A*B/  (for complex A) */
7589         {
7590             /* No need to save/restore up to this paren */
7591             I32 parenfloor = scan->flags;
7592
7593             assert(next); /* keep Coverity happy */
7594             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7595                 next += ARG(next);
7596
7597             /* XXXX Probably it is better to teach regpush to support
7598                parenfloor > maxopenparen ... */
7599             if (parenfloor > (I32)rex->lastparen)
7600                 parenfloor = rex->lastparen; /* Pessimization... */
7601
7602             ST.prev_curlyx= cur_curlyx;
7603             cur_curlyx = st;
7604             ST.cp = PL_savestack_ix;
7605
7606             /* these fields contain the state of the current curly.
7607              * they are accessed by subsequent WHILEMs */
7608             ST.parenfloor = parenfloor;
7609             ST.me = scan;
7610             ST.B = next;
7611             ST.minmod = minmod;
7612             minmod = 0;
7613             ST.count = -1;      /* this will be updated by WHILEM */
7614             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7615
7616             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7617             NOT_REACHED; /* NOTREACHED */
7618         }
7619
7620         case CURLYX_end: /* just finished matching all of A*B */
7621             cur_curlyx = ST.prev_curlyx;
7622             sayYES;
7623             NOT_REACHED; /* NOTREACHED */
7624
7625         case CURLYX_end_fail: /* just failed to match all of A*B */
7626             regcpblow(ST.cp);
7627             cur_curlyx = ST.prev_curlyx;
7628             sayNO;
7629             NOT_REACHED; /* NOTREACHED */
7630
7631
7632 #undef ST
7633 #define ST st->u.whilem
7634
7635         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7636         {
7637             /* see the discussion above about CURLYX/WHILEM */
7638             I32 n;
7639             int min, max;
7640             regnode *A;
7641
7642             assert(cur_curlyx); /* keep Coverity happy */
7643
7644             min = ARG1(cur_curlyx->u.curlyx.me);
7645             max = ARG2(cur_curlyx->u.curlyx.me);
7646             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7647             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7648             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7649             ST.cache_offset = 0;
7650             ST.cache_mask = 0;
7651
7652
7653             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
7654                   depth, (long)n, min, max)
7655             );
7656
7657             /* First just match a string of min A's. */
7658
7659             if (n < min) {
7660                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7661                 cur_curlyx->u.curlyx.lastloc = locinput;
7662                 REGCP_SET(ST.lastcp);
7663
7664                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7665                 NOT_REACHED; /* NOTREACHED */
7666             }
7667
7668             /* If degenerate A matches "", assume A done. */
7669
7670             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7671                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
7672                    depth)
7673                 );
7674                 goto do_whilem_B_max;
7675             }
7676
7677             /* super-linear cache processing.
7678              *
7679              * The idea here is that for certain types of CURLYX/WHILEM -
7680              * principally those whose upper bound is infinity (and
7681              * excluding regexes that have things like \1 and other very
7682              * non-regular expresssiony things), then if a pattern like
7683              * /....A*.../ fails and we backtrack to the WHILEM, then we
7684              * make a note that this particular WHILEM op was at string
7685              * position 47 (say) when the rest of pattern failed. Then, if
7686              * we ever find ourselves back at that WHILEM, and at string
7687              * position 47 again, we can just fail immediately rather than
7688              * running the rest of the pattern again.
7689              *
7690              * This is very handy when patterns start to go
7691              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7692              * with a combinatorial explosion of backtracking.
7693              *
7694              * The cache is implemented as a bit array, with one bit per
7695              * string byte position per WHILEM op (up to 16) - so its
7696              * between 0.25 and 2x the string size.
7697              *
7698              * To avoid allocating a poscache buffer every time, we do an
7699              * initially countdown; only after we have  executed a WHILEM
7700              * op (string-length x #WHILEMs) times do we allocate the
7701              * cache.
7702              *
7703              * The top 4 bits of scan->flags byte say how many different
7704              * relevant CURLLYX/WHILEM op pairs there are, while the
7705              * bottom 4-bits is the identifying index number of this
7706              * WHILEM.
7707              */
7708
7709             if (scan->flags) {
7710
7711                 if (!reginfo->poscache_maxiter) {
7712                     /* start the countdown: Postpone detection until we
7713                      * know the match is not *that* much linear. */
7714                     reginfo->poscache_maxiter
7715                         =    (reginfo->strend - reginfo->strbeg + 1)
7716                            * (scan->flags>>4);
7717                     /* possible overflow for long strings and many CURLYX's */
7718                     if (reginfo->poscache_maxiter < 0)
7719                         reginfo->poscache_maxiter = I32_MAX;
7720                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7721                 }
7722
7723                 if (reginfo->poscache_iter-- == 0) {
7724                     /* initialise cache */
7725                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7726                     regmatch_info_aux *const aux = reginfo->info_aux;
7727                     if (aux->poscache) {
7728                         if ((SSize_t)reginfo->poscache_size < size) {
7729                             Renew(aux->poscache, size, char);
7730                             reginfo->poscache_size = size;
7731                         }
7732                         Zero(aux->poscache, size, char);
7733                     }
7734                     else {
7735                         reginfo->poscache_size = size;
7736                         Newxz(aux->poscache, size, char);
7737                     }
7738                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7739       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7740                               PL_colors[4], PL_colors[5])
7741                     );
7742                 }
7743
7744                 if (reginfo->poscache_iter < 0) {
7745                     /* have we already failed at this position? */
7746                     SSize_t offset, mask;
7747
7748                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7749                     offset  = (scan->flags & 0xf) - 1
7750                                 +   (locinput - reginfo->strbeg)
7751                                   * (scan->flags>>4);
7752                     mask    = 1 << (offset % 8);
7753                     offset /= 8;
7754                     if (reginfo->info_aux->poscache[offset] & mask) {
7755                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
7756                             depth)
7757                         );
7758                         cur_curlyx->u.curlyx.count--;
7759                         sayNO; /* cache records failure */
7760                     }
7761                     ST.cache_offset = offset;
7762                     ST.cache_mask   = mask;
7763                 }
7764             }
7765
7766             /* Prefer B over A for minimal matching. */
7767
7768             if (cur_curlyx->u.curlyx.minmod) {
7769                 ST.save_curlyx = cur_curlyx;
7770                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7771                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7772                                     locinput);
7773                 NOT_REACHED; /* NOTREACHED */
7774             }
7775
7776             /* Prefer A over B for maximal matching. */
7777
7778             if (n < max) { /* More greed allowed? */
7779                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7780                             maxopenparen);
7781                 cur_curlyx->u.curlyx.lastloc = locinput;
7782                 REGCP_SET(ST.lastcp);
7783                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7784                 NOT_REACHED; /* NOTREACHED */
7785             }
7786             goto do_whilem_B_max;
7787         }
7788         NOT_REACHED; /* NOTREACHED */
7789
7790         case WHILEM_B_min: /* just matched B in a minimal match */
7791         case WHILEM_B_max: /* just matched B in a maximal match */
7792             cur_curlyx = ST.save_curlyx;
7793             sayYES;
7794             NOT_REACHED; /* NOTREACHED */
7795
7796         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7797             cur_curlyx = ST.save_curlyx;
7798             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7799             cur_curlyx->u.curlyx.count--;
7800             CACHEsayNO;
7801             NOT_REACHED; /* NOTREACHED */
7802
7803         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7804             REGCP_UNWIND(ST.lastcp);
7805             regcppop(rex, &maxopenparen);
7806             /* FALLTHROUGH */
7807         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7808             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7809             cur_curlyx->u.curlyx.count--;
7810             CACHEsayNO;
7811             NOT_REACHED; /* NOTREACHED */
7812
7813         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7814             REGCP_UNWIND(ST.lastcp);
7815             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7816             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
7817                 depth)
7818             );
7819           do_whilem_B_max:
7820             if (cur_curlyx->u.curlyx.count >= REG_INFTY
7821                 && ckWARN(WARN_REGEXP)
7822                 && !reginfo->warned)
7823             {
7824                 reginfo->warned = TRUE;
7825                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7826                      "Complex regular subexpression recursion limit (%d) "
7827                      "exceeded",
7828                      REG_INFTY - 1);
7829             }
7830
7831             /* now try B */
7832             ST.save_curlyx = cur_curlyx;
7833             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7834             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7835                                 locinput);
7836             NOT_REACHED; /* NOTREACHED */
7837
7838         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7839             cur_curlyx = ST.save_curlyx;
7840
7841             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7842                 /* Maximum greed exceeded */
7843                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7844                     && ckWARN(WARN_REGEXP)
7845                     && !reginfo->warned)
7846                 {
7847                     reginfo->warned     = TRUE;
7848                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7849                         "Complex regular subexpression recursion "
7850                         "limit (%d) exceeded",
7851                         REG_INFTY - 1);
7852                 }
7853                 cur_curlyx->u.curlyx.count--;
7854                 CACHEsayNO;
7855             }
7856
7857             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
7858             );
7859             /* Try grabbing another A and see if it helps. */
7860             cur_curlyx->u.curlyx.lastloc = locinput;
7861             PUSH_STATE_GOTO(WHILEM_A_min,
7862                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7863                 locinput);
7864             NOT_REACHED; /* NOTREACHED */
7865
7866 #undef  ST
7867 #define ST st->u.branch
7868
7869         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
7870             next = scan + ARG(scan);
7871             if (next == scan)
7872                 next = NULL;
7873             scan = NEXTOPER(scan);
7874             /* FALLTHROUGH */
7875
7876         case BRANCH:        /*  /(...|A|...)/ */
7877             scan = NEXTOPER(scan); /* scan now points to inner node */
7878             ST.lastparen = rex->lastparen;
7879             ST.lastcloseparen = rex->lastcloseparen;
7880             ST.next_branch = next;
7881             REGCP_SET(ST.cp);
7882
7883             /* Now go into the branch */
7884             if (has_cutgroup) {
7885                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7886             } else {
7887                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7888             }
7889             NOT_REACHED; /* NOTREACHED */
7890
7891         case CUTGROUP:  /*  /(*THEN)/  */
7892             sv_yes_mark = st->u.mark.mark_name = scan->flags
7893                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7894                 : NULL;
7895             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7896             NOT_REACHED; /* NOTREACHED */
7897
7898         case CUTGROUP_next_fail:
7899             do_cutgroup = 1;
7900             no_final = 1;
7901             if (st->u.mark.mark_name)
7902                 sv_commit = st->u.mark.mark_name;
7903             sayNO;
7904             NOT_REACHED; /* NOTREACHED */
7905
7906         case BRANCH_next:
7907             sayYES;
7908             NOT_REACHED; /* NOTREACHED */
7909
7910         case BRANCH_next_fail: /* that branch failed; try the next, if any */
7911             if (do_cutgroup) {
7912                 do_cutgroup = 0;
7913                 no_final = 0;
7914             }
7915             REGCP_UNWIND(ST.cp);
7916             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7917             scan = ST.next_branch;
7918             /* no more branches? */
7919             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7920                 DEBUG_EXECUTE_r({
7921                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
7922                         depth,
7923                         PL_colors[4],
7924                         PL_colors[5] );
7925                 });
7926                 sayNO_SILENT;
7927             }
7928             continue; /* execute next BRANCH[J] op */
7929             /* NOTREACHED */
7930
7931         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7932             minmod = 1;
7933             break;
7934
7935 #undef  ST
7936 #define ST st->u.curlym
7937
7938         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7939
7940             /* This is an optimisation of CURLYX that enables us to push
7941              * only a single backtracking state, no matter how many matches
7942              * there are in {m,n}. It relies on the pattern being constant
7943              * length, with no parens to influence future backrefs
7944              */
7945
7946             ST.me = scan;
7947             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7948
7949             ST.lastparen      = rex->lastparen;
7950             ST.lastcloseparen = rex->lastcloseparen;
7951
7952             /* if paren positive, emulate an OPEN/CLOSE around A */
7953             if (ST.me->flags) {
7954                 U32 paren = ST.me->flags;
7955                 if (paren > maxopenparen)
7956                     maxopenparen = paren;
7957                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7958             }
7959             ST.A = scan;
7960             ST.B = next;
7961             ST.alen = 0;
7962             ST.count = 0;
7963             ST.minmod = minmod;
7964             minmod = 0;
7965             ST.c1 = CHRTEST_UNINIT;
7966             REGCP_SET(ST.cp);
7967
7968             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7969                 goto curlym_do_B;
7970
7971           curlym_do_A: /* execute the A in /A{m,n}B/  */
7972             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7973             NOT_REACHED; /* NOTREACHED */
7974
7975         case CURLYM_A: /* we've just matched an A */
7976             ST.count++;
7977             /* after first match, determine A's length: u.curlym.alen */
7978             if (ST.count == 1) {
7979                 if (reginfo->is_utf8_target) {
7980                     char *s = st->locinput;
7981                     while (s < locinput) {
7982                         ST.alen++;
7983                         s += UTF8SKIP(s);
7984                     }
7985                 }
7986                 else {
7987                     ST.alen = locinput - st->locinput;
7988                 }
7989                 if (ST.alen == 0)
7990                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7991             }
7992             DEBUG_EXECUTE_r(
7993                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
7994                           depth, (IV) ST.count, (IV)ST.alen)
7995             );
7996
7997             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7998                 goto fake_end;
7999
8000             {
8001                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8002                 if ( max == REG_INFTY || ST.count < max )
8003                     goto curlym_do_A; /* try to match another A */
8004             }
8005             goto curlym_do_B; /* try to match B */
8006
8007         case CURLYM_A_fail: /* just failed to match an A */
8008             REGCP_UNWIND(ST.cp);
8009
8010
8011             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8012                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8013                 sayNO;
8014
8015           curlym_do_B: /* execute the B in /A{m,n}B/  */
8016             if (ST.c1 == CHRTEST_UNINIT) {
8017                 /* calculate c1 and c2 for possible match of 1st char
8018                  * following curly */
8019                 ST.c1 = ST.c2 = CHRTEST_VOID;
8020                 assert(ST.B);
8021                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8022                     regnode *text_node = ST.B;
8023                     if (! HAS_TEXT(text_node))
8024                         FIND_NEXT_IMPT(text_node);
8025                     /* this used to be
8026
8027                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8028
8029                         But the former is redundant in light of the latter.
8030
8031                         if this changes back then the macro for
8032                         IS_TEXT and friends need to change.
8033                      */
8034                     if (PL_regkind[OP(text_node)] == EXACT) {
8035                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8036                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8037                            reginfo))
8038                         {
8039                             sayNO;
8040                         }
8041                     }
8042                 }
8043             }
8044
8045             DEBUG_EXECUTE_r(
8046                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8047                     depth, (IV)ST.count)
8048                 );
8049             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8050                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8051                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8052                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8053                     {
8054                         /* simulate B failing */
8055                         DEBUG_OPTIMISE_r(
8056                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8057                                 depth,
8058                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8059                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8060                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8061                         );
8062                         state_num = CURLYM_B_fail;
8063                         goto reenter_switch;
8064                     }
8065                 }
8066                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8067                     /* simulate B failing */
8068                     DEBUG_OPTIMISE_r(
8069                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8070                             depth,
8071                             (int) nextchr, ST.c1, ST.c2)
8072                     );
8073                     state_num = CURLYM_B_fail;
8074                     goto reenter_switch;
8075                 }
8076             }
8077
8078             if (ST.me->flags) {
8079                 /* emulate CLOSE: mark current A as captured */
8080                 I32 paren = ST.me->flags;
8081                 if (ST.count) {
8082                     rex->offs[paren].start
8083                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
8084                     rex->offs[paren].end = locinput - reginfo->strbeg;
8085                     if ((U32)paren > rex->lastparen)
8086                         rex->lastparen = paren;
8087                     rex->lastcloseparen = paren;
8088                 }
8089                 else
8090                     rex->offs[paren].end = -1;
8091
8092                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8093                 {
8094                     if (ST.count)
8095                         goto fake_end;
8096                     else
8097                         sayNO;
8098                 }
8099             }
8100
8101             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8102             NOT_REACHED; /* NOTREACHED */
8103
8104         case CURLYM_B_fail: /* just failed to match a B */
8105             REGCP_UNWIND(ST.cp);
8106             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8107             if (ST.minmod) {
8108                 I32 max = ARG2(ST.me);
8109                 if (max != REG_INFTY && ST.count == max)
8110                     sayNO;
8111                 goto curlym_do_A; /* try to match a further A */
8112             }
8113             /* backtrack one A */
8114             if (ST.count == ARG1(ST.me) /* min */)
8115                 sayNO;
8116             ST.count--;
8117             SET_locinput(HOPc(locinput, -ST.alen));
8118             goto curlym_do_B; /* try to match B */
8119
8120 #undef ST
8121 #define ST st->u.curly
8122
8123 #define CURLY_SETPAREN(paren, success) \
8124     if (paren) { \
8125         if (success) { \
8126             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
8127             rex->offs[paren].end = locinput - reginfo->strbeg; \
8128             if (paren > rex->lastparen) \
8129                 rex->lastparen = paren; \
8130             rex->lastcloseparen = paren; \
8131         } \
8132         else { \
8133             rex->offs[paren].end = -1; \
8134             rex->lastparen      = ST.lastparen; \
8135             rex->lastcloseparen = ST.lastcloseparen; \
8136         } \
8137     }
8138
8139         case STAR:              /*  /A*B/ where A is width 1 char */
8140             ST.paren = 0;
8141             ST.min = 0;
8142             ST.max = REG_INFTY;
8143             scan = NEXTOPER(scan);
8144             goto repeat;
8145
8146         case PLUS:              /*  /A+B/ where A is width 1 char */
8147             ST.paren = 0;
8148             ST.min = 1;
8149             ST.max = REG_INFTY;
8150             scan = NEXTOPER(scan);
8151             goto repeat;
8152
8153         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8154             ST.paren = scan->flags;     /* Which paren to set */
8155             ST.lastparen      = rex->lastparen;
8156             ST.lastcloseparen = rex->lastcloseparen;
8157             if (ST.paren > maxopenparen)
8158                 maxopenparen = ST.paren;
8159             ST.min = ARG1(scan);  /* min to match */
8160             ST.max = ARG2(scan);  /* max to match */
8161             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8162             {
8163                 ST.min=1;
8164                 ST.max=1;
8165             }
8166             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8167             goto repeat;
8168
8169         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8170             ST.paren = 0;
8171             ST.min = ARG1(scan);  /* min to match */
8172             ST.max = ARG2(scan);  /* max to match */
8173             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8174           repeat:
8175             /*
8176             * Lookahead to avoid useless match attempts
8177             * when we know what character comes next.
8178             *
8179             * Used to only do .*x and .*?x, but now it allows
8180             * for )'s, ('s and (?{ ... })'s to be in the way
8181             * of the quantifier and the EXACT-like node.  -- japhy
8182             */
8183
8184             assert(ST.min <= ST.max);
8185             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8186                 ST.c1 = ST.c2 = CHRTEST_VOID;
8187             }
8188             else {
8189                 regnode *text_node = next;
8190
8191                 if (! HAS_TEXT(text_node))
8192                     FIND_NEXT_IMPT(text_node);
8193
8194                 if (! HAS_TEXT(text_node))
8195                     ST.c1 = ST.c2 = CHRTEST_VOID;
8196                 else {
8197                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8198                         ST.c1 = ST.c2 = CHRTEST_VOID;
8199                     }
8200                     else {
8201
8202                     /*  Currently we only get here when
8203
8204                         PL_rekind[OP(text_node)] == EXACT
8205
8206                         if this changes back then the macro for IS_TEXT and
8207                         friends need to change. */
8208                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8209                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8210                            reginfo))
8211                         {
8212                             sayNO;
8213                         }
8214                     }
8215                 }
8216             }
8217
8218             ST.A = scan;
8219             ST.B = next;
8220             if (minmod) {
8221                 char *li = locinput;
8222                 minmod = 0;
8223                 if (ST.min &&
8224                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8225                             < ST.min)
8226                     sayNO;
8227                 SET_locinput(li);
8228                 ST.count = ST.min;
8229                 REGCP_SET(ST.cp);
8230                 if (ST.c1 == CHRTEST_VOID)
8231                     goto curly_try_B_min;
8232
8233                 ST.oldloc = locinput;
8234
8235                 /* set ST.maxpos to the furthest point along the
8236                  * string that could possibly match */
8237                 if  (ST.max == REG_INFTY) {
8238                     ST.maxpos = reginfo->strend - 1;
8239                     if (utf8_target)
8240                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8241                             ST.maxpos--;
8242                 }
8243                 else if (utf8_target) {
8244                     int m = ST.max - ST.min;
8245                     for (ST.maxpos = locinput;
8246                          m >0 && ST.maxpos < reginfo->strend; m--)
8247                         ST.maxpos += UTF8SKIP(ST.maxpos);
8248                 }
8249                 else {
8250                     ST.maxpos = locinput + ST.max - ST.min;
8251                     if (ST.maxpos >= reginfo->strend)
8252                         ST.maxpos = reginfo->strend - 1;
8253                 }
8254                 goto curly_try_B_min_known;
8255
8256             }
8257             else {
8258                 /* avoid taking address of locinput, so it can remain
8259                  * a register var */
8260                 char *li = locinput;
8261                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8262                 if (ST.count < ST.min)
8263                     sayNO;
8264                 SET_locinput(li);
8265                 if ((ST.count > ST.min)
8266                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8267                 {
8268                     /* A{m,n} must come at the end of the string, there's
8269                      * no point in backing off ... */
8270                     ST.min = ST.count;
8271                     /* ...except that $ and \Z can match before *and* after
8272                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8273                        We may back off by one in this case. */
8274                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8275                         ST.min--;
8276                 }
8277                 REGCP_SET(ST.cp);
8278                 goto curly_try_B_max;
8279             }
8280             NOT_REACHED; /* NOTREACHED */
8281
8282         case CURLY_B_min_known_fail:
8283             /* failed to find B in a non-greedy match where c1,c2 valid */
8284
8285             REGCP_UNWIND(ST.cp);
8286             if (ST.paren) {
8287                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8288             }
8289             /* Couldn't or didn't -- move forward. */
8290             ST.oldloc = locinput;
8291             if (utf8_target)
8292                 locinput += UTF8SKIP(locinput);
8293             else
8294                 locinput++;
8295             ST.count++;
8296           curly_try_B_min_known:
8297              /* find the next place where 'B' could work, then call B */
8298             {
8299                 int n;
8300                 if (utf8_target) {
8301                     n = (ST.oldloc == locinput) ? 0 : 1;
8302                     if (ST.c1 == ST.c2) {
8303                         /* set n to utf8_distance(oldloc, locinput) */
8304                         while (locinput <= ST.maxpos
8305                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8306                         {
8307                             locinput += UTF8SKIP(locinput);
8308                             n++;
8309                         }
8310                     }
8311                     else {
8312                         /* set n to utf8_distance(oldloc, locinput) */
8313                         while (locinput <= ST.maxpos
8314                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8315                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8316                         {
8317                             locinput += UTF8SKIP(locinput);
8318                             n++;
8319                         }
8320                     }
8321                 }
8322                 else {  /* Not utf8_target */
8323                     if (ST.c1 == ST.c2) {
8324                         locinput = (char *) memchr(locinput,
8325                                                    ST.c1,
8326                                                    ST.maxpos + 1 - locinput);
8327                         if (! locinput) {
8328                             locinput = ST.maxpos + 1;
8329                         }
8330                     }
8331                     else {
8332                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8333
8334                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8335                             while (   locinput <= ST.maxpos
8336                                    && UCHARAT(locinput) != ST.c1
8337                                    && UCHARAT(locinput) != ST.c2)
8338                             {
8339                                 locinput++;
8340                             }
8341                         }
8342                         else {
8343                             /* If c1 and c2 only differ by a single bit, we can
8344                              * avoid a conditional each time through the loop,
8345                              * at the expense of a little preliminary setup and
8346                              * an extra mask each iteration.  By masking out
8347                              * that bit, we match exactly two characters, c1
8348                              * and c2, and so we don't have to test for both.
8349                              * On both ASCII and EBCDIC platforms, most of the
8350                              * ASCII-range and Latin1-range folded equivalents
8351                              * differ only in a single bit, so this is actually
8352                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8353                              * 0x61). */
8354                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8355                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8356                             while (   locinput <= ST.maxpos
8357                                    && (UCHARAT(locinput) & c1_c2_mask)
8358                                                                 != c1_masked)
8359                             {
8360                                 locinput++;
8361                             }
8362                         }
8363                     }
8364                     n = locinput - ST.oldloc;
8365                 }
8366                 if (locinput > ST.maxpos)
8367                     sayNO;
8368                 if (n) {
8369                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8370                      * at b; check that everything between oldloc and
8371                      * locinput matches */
8372                     char *li = ST.oldloc;
8373                     ST.count += n;
8374                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8375                         sayNO;
8376                     assert(n == REG_INFTY || locinput == li);
8377                 }
8378                 CURLY_SETPAREN(ST.paren, ST.count);
8379                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8380                     goto fake_end;
8381                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8382             }
8383             NOT_REACHED; /* NOTREACHED */
8384
8385         case CURLY_B_min_fail:
8386             /* failed to find B in a non-greedy match where c1,c2 invalid */
8387
8388             REGCP_UNWIND(ST.cp);
8389             if (ST.paren) {
8390                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8391             }
8392             /* failed -- move forward one */
8393             {
8394                 char *li = locinput;
8395                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8396                     sayNO;
8397                 }
8398                 locinput = li;
8399             }
8400             {
8401                 ST.count++;
8402                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8403                         ST.count > 0)) /* count overflow ? */
8404                 {
8405                   curly_try_B_min:
8406                     CURLY_SETPAREN(ST.paren, ST.count);
8407                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8408                         goto fake_end;
8409                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8410                 }
8411             }
8412             sayNO;
8413             NOT_REACHED; /* NOTREACHED */
8414
8415           curly_try_B_max:
8416             /* a successful greedy match: now try to match B */
8417             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8418                 goto fake_end;
8419             {
8420                 bool could_match = locinput < reginfo->strend;
8421
8422                 /* If it could work, try it. */
8423                 if (ST.c1 != CHRTEST_VOID && could_match) {
8424                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8425                     {
8426                         could_match = memEQ(locinput,
8427                                             ST.c1_utf8,
8428                                             UTF8SKIP(locinput))
8429                                     || memEQ(locinput,
8430                                              ST.c2_utf8,
8431                                              UTF8SKIP(locinput));
8432                     }
8433                     else {
8434                         could_match = UCHARAT(locinput) == ST.c1
8435                                       || UCHARAT(locinput) == ST.c2;
8436                     }
8437                 }
8438                 if (ST.c1 == CHRTEST_VOID || could_match) {
8439                     CURLY_SETPAREN(ST.paren, ST.count);
8440                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8441                     NOT_REACHED; /* NOTREACHED */
8442                 }
8443             }
8444             /* FALLTHROUGH */
8445
8446         case CURLY_B_max_fail:
8447             /* failed to find B in a greedy match */
8448
8449             REGCP_UNWIND(ST.cp);
8450             if (ST.paren) {
8451                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8452             }
8453             /*  back up. */
8454             if (--ST.count < ST.min)
8455                 sayNO;
8456             locinput = HOPc(locinput, -1);
8457             goto curly_try_B_max;
8458
8459 #undef ST
8460
8461         case END: /*  last op of main pattern  */
8462           fake_end:
8463             if (cur_eval) {
8464                 /* we've just finished A in /(??{A})B/; now continue with B */
8465                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8466                 st->u.eval.prev_rex = rex_sv;           /* inner */
8467
8468                 /* Save *all* the positions. */
8469                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8470                 rex_sv = CUR_EVAL.prev_rex;
8471                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8472                 SET_reg_curpm(rex_sv);
8473                 rex = ReANY(rex_sv);
8474                 rexi = RXi_GET(rex);
8475
8476                 st->u.eval.prev_curlyx = cur_curlyx;
8477                 cur_curlyx = CUR_EVAL.prev_curlyx;
8478
8479                 REGCP_SET(st->u.eval.lastcp);
8480
8481                 /* Restore parens of the outer rex without popping the
8482                  * savestack */
8483                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8484
8485                 st->u.eval.prev_eval = cur_eval;
8486                 cur_eval = CUR_EVAL.prev_eval;
8487                 DEBUG_EXECUTE_r(
8488                     Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
8489                                       depth, cur_eval););
8490                 if ( nochange_depth )
8491                     nochange_depth--;
8492
8493                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8494
8495                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8496                                     locinput); /* match B */
8497             }
8498
8499             if (locinput < reginfo->till) {
8500                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8501                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8502                                       PL_colors[4],
8503                                       (long)(locinput - startpos),
8504                                       (long)(reginfo->till - startpos),
8505                                       PL_colors[5]));
8506
8507                 sayNO_SILENT;           /* Cannot match: too short. */
8508             }
8509             sayYES;                     /* Success! */
8510
8511         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8512             DEBUG_EXECUTE_r(
8513             Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
8514                 depth, PL_colors[4], PL_colors[5]));
8515             sayYES;                     /* Success! */
8516
8517 #undef  ST
8518 #define ST st->u.ifmatch
8519
8520         {
8521             char *newstart;
8522
8523         case SUSPEND:   /* (?>A) */
8524             ST.wanted = 1;
8525             newstart = locinput;
8526             goto do_ifmatch;
8527
8528         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8529             ST.wanted = 0;
8530             goto ifmatch_trivial_fail_test;
8531
8532         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8533             ST.wanted = 1;
8534           ifmatch_trivial_fail_test:
8535             if (scan->flags) {
8536                 char * const s = HOPBACKc(locinput, scan->flags);
8537                 if (!s) {
8538                     /* trivial fail */
8539                     if (logical) {
8540                         logical = 0;
8541                         sw = 1 - cBOOL(ST.wanted);
8542                     }
8543                     else if (ST.wanted)
8544                         sayNO;
8545                     next = scan + ARG(scan);
8546                     if (next == scan)
8547                         next = NULL;
8548                     break;
8549                 }
8550                 newstart = s;
8551             }
8552             else
8553                 newstart = locinput;
8554
8555           do_ifmatch:
8556             ST.me = scan;
8557             ST.logical = logical;
8558             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8559
8560             /* execute body of (?...A) */
8561             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8562             NOT_REACHED; /* NOTREACHED */
8563         }
8564
8565         case IFMATCH_A_fail: /* body of (?...A) failed */
8566             ST.wanted = !ST.wanted;
8567             /* FALLTHROUGH */
8568
8569         case IFMATCH_A: /* body of (?...A) succeeded */
8570             if (ST.logical) {
8571                 sw = cBOOL(ST.wanted);
8572             }
8573             else if (!ST.wanted)
8574                 sayNO;
8575
8576             if (OP(ST.me) != SUSPEND) {
8577                 /* restore old position except for (?>...) */
8578                 locinput = st->locinput;
8579             }
8580             scan = ST.me + ARG(ST.me);
8581             if (scan == ST.me)
8582                 scan = NULL;
8583             continue; /* execute B */
8584
8585 #undef ST
8586
8587         case LONGJMP: /*  alternative with many branches compiles to
8588                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8589             next = scan + ARG(scan);
8590             if (next == scan)
8591                 next = NULL;
8592             break;
8593
8594         case COMMIT:  /*  (*COMMIT)  */
8595             reginfo->cutpoint = reginfo->strend;
8596             /* FALLTHROUGH */
8597
8598         case PRUNE:   /*  (*PRUNE)   */
8599             if (scan->flags)
8600                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8601             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8602             NOT_REACHED; /* NOTREACHED */
8603
8604         case COMMIT_next_fail:
8605             no_final = 1;
8606             /* FALLTHROUGH */
8607             sayNO;
8608             NOT_REACHED; /* NOTREACHED */
8609
8610         case OPFAIL:   /* (*FAIL)  */
8611             if (scan->flags)
8612                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8613             if (logical) {
8614                 /* deal with (?(?!)X|Y) properly,
8615                  * make sure we trigger the no branch
8616                  * of the trailing IFTHEN structure*/
8617                 sw= 0;
8618                 break;
8619             } else {
8620                 sayNO;
8621             }
8622             NOT_REACHED; /* NOTREACHED */
8623
8624 #define ST st->u.mark
8625         case MARKPOINT: /*  (*MARK:foo)  */
8626             ST.prev_mark = mark_state;
8627             ST.mark_name = sv_commit = sv_yes_mark
8628                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8629             mark_state = st;
8630             ST.mark_loc = locinput;
8631             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8632             NOT_REACHED; /* NOTREACHED */
8633
8634         case MARKPOINT_next:
8635             mark_state = ST.prev_mark;
8636             sayYES;
8637             NOT_REACHED; /* NOTREACHED */
8638
8639         case MARKPOINT_next_fail:
8640             if (popmark && sv_eq(ST.mark_name,popmark))
8641             {
8642                 if (ST.mark_loc > startpoint)
8643                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8644                 popmark = NULL; /* we found our mark */
8645                 sv_commit = ST.mark_name;
8646
8647                 DEBUG_EXECUTE_r({
8648                         Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%" SVf "...%s\n",
8649                             depth,
8650                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8651                 });
8652             }
8653             mark_state = ST.prev_mark;
8654             sv_yes_mark = mark_state ?
8655                 mark_state->u.mark.mark_name : NULL;
8656             sayNO;
8657             NOT_REACHED; /* NOTREACHED */
8658
8659         case SKIP:  /*  (*SKIP)  */
8660             if (!scan->flags) {
8661                 /* (*SKIP) : if we fail we cut here*/
8662                 ST.mark_name = NULL;
8663                 ST.mark_loc = locinput;
8664                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8665             } else {
8666                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8667                    otherwise do nothing.  Meaning we need to scan
8668                  */
8669                 regmatch_state *cur = mark_state;
8670                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8671
8672                 while (cur) {
8673                     if ( sv_eq( cur->u.mark.mark_name,
8674                                 find ) )
8675                     {
8676                         ST.mark_name = find;
8677                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8678                     }
8679                     cur = cur->u.mark.prev_mark;
8680                 }
8681             }
8682             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8683             break;
8684
8685         case SKIP_next_fail:
8686             if (ST.mark_name) {
8687                 /* (*CUT:NAME) - Set up to search for the name as we
8688                    collapse the stack*/
8689                 popmark = ST.mark_name;
8690             } else {
8691                 /* (*CUT) - No name, we cut here.*/
8692                 if (ST.mark_loc > startpoint)
8693                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8694                 /* but we set sv_commit to latest mark_name if there
8695                    is one so they can test to see how things lead to this
8696                    cut */
8697                 if (mark_state)
8698                     sv_commit=mark_state->u.mark.mark_name;
8699             }
8700             no_final = 1;
8701             sayNO;
8702             NOT_REACHED; /* NOTREACHED */
8703 #undef ST
8704
8705         case LNBREAK: /* \R */
8706             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8707                 locinput += n;
8708             } else
8709                 sayNO;
8710             break;
8711
8712         default:
8713             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8714                           PTR2UV(scan), OP(scan));
8715             Perl_croak(aTHX_ "regexp memory corruption");
8716
8717         /* this is a point to jump to in order to increment
8718          * locinput by one character */
8719           increment_locinput:
8720             assert(!NEXTCHR_IS_EOS);
8721             if (utf8_target) {
8722                 locinput += PL_utf8skip[nextchr];
8723                 /* locinput is allowed to go 1 char off the end (signifying
8724                  * EOS), but not 2+ */
8725                 if (locinput > reginfo->strend)
8726                     sayNO;
8727             }
8728             else
8729                 locinput++;
8730             break;
8731
8732         } /* end switch */
8733
8734         /* switch break jumps here */
8735         scan = next; /* prepare to execute the next op and ... */
8736         continue;    /* ... jump back to the top, reusing st */
8737         /* NOTREACHED */
8738
8739       push_yes_state:
8740         /* push a state that backtracks on success */
8741         st->u.yes.prev_yes_state = yes_state;
8742         yes_state = st;
8743         /* FALLTHROUGH */
8744       push_state:
8745         /* push a new regex state, then continue at scan  */
8746         {
8747             regmatch_state *newst;
8748
8749             DEBUG_STACK_r({
8750                 regmatch_state *cur = st;
8751                 regmatch_state *curyes = yes_state;
8752                 U32 i;
8753                 regmatch_slab *slab = PL_regmatch_slab;
8754                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8755                     if (cur < SLAB_FIRST(slab)) {
8756                         slab = slab->prev;
8757                         cur = SLAB_LAST(slab);
8758                     }
8759                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8760                         depth,
8761                         i ? "    " : "push",
8762                         depth - i, PL_reg_name[cur->resume_state],
8763                         (curyes == cur) ? "yes" : ""
8764                     );
8765                     if (curyes == cur)
8766                         curyes = cur->u.yes.prev_yes_state;
8767                 }
8768             } else
8769                 DEBUG_STATE_pp("push")
8770             );
8771             depth++;
8772             st->locinput = locinput;
8773             newst = st+1;
8774             if (newst >  SLAB_LAST(PL_regmatch_slab))
8775                 newst = S_push_slab(aTHX);
8776             PL_regmatch_state = newst;
8777
8778             locinput = pushinput;
8779             st = newst;
8780             continue;
8781             /* NOTREACHED */
8782         }
8783     }
8784 #ifdef SOLARIS_BAD_OPTIMIZER
8785 #  undef PL_charclass
8786 #endif
8787
8788     /*
8789     * We get here only if there's trouble -- normally "case END" is
8790     * the terminating point.
8791     */
8792     Perl_croak(aTHX_ "corrupted regexp pointers");
8793     NOT_REACHED; /* NOTREACHED */
8794
8795   yes:
8796     if (yes_state) {
8797         /* we have successfully completed a subexpression, but we must now
8798          * pop to the state marked by yes_state and continue from there */
8799         assert(st != yes_state);
8800 #ifdef DEBUGGING
8801         while (st != yes_state) {
8802             st--;
8803             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8804                 PL_regmatch_slab = PL_regmatch_slab->prev;
8805                 st = SLAB_LAST(PL_regmatch_slab);
8806             }
8807             DEBUG_STATE_r({
8808                 if (no_final) {
8809                     DEBUG_STATE_pp("pop (no final)");
8810                 } else {
8811                     DEBUG_STATE_pp("pop (yes)");
8812                 }
8813             });
8814             depth--;
8815         }
8816 #else
8817         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8818             || yes_state > SLAB_LAST(PL_regmatch_slab))
8819         {
8820             /* not in this slab, pop slab */
8821             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8822             PL_regmatch_slab = PL_regmatch_slab->prev;
8823             st = SLAB_LAST(PL_regmatch_slab);
8824         }
8825         depth -= (st - yes_state);
8826 #endif
8827         st = yes_state;
8828         yes_state = st->u.yes.prev_yes_state;
8829         PL_regmatch_state = st;
8830
8831         if (no_final)
8832             locinput= st->locinput;
8833         state_num = st->resume_state + no_final;
8834         goto reenter_switch;
8835     }
8836
8837     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
8838                           PL_colors[4], PL_colors[5]));
8839
8840     if (reginfo->info_aux_eval) {
8841         /* each successfully executed (?{...}) block does the equivalent of
8842          *   local $^R = do {...}
8843          * When popping the save stack, all these locals would be undone;
8844          * bypass this by setting the outermost saved $^R to the latest
8845          * value */
8846         /* I dont know if this is needed or works properly now.
8847          * see code related to PL_replgv elsewhere in this file.
8848          * Yves
8849          */
8850         if (oreplsv != GvSV(PL_replgv))
8851             sv_setsv(oreplsv, GvSV(PL_replgv));
8852     }
8853     result = 1;
8854     goto final_exit;
8855
8856   no:
8857     DEBUG_EXECUTE_r(
8858         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
8859             depth,
8860             PL_colors[4], PL_colors[5])
8861         );
8862
8863   no_silent:
8864     if (no_final) {
8865         if (yes_state) {
8866             goto yes;
8867         } else {
8868             goto final_exit;
8869         }
8870     }
8871     if (depth) {
8872         /* there's a previous state to backtrack to */
8873         st--;
8874         if (st < SLAB_FIRST(PL_regmatch_slab)) {
8875             PL_regmatch_slab = PL_regmatch_slab->prev;
8876             st = SLAB_LAST(PL_regmatch_slab);
8877         }
8878         PL_regmatch_state = st;
8879         locinput= st->locinput;
8880
8881         DEBUG_STATE_pp("pop");
8882         depth--;
8883         if (yes_state == st)
8884             yes_state = st->u.yes.prev_yes_state;
8885
8886         state_num = st->resume_state + 1; /* failure = success + 1 */
8887         PERL_ASYNC_CHECK();
8888         goto reenter_switch;
8889     }
8890     result = 0;
8891
8892   final_exit:
8893     if (rex->intflags & PREGf_VERBARG_SEEN) {
8894         SV *sv_err = get_sv("REGERROR", 1);
8895         SV *sv_mrk = get_sv("REGMARK", 1);
8896         if (result) {
8897             sv_commit = &PL_sv_no;
8898             if (!sv_yes_mark)
8899                 sv_yes_mark = &PL_sv_yes;
8900         } else {
8901             if (!sv_commit)
8902                 sv_commit = &PL_sv_yes;
8903             sv_yes_mark = &PL_sv_no;
8904         }
8905         assert(sv_err);
8906         assert(sv_mrk);
8907         sv_setsv(sv_err, sv_commit);
8908         sv_setsv(sv_mrk, sv_yes_mark);
8909     }
8910
8911
8912     if (last_pushed_cv) {
8913         dSP;
8914         /* see "Some notes about MULTICALL" above */
8915         POP_MULTICALL;
8916         PERL_UNUSED_VAR(SP);
8917     }
8918     else
8919         LEAVE_SCOPE(orig_savestack_ix);
8920
8921     assert(!result ||  locinput - reginfo->strbeg >= 0);
8922     return result ?  locinput - reginfo->strbeg : -1;
8923 }
8924
8925 /*
8926  - regrepeat - repeatedly match something simple, report how many
8927  *
8928  * What 'simple' means is a node which can be the operand of a quantifier like
8929  * '+', or {1,3}
8930  *
8931  * startposp - pointer a pointer to the start position.  This is updated
8932  *             to point to the byte following the highest successful
8933  *             match.
8934  * p         - the regnode to be repeatedly matched against.
8935  * reginfo   - struct holding match state, such as strend
8936  * max       - maximum number of things to match.
8937  * depth     - (for debugging) backtracking depth.
8938  */
8939 STATIC I32
8940 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8941             regmatch_info *const reginfo, I32 max _pDEPTH)
8942 {
8943     char *scan;     /* Pointer to current position in target string */
8944     I32 c;
8945     char *loceol = reginfo->strend;   /* local version */
8946     I32 hardcount = 0;  /* How many matches so far */
8947     bool utf8_target = reginfo->is_utf8_target;
8948     unsigned int to_complement = 0;  /* Invert the result? */
8949     UV utf8_flags;
8950     _char_class_number classnum;
8951
8952     PERL_ARGS_ASSERT_REGREPEAT;
8953
8954     scan = *startposp;
8955     if (max == REG_INFTY)
8956         max = I32_MAX;
8957     else if (! utf8_target && loceol - scan > max)
8958         loceol = scan + max;
8959
8960     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8961      * to the maximum of how far we should go in it (leaving it set to the real
8962      * end, if the maximum permissible would take us beyond that).  This allows
8963      * us to make the loop exit condition that we haven't gone past <loceol> to
8964      * also mean that we haven't exceeded the max permissible count, saving a
8965      * test each time through the loop.  But it assumes that the OP matches a
8966      * single byte, which is true for most of the OPs below when applied to a
8967      * non-UTF-8 target.  Those relatively few OPs that don't have this
8968      * characteristic will have to compensate.
8969      *
8970      * There is no adjustment for UTF-8 targets, as the number of bytes per
8971      * character varies.  OPs will have to test both that the count is less
8972      * than the max permissible (using <hardcount> to keep track), and that we
8973      * are still within the bounds of the string (using <loceol>.  A few OPs
8974      * match a single byte no matter what the encoding.  They can omit the max
8975      * test if, for the UTF-8 case, they do the adjustment that was skipped
8976      * above.
8977      *
8978      * Thus, the code above sets things up for the common case; and exceptional
8979      * cases need extra work; the common case is to make sure <scan> doesn't
8980      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8981      * count doesn't exceed the maximum permissible */
8982
8983     switch (OP(p)) {
8984     case REG_ANY:
8985         if (utf8_target) {
8986             while (scan < loceol && hardcount < max && *scan != '\n') {
8987                 scan += UTF8SKIP(scan);
8988                 hardcount++;
8989             }
8990         } else {
8991             while (scan < loceol && *scan != '\n')
8992                 scan++;
8993         }
8994         break;
8995     case SANY:
8996         if (utf8_target) {
8997             while (scan < loceol && hardcount < max) {
8998                 scan += UTF8SKIP(scan);
8999                 hardcount++;
9000             }
9001         }
9002         else
9003             scan = loceol;
9004         break;
9005     case EXACTL:
9006         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9007         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9008             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9009         }
9010         /* FALLTHROUGH */
9011     case EXACT:
9012         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9013
9014         c = (U8)*STRING(p);
9015
9016         /* Can use a simple loop if the pattern char to match on is invariant
9017          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9018          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9019          * true iff it doesn't matter if the argument is in UTF-8 or not */
9020         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9021             if (utf8_target && loceol - scan > max) {
9022                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9023                  * since here, to match at all, 1 char == 1 byte */
9024                 loceol = scan + max;
9025             }
9026             while (scan < loceol && UCHARAT(scan) == c) {
9027                 scan++;
9028             }
9029         }
9030         else if (reginfo->is_utf8_pat) {
9031             if (utf8_target) {
9032                 STRLEN scan_char_len;
9033
9034                 /* When both target and pattern are UTF-8, we have to do
9035                  * string EQ */
9036                 while (hardcount < max
9037                        && scan < loceol
9038                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9039                        && memEQ(scan, STRING(p), scan_char_len))
9040                 {
9041                     scan += scan_char_len;
9042                     hardcount++;
9043                 }
9044             }
9045             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9046
9047                 /* Target isn't utf8; convert the character in the UTF-8
9048                  * pattern to non-UTF8, and do a simple loop */
9049                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9050                 while (scan < loceol && UCHARAT(scan) == c) {
9051                     scan++;
9052                 }
9053             } /* else pattern char is above Latin1, can't possibly match the
9054                  non-UTF-8 target */
9055         }
9056         else {
9057
9058             /* Here, the string must be utf8; pattern isn't, and <c> is
9059              * different in utf8 than not, so can't compare them directly.
9060              * Outside the loop, find the two utf8 bytes that represent c, and
9061              * then look for those in sequence in the utf8 string */
9062             U8 high = UTF8_TWO_BYTE_HI(c);
9063             U8 low = UTF8_TWO_BYTE_LO(c);
9064
9065             while (hardcount < max
9066                     && scan + 1 < loceol
9067                     && UCHARAT(scan) == high
9068                     && UCHARAT(scan + 1) == low)
9069             {
9070                 scan += 2;
9071                 hardcount++;
9072             }
9073         }
9074         break;
9075
9076     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
9077         assert(! reginfo->is_utf8_pat);
9078         /* FALLTHROUGH */
9079     case EXACTFA:
9080         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9081         goto do_exactf;
9082
9083     case EXACTFL:
9084         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9085         utf8_flags = FOLDEQ_LOCALE;
9086         goto do_exactf;
9087
9088     case EXACTF:   /* This node only generated for non-utf8 patterns */
9089         assert(! reginfo->is_utf8_pat);
9090         utf8_flags = 0;
9091         goto do_exactf;
9092
9093     case EXACTFLU8:
9094         if (! utf8_target) {
9095             break;
9096         }
9097         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9098                                     | FOLDEQ_S2_FOLDS_SANE;
9099         goto do_exactf;
9100
9101     case EXACTFU_SS:
9102     case EXACTFU:
9103         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9104
9105       do_exactf: {
9106         int c1, c2;
9107         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9108
9109         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9110
9111         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9112                                         reginfo))
9113         {
9114             if (c1 == CHRTEST_VOID) {
9115                 /* Use full Unicode fold matching */
9116                 char *tmpeol = reginfo->strend;
9117                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9118                 while (hardcount < max
9119                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9120                                              STRING(p), NULL, pat_len,
9121                                              reginfo->is_utf8_pat, utf8_flags))
9122                 {
9123                     scan = tmpeol;
9124                     tmpeol = reginfo->strend;
9125                     hardcount++;
9126                 }
9127             }
9128             else if (utf8_target) {
9129                 if (c1 == c2) {
9130                     while (scan < loceol
9131                            && hardcount < max
9132                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9133                     {
9134                         scan += UTF8SKIP(scan);
9135                         hardcount++;
9136                     }
9137                 }
9138                 else {
9139                     while (scan < loceol
9140                            && hardcount < max
9141                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9142                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9143                     {
9144                         scan += UTF8SKIP(scan);
9145                         hardcount++;
9146                     }
9147                 }
9148             }
9149             else if (c1 == c2) {
9150                 while (scan < loceol && UCHARAT(scan) == c1) {
9151                     scan++;
9152                 }
9153             }
9154             else {
9155                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9156                  * a conditional each time through the loop if the characters
9157                  * differ only in a single bit, as is the usual situation */
9158                 U8 c1_c2_bits_differing = c1 ^ c2;
9159
9160                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9161                     U8 c1_masked = c1 & ~ c1_c2_bits_differing;
9162                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9163
9164                     while (   scan < loceol
9165                            && (UCHARAT(scan) & c1_c2_mask) == c1_masked)
9166                     {
9167                         scan++;
9168                     }
9169                 }
9170                 else {
9171                     while (    scan < loceol
9172                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9173                     {
9174                         scan++;
9175                     }
9176                 }
9177             }
9178         }
9179         break;
9180     }
9181     case ANYOFL:
9182         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9183
9184         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9185             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9186         }
9187         /* FALLTHROUGH */
9188     case ANYOFD:
9189     case ANYOF:
9190         if (utf8_target) {
9191             while (hardcount < max
9192                    && scan < loceol
9193                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9194             {
9195                 scan += UTF8SKIP(scan);
9196                 hardcount++;
9197             }
9198         }
9199         else if (ANYOF_FLAGS(p)) {
9200             while (scan < loceol
9201                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9202                 scan++;
9203         }
9204         else {
9205             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9206                 scan++;
9207         }
9208         break;
9209
9210     case ASCII:
9211         if (utf8_target && loceol - scan > max) {
9212
9213             /* We didn't adjust <loceol> at the beginning of this routine
9214              * because is UTF-8, but it is actually ok to do so, since here, to
9215              * match, 1 char == 1 byte. */
9216             loceol = scan + max;
9217         }
9218
9219         scan = find_next_non_ascii(scan, loceol, utf8_target);
9220         break;
9221
9222     case NASCII:
9223         if (utf8_target) {
9224             while (     hardcount < max
9225                    &&   scan < loceol
9226                    && ! isASCII_utf8_safe(scan, loceol))
9227             {
9228                 scan += UTF8SKIP(scan);
9229                 hardcount++;
9230             }
9231         }
9232         else {
9233             scan = find_next_ascii(scan, loceol, utf8_target);
9234         }
9235         break;
9236
9237     /* The argument (FLAGS) to all the POSIX node types is the class number */
9238
9239     case NPOSIXL:
9240         to_complement = 1;
9241         /* FALLTHROUGH */
9242
9243     case POSIXL:
9244         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9245         if (! utf8_target) {
9246             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9247                                                                    *scan)))
9248             {
9249                 scan++;
9250             }
9251         } else {
9252             while (hardcount < max && scan < loceol
9253                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9254                                                                   (U8 *) scan)))
9255             {
9256                 scan += UTF8SKIP(scan);
9257                 hardcount++;
9258             }
9259         }
9260         break;
9261
9262     case POSIXD:
9263         if (utf8_target) {
9264             goto utf8_posix;
9265         }
9266         /* FALLTHROUGH */
9267
9268     case POSIXA:
9269         if (utf8_target && loceol - scan > max) {
9270
9271             /* We didn't adjust <loceol> at the beginning of this routine
9272              * because is UTF-8, but it is actually ok to do so, since here, to
9273              * match, 1 char == 1 byte. */
9274             loceol = scan + max;
9275         }
9276         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9277             scan++;
9278         }
9279         break;
9280
9281     case NPOSIXD:
9282         if (utf8_target) {
9283             to_complement = 1;
9284             goto utf8_posix;
9285         }
9286         /* FALLTHROUGH */
9287
9288     case NPOSIXA:
9289         if (! utf8_target) {
9290             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9291                 scan++;
9292             }
9293         }
9294         else {
9295
9296             /* The complement of something that matches only ASCII matches all
9297              * non-ASCII, plus everything in ASCII that isn't in the class. */
9298             while (hardcount < max && scan < loceol
9299                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9300                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9301             {
9302                 scan += UTF8SKIP(scan);
9303                 hardcount++;
9304             }
9305         }
9306         break;
9307
9308     case NPOSIXU:
9309         to_complement = 1;
9310         /* FALLTHROUGH */
9311
9312     case POSIXU:
9313         if (! utf8_target) {
9314             while (scan < loceol && to_complement
9315                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9316             {
9317                 scan++;
9318             }
9319         }
9320         else {
9321           utf8_posix:
9322             classnum = (_char_class_number) FLAGS(p);
9323             if (classnum < _FIRST_NON_SWASH_CC) {
9324
9325                 /* Here, a swash is needed for above-Latin1 code points.
9326                  * Process as many Latin1 code points using the built-in rules.
9327                  * Go to another loop to finish processing upon encountering
9328                  * the first Latin1 code point.  We could do that in this loop
9329                  * as well, but the other way saves having to test if the swash
9330                  * has been loaded every time through the loop: extra space to
9331                  * save a test. */
9332                 while (hardcount < max && scan < loceol) {
9333                     if (UTF8_IS_INVARIANT(*scan)) {
9334                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9335                                                                    classnum))))
9336                         {
9337                             break;
9338                         }
9339                         scan++;
9340                     }
9341                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9342                         if (! (to_complement
9343                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9344                                                                      *(scan + 1)),
9345                                                     classnum))))
9346                         {
9347                             break;
9348                         }
9349                         scan += 2;
9350                     }
9351                     else {
9352                         goto found_above_latin1;
9353                     }
9354
9355                     hardcount++;
9356                 }
9357             }
9358             else {
9359                 /* For these character classes, the knowledge of how to handle
9360                  * every code point is compiled in to Perl via a macro.  This
9361                  * code is written for making the loops as tight as possible.
9362                  * It could be refactored to save space instead */
9363                 switch (classnum) {
9364                     case _CC_ENUM_SPACE:
9365                         while (hardcount < max
9366                                && scan < loceol
9367                                && (to_complement
9368                                    ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9369                         {
9370                             scan += UTF8SKIP(scan);
9371                             hardcount++;
9372                         }
9373                         break;
9374                     case _CC_ENUM_BLANK:
9375                         while (hardcount < max
9376                                && scan < loceol
9377                                && (to_complement
9378                                     ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9379                         {
9380                             scan += UTF8SKIP(scan);
9381                             hardcount++;
9382                         }
9383                         break;
9384                     case _CC_ENUM_XDIGIT:
9385                         while (hardcount < max
9386                                && scan < loceol
9387                                && (to_complement
9388                                    ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9389                         {
9390                             scan += UTF8SKIP(scan);
9391                             hardcount++;
9392                         }
9393                         break;
9394                     case _CC_ENUM_VERTSPACE:
9395                         while (hardcount < max
9396                                && scan < loceol
9397                                && (to_complement
9398                                    ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9399                         {
9400                             scan += UTF8SKIP(scan);
9401                             hardcount++;
9402                         }
9403                         break;
9404                     case _CC_ENUM_CNTRL:
9405                         while (hardcount < max
9406                                && scan < loceol
9407                                && (to_complement
9408                                    ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9409                         {
9410                             scan += UTF8SKIP(scan);
9411                             hardcount++;
9412                         }
9413                         break;
9414                     default:
9415                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9416                 }
9417             }
9418         }
9419         break;
9420
9421       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
9422
9423         /* Load the swash if not already present */
9424         if (! PL_utf8_swash_ptrs[classnum]) {
9425             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9426             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9427                                         "utf8",
9428                                         "",
9429                                         &PL_sv_undef, 1, 0,
9430                                         PL_XPosix_ptrs[classnum], &flags);
9431         }
9432
9433         while (hardcount < max && scan < loceol
9434                && to_complement ^ cBOOL(_generic_utf8_safe(
9435                                        classnum,
9436                                        scan,
9437                                        loceol,
9438                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
9439                                                    (U8 *) scan,
9440                                                    TRUE))))
9441         {
9442             scan += UTF8SKIP(scan);
9443             hardcount++;
9444         }
9445         break;
9446
9447     case LNBREAK:
9448         if (utf8_target) {
9449             while (hardcount < max && scan < loceol &&
9450                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9451                 scan += c;
9452                 hardcount++;
9453             }
9454         } else {
9455             /* LNBREAK can match one or two latin chars, which is ok, but we
9456              * have to use hardcount in this situation, and throw away the
9457              * adjustment to <loceol> done before the switch statement */
9458             loceol = reginfo->strend;
9459             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9460                 scan+=c;
9461                 hardcount++;
9462             }
9463         }
9464         break;
9465
9466     case BOUNDL:
9467     case NBOUNDL:
9468         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9469         /* FALLTHROUGH */
9470     case BOUND:
9471     case BOUNDA:
9472     case BOUNDU:
9473     case EOS:
9474     case GPOS:
9475     case KEEPS:
9476     case NBOUND:
9477     case NBOUNDA:
9478     case NBOUNDU:
9479     case OPFAIL:
9480     case SBOL:
9481     case SEOL:
9482         /* These are all 0 width, so match right here or not at all. */
9483         break;
9484
9485     default:
9486         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9487         NOT_REACHED; /* NOTREACHED */
9488
9489     }
9490
9491     if (hardcount)
9492         c = hardcount;
9493     else
9494         c = scan - *startposp;
9495     *startposp = scan;
9496
9497     DEBUG_r({
9498         GET_RE_DEBUG_FLAGS_DECL;
9499         DEBUG_EXECUTE_r({
9500             SV * const prop = sv_newmortal();
9501             regprop(prog, prop, p, reginfo, NULL);
9502             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9503                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9504         });
9505     });
9506
9507     return(c);
9508 }
9509
9510
9511 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9512 /*
9513 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9514 create a copy so that changes the caller makes won't change the shared one.
9515 If <altsvp> is non-null, will return NULL in it, for back-compat.
9516  */
9517 SV *
9518 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9519 {
9520     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9521
9522     if (altsvp) {
9523         *altsvp = NULL;
9524     }
9525
9526     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9527 }
9528
9529 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9530
9531 /*
9532  - reginclass - determine if a character falls into a character class
9533
9534   n is the ANYOF-type regnode
9535   p is the target string
9536   p_end points to one byte beyond the end of the target string
9537   utf8_target tells whether p is in UTF-8.
9538
9539   Returns true if matched; false otherwise.
9540
9541   Note that this can be a synthetic start class, a combination of various
9542   nodes, so things you think might be mutually exclusive, such as locale,
9543   aren't.  It can match both locale and non-locale
9544
9545  */
9546
9547 STATIC bool
9548 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9549 {
9550     dVAR;
9551     const char flags = ANYOF_FLAGS(n);
9552     bool match = FALSE;
9553     UV c = *p;
9554
9555     PERL_ARGS_ASSERT_REGINCLASS;
9556
9557     /* If c is not already the code point, get it.  Note that
9558      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9559     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9560         STRLEN c_len = 0;
9561         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9562         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9563         if (c_len == (STRLEN)-1) {
9564             _force_out_malformed_utf8_message(p, p_end,
9565                                               utf8n_flags,
9566                                               1 /* 1 means die */ );
9567             NOT_REACHED; /* NOTREACHED */
9568         }
9569         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9570             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9571         }
9572     }
9573
9574     /* If this character is potentially in the bitmap, check it */
9575     if (c < NUM_ANYOF_CODE_POINTS) {
9576         if (ANYOF_BITMAP_TEST(n, c))
9577             match = TRUE;
9578         else if ((flags
9579                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9580                   && OP(n) == ANYOFD
9581                   && ! utf8_target
9582                   && ! isASCII(c))
9583         {
9584             match = TRUE;
9585         }
9586         else if (flags & ANYOF_LOCALE_FLAGS) {
9587             if ((flags & ANYOFL_FOLD)
9588                 && c < 256
9589                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9590             {
9591                 match = TRUE;
9592             }
9593             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9594                      && c < 256
9595             ) {
9596
9597                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9598                  * if the class includes the Posix character class given by
9599                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9600                  * complemented Posix class given by int(bit/2).  So we loop
9601                  * through the bits, each time changing whether we complement
9602                  * the result or not.  Suppose for the sake of illustration
9603                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9604                  * is set, it means there is a match for this ANYOF node if the
9605                  * character is in the class given by the expression (0 / 2 = 0
9606                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9607                  * and since 'to_complement' is 0, the result will stay TRUE,
9608                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9609                  * bit 1 is 1.  That means there is a match if the character
9610                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9611                  * but will on bit 1.  On the second iteration 'to_complement'
9612                  * will be 1, so the exclusive or will reverse things, so we
9613                  * are testing for \W.  On the third iteration, 'to_complement'
9614                  * will be 0, and we would be testing for \s; the fourth
9615                  * iteration would test for \S, etc.
9616                  *
9617                  * Note that this code assumes that all the classes are closed
9618                  * under folding.  For example, if a character matches \w, then
9619                  * its fold does too; and vice versa.  This should be true for
9620                  * any well-behaved locale for all the currently defined Posix
9621                  * classes, except for :lower: and :upper:, which are handled
9622                  * by the pseudo-class :cased: which matches if either of the
9623                  * other two does.  To get rid of this assumption, an outer
9624                  * loop could be used below to iterate over both the source
9625                  * character, and its fold (if different) */
9626
9627                 int count = 0;
9628                 int to_complement = 0;
9629
9630                 while (count < ANYOF_MAX) {
9631                     if (ANYOF_POSIXL_TEST(n, count)
9632                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9633                     {
9634                         match = TRUE;
9635                         break;
9636                     }
9637                     count++;
9638                     to_complement ^= 1;
9639                 }
9640             }
9641         }
9642     }
9643
9644
9645     /* If the bitmap didn't (or couldn't) match, and something outside the
9646      * bitmap could match, try that. */
9647     if (!match) {
9648         if (c >= NUM_ANYOF_CODE_POINTS
9649             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9650         {
9651             match = TRUE;       /* Everything above the bitmap matches */
9652         }
9653             /* Here doesn't match everything above the bitmap.  If there is
9654              * some information available beyond the bitmap, we may find a
9655              * match in it.  If so, this is most likely because the code point
9656              * is outside the bitmap range.  But rarely, it could be because of
9657              * some other reason.  If so, various flags are set to indicate
9658              * this possibility.  On ANYOFD nodes, there may be matches that
9659              * happen only when the target string is UTF-8; or for other node
9660              * types, because runtime lookup is needed, regardless of the
9661              * UTF-8ness of the target string.  Finally, under /il, there may
9662              * be some matches only possible if the locale is a UTF-8 one. */
9663         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9664                  && (   c >= NUM_ANYOF_CODE_POINTS
9665                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9666                          && (   UNLIKELY(OP(n) != ANYOFD)
9667                              || (utf8_target && ! isASCII_uni(c)
9668 #                               if NUM_ANYOF_CODE_POINTS > 256
9669                                                                  && c < 256
9670 #                               endif
9671                                 )))
9672                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9673                          && IN_UTF8_CTYPE_LOCALE)))
9674         {
9675             SV* only_utf8_locale = NULL;
9676             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9677                                                        &only_utf8_locale, NULL);
9678             if (sw) {
9679                 U8 utf8_buffer[2];
9680                 U8 * utf8_p;
9681                 if (utf8_target) {
9682                     utf8_p = (U8 *) p;
9683                 } else { /* Convert to utf8 */
9684                     utf8_p = utf8_buffer;
9685                     append_utf8_from_native_byte(*p, &utf8_p);
9686                     utf8_p = utf8_buffer;
9687                 }
9688
9689                 if (swash_fetch(sw, utf8_p, TRUE)) {
9690                     match = TRUE;
9691                 }
9692             }
9693             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9694                 match = _invlist_contains_cp(only_utf8_locale, c);
9695             }
9696         }
9697
9698         if (UNICODE_IS_SUPER(c)
9699             && (flags
9700                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9701             && OP(n) != ANYOFD
9702             && ckWARN_d(WARN_NON_UNICODE))
9703         {
9704             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9705                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9706         }
9707     }
9708
9709 #if ANYOF_INVERT != 1
9710     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9711      * use it, don't */
9712 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9713 #endif
9714
9715     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9716     return (flags & ANYOF_INVERT) ^ match;
9717 }
9718
9719 STATIC U8 *
9720 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9721 {
9722     /* return the position 'off' UTF-8 characters away from 's', forward if
9723      * 'off' >= 0, backwards if negative.  But don't go outside of position
9724      * 'lim', which better be < s  if off < 0 */
9725
9726     PERL_ARGS_ASSERT_REGHOP3;
9727
9728     if (off >= 0) {
9729         while (off-- && s < lim) {
9730             /* XXX could check well-formedness here */
9731             U8 *new_s = s + UTF8SKIP(s);
9732             if (new_s > lim) /* lim may be in the middle of a long character */
9733                 return s;
9734             s = new_s;
9735         }
9736     }
9737     else {
9738         while (off++ && s > lim) {
9739             s--;
9740             if (UTF8_IS_CONTINUED(*s)) {
9741                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9742                     s--;
9743                 if (! UTF8_IS_START(*s)) {
9744                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9745                 }
9746             }
9747             /* XXX could check well-formedness here */
9748         }
9749     }
9750     return s;
9751 }
9752
9753 STATIC U8 *
9754 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9755 {
9756     PERL_ARGS_ASSERT_REGHOP4;
9757
9758     if (off >= 0) {
9759         while (off-- && s < rlim) {
9760             /* XXX could check well-formedness here */
9761             s += UTF8SKIP(s);
9762         }
9763     }
9764     else {
9765         while (off++ && s > llim) {
9766             s--;
9767             if (UTF8_IS_CONTINUED(*s)) {
9768                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9769                     s--;
9770                 if (! UTF8_IS_START(*s)) {
9771                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9772                 }
9773             }
9774             /* XXX could check well-formedness here */
9775         }
9776     }
9777     return s;
9778 }
9779
9780 /* like reghop3, but returns NULL on overrun, rather than returning last
9781  * char pos */
9782
9783 STATIC U8 *
9784 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9785 {
9786     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9787
9788     if (off >= 0) {
9789         while (off-- && s < lim) {
9790             /* XXX could check well-formedness here */
9791             s += UTF8SKIP(s);
9792         }
9793         if (off >= 0)
9794             return NULL;
9795     }
9796     else {
9797         while (off++ && s > lim) {
9798             s--;
9799             if (UTF8_IS_CONTINUED(*s)) {
9800                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9801                     s--;
9802                 if (! UTF8_IS_START(*s)) {
9803                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9804                 }
9805             }
9806             /* XXX could check well-formedness here */
9807         }
9808         if (off <= 0)
9809             return NULL;
9810     }
9811     return s;
9812 }
9813
9814
9815 /* when executing a regex that may have (?{}), extra stuff needs setting
9816    up that will be visible to the called code, even before the current
9817    match has finished. In particular:
9818
9819    * $_ is localised to the SV currently being matched;
9820    * pos($_) is created if necessary, ready to be updated on each call-out
9821      to code;
9822    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9823      isn't set until the current pattern is successfully finished), so that
9824      $1 etc of the match-so-far can be seen;
9825    * save the old values of subbeg etc of the current regex, and  set then
9826      to the current string (again, this is normally only done at the end
9827      of execution)
9828 */
9829
9830 static void
9831 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9832 {
9833     MAGIC *mg;
9834     regexp *const rex = ReANY(reginfo->prog);
9835     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9836
9837     eval_state->rex = rex;
9838
9839     if (reginfo->sv) {
9840         /* Make $_ available to executed code. */
9841         if (reginfo->sv != DEFSV) {
9842             SAVE_DEFSV;
9843             DEFSV_set(reginfo->sv);
9844         }
9845
9846         if (!(mg = mg_find_mglob(reginfo->sv))) {
9847             /* prepare for quick setting of pos */
9848             mg = sv_magicext_mglob(reginfo->sv);
9849             mg->mg_len = -1;
9850         }
9851         eval_state->pos_magic = mg;
9852         eval_state->pos       = mg->mg_len;
9853         eval_state->pos_flags = mg->mg_flags;
9854     }
9855     else
9856         eval_state->pos_magic = NULL;
9857
9858     if (!PL_reg_curpm) {
9859         /* PL_reg_curpm is a fake PMOP that we can attach the current
9860          * regex to and point PL_curpm at, so that $1 et al are visible
9861          * within a /(?{})/. It's just allocated once per interpreter the
9862          * first time its needed */
9863         Newxz(PL_reg_curpm, 1, PMOP);
9864 #ifdef USE_ITHREADS
9865         {
9866             SV* const repointer = &PL_sv_undef;
9867             /* this regexp is also owned by the new PL_reg_curpm, which
9868                will try to free it.  */
9869             av_push(PL_regex_padav, repointer);
9870             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9871             PL_regex_pad = AvARRAY(PL_regex_padav);
9872         }
9873 #endif
9874     }
9875     SET_reg_curpm(reginfo->prog);
9876     eval_state->curpm = PL_curpm;
9877     PL_curpm_under = PL_curpm;
9878     PL_curpm = PL_reg_curpm;
9879     if (RXp_MATCH_COPIED(rex)) {
9880         /*  Here is a serious problem: we cannot rewrite subbeg,
9881             since it may be needed if this match fails.  Thus
9882             $` inside (?{}) could fail... */
9883         eval_state->subbeg     = rex->subbeg;
9884         eval_state->sublen     = rex->sublen;
9885         eval_state->suboffset  = rex->suboffset;
9886         eval_state->subcoffset = rex->subcoffset;
9887 #ifdef PERL_ANY_COW
9888         eval_state->saved_copy = rex->saved_copy;
9889 #endif
9890         RXp_MATCH_COPIED_off(rex);
9891     }
9892     else
9893         eval_state->subbeg = NULL;
9894     rex->subbeg = (char *)reginfo->strbeg;
9895     rex->suboffset = 0;
9896     rex->subcoffset = 0;
9897     rex->sublen = reginfo->strend - reginfo->strbeg;
9898 }
9899
9900
9901 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
9902
9903 static void
9904 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
9905 {
9906     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9907     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
9908     regmatch_slab *s;
9909
9910     Safefree(aux->poscache);
9911
9912     if (eval_state) {
9913
9914         /* undo the effects of S_setup_eval_state() */
9915
9916         if (eval_state->subbeg) {
9917             regexp * const rex = eval_state->rex;
9918             rex->subbeg     = eval_state->subbeg;
9919             rex->sublen     = eval_state->sublen;
9920             rex->suboffset  = eval_state->suboffset;
9921             rex->subcoffset = eval_state->subcoffset;
9922 #ifdef PERL_ANY_COW
9923             rex->saved_copy = eval_state->saved_copy;
9924 #endif
9925             RXp_MATCH_COPIED_on(rex);
9926         }
9927         if (eval_state->pos_magic)
9928         {
9929             eval_state->pos_magic->mg_len = eval_state->pos;
9930             eval_state->pos_magic->mg_flags =
9931                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9932                | (eval_state->pos_flags & MGf_BYTES);
9933         }
9934
9935         PL_curpm = eval_state->curpm;
9936     }
9937
9938     PL_regmatch_state = aux->old_regmatch_state;
9939     PL_regmatch_slab  = aux->old_regmatch_slab;
9940
9941     /* free all slabs above current one - this must be the last action
9942      * of this function, as aux and eval_state are allocated within
9943      * slabs and may be freed here */
9944
9945     s = PL_regmatch_slab->next;
9946     if (s) {
9947         PL_regmatch_slab->next = NULL;
9948         while (s) {
9949             regmatch_slab * const osl = s;
9950             s = s->next;
9951             Safefree(osl);
9952         }
9953     }
9954 }
9955
9956
9957 STATIC void
9958 S_to_utf8_substr(pTHX_ regexp *prog)
9959 {
9960     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9961      * on the converted value */
9962
9963     int i = 1;
9964
9965     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9966
9967     do {
9968         if (prog->substrs->data[i].substr
9969             && !prog->substrs->data[i].utf8_substr) {
9970             SV* const sv = newSVsv(prog->substrs->data[i].substr);
9971             prog->substrs->data[i].utf8_substr = sv;
9972             sv_utf8_upgrade(sv);
9973             if (SvVALID(prog->substrs->data[i].substr)) {
9974                 if (SvTAIL(prog->substrs->data[i].substr)) {
9975                     /* Trim the trailing \n that fbm_compile added last
9976                        time.  */
9977                     SvCUR_set(sv, SvCUR(sv) - 1);
9978                     /* Whilst this makes the SV technically "invalid" (as its
9979                        buffer is no longer followed by "\0") when fbm_compile()
9980                        adds the "\n" back, a "\0" is restored.  */
9981                     fbm_compile(sv, FBMcf_TAIL);
9982                 } else
9983                     fbm_compile(sv, 0);
9984             }
9985             if (prog->substrs->data[i].substr == prog->check_substr)
9986                 prog->check_utf8 = sv;
9987         }
9988     } while (i--);
9989 }
9990
9991 STATIC bool
9992 S_to_byte_substr(pTHX_ regexp *prog)
9993 {
9994     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9995      * on the converted value; returns FALSE if can't be converted. */
9996
9997     int i = 1;
9998
9999     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10000
10001     do {
10002         if (prog->substrs->data[i].utf8_substr
10003             && !prog->substrs->data[i].substr) {
10004             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10005             if (! sv_utf8_downgrade(sv, TRUE)) {
10006                 return FALSE;
10007             }
10008             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10009                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10010                     /* Trim the trailing \n that fbm_compile added last
10011                         time.  */
10012                     SvCUR_set(sv, SvCUR(sv) - 1);
10013                     fbm_compile(sv, FBMcf_TAIL);
10014                 } else
10015                     fbm_compile(sv, 0);
10016             }
10017             prog->substrs->data[i].substr = sv;
10018             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10019                 prog->check_substr = sv;
10020         }
10021     } while (i--);
10022
10023     return TRUE;
10024 }
10025
10026 #ifndef PERL_IN_XSUB_RE
10027
10028 bool
10029 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10030 {
10031     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10032      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10033      * the larger string bounded by 'strbeg' and 'strend'.
10034      *
10035      * 'cp' needs to be assigned (if not a future version of the Unicode
10036      * Standard could make it something that combines with adjacent characters,
10037      * so code using it would then break), and there has to be a GCB break
10038      * before and after the character. */
10039
10040     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10041     const U8 * prev_cp_start;
10042
10043     PERL_ARGS_ASSERT__IS_GRAPHEME;
10044
10045     /* Unassigned code points are forbidden */
10046     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10047                                     _invlist_search(PL_Assigned_invlist, cp))))
10048     {
10049         return FALSE;
10050     }
10051
10052     cp_gcb_val = getGCB_VAL_CP(cp);
10053
10054     /* Find the GCB value of the previous code point in the input */
10055     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10056     if (UNLIKELY(prev_cp_start == s)) {
10057         prev_cp_gcb_val = GCB_EDGE;
10058     }
10059     else {
10060         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10061     }
10062
10063     /* And check that is a grapheme boundary */
10064     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10065                 TRUE /* is UTF-8 encoded */ ))
10066     {
10067         return FALSE;
10068     }
10069
10070     /* Similarly verify there is a break between the current character and the
10071      * following one */
10072     s += UTF8SKIP(s);
10073     if (s >= strend) {
10074         next_cp_gcb_val = GCB_EDGE;
10075     }
10076     else {
10077         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10078     }
10079
10080     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10081 }
10082
10083 bool
10084 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10085 {
10086     /* Checks that every character in the sequence from 's' to 'send' is one of
10087      * three scripts: Common, Inherited, and possibly one other.  Additionally
10088      * all decimal digits must come from the same consecutive sequence of 10.
10089      * 'utf8_target' is TRUE iff the sequence is encoded in UTF-8.
10090      *
10091      * Basically, it looks at each character in the sequence to see if the
10092      * above conditions are met; if not it fails.  It uses an inversion map to
10093      * find the enum corresponding to the script of each character.  But this
10094      * is complicated by the fact that a few code points can be in any of
10095      * several scripts.  The data has been constructed so that there are
10096      * additional enum values (all negative) for these situations.  The
10097      * absolute value of those is an index into another table which contains
10098      * pointers to auxiliary tables for each such situation.  Each aux array
10099      * lists all the scripts for the given situation.  There is another,
10100      * parallel, table that gives the number of entries in each aux table.
10101      * These are all defined in charclass_invlists.h */
10102
10103     /* XXX Here are the additional things UTS 39 says could be done:
10104      * Mark Chinese strings as “mixed script” if they contain both simplified
10105      * (S) and traditional (T) Chinese characters, using the Unihan data in the
10106      * Unicode Character Database [UCD].  The criterion can only be applied if
10107      * the language of the string is known to be Chinese. So, for example, the
10108      * string “写真だけの結婚式 ” is Japanese, and should not be marked as
10109      * mixed script because of a mixture of S and T characters.  Testing for
10110      * whether a character is S or T needs to be based not on whether the
10111      * character has a S or T variant , but whether the character is an S or T
10112      * variant. khw notes that the sample contains a Hiragana character, and it
10113      * is unclear if absence of any foreign script marks the script as
10114      * "Chinese"
10115      *
10116      * Forbid sequences of the same nonspacing mark
10117      *
10118      * Check to see that all the characters are in the sets of exemplar
10119      * characters for at least one language in the Unicode Common Locale Data
10120      * Repository [CLDR]. */
10121
10122
10123     /* Things that match /\d/u */
10124     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10125     UV * decimals_array = invlist_array(decimals_invlist);
10126
10127     /* What code point is the digit '0' of the script run? */
10128     UV zero_of_run = 0;
10129     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10130     SCX_enum script_of_char = SCX_INVALID;
10131
10132     /* If the script remains not fully determined from iteration to iteration,
10133      * this is the current intersection of the possiblities.  */
10134     SCX_enum * intersection = NULL;
10135     PERL_UINT_FAST8_T intersection_len = 0;
10136
10137     bool retval = TRUE;
10138
10139     assert(send > s);
10140
10141     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10142
10143     /* Look at each character in the sequence */
10144     while (s < send) {
10145         UV cp;
10146
10147         /* The code allows all scripts to use the ASCII digits.  This is
10148          * because they are used in commerce even in scripts that have their
10149          * own set.  Hence any ASCII ones found are ok, unless a digit from
10150          * another set has already been encountered.  (The other digit ranges
10151          * in Common are not similarly blessed */
10152         if (UNLIKELY(isDIGIT(*s))) {
10153             if (zero_of_run > 0) {
10154                 if (zero_of_run != '0') {
10155                     retval = FALSE;
10156                     break;
10157                 }
10158             }
10159             else {
10160                 zero_of_run = '0';
10161             }
10162             s++;
10163             continue;
10164         }
10165
10166         /* Here, isn't an ASCII digit.  Find the code point of the character */
10167         if (utf8_target && ! UTF8_IS_INVARIANT(*s)) {
10168             Size_t len;
10169             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10170             s += len;
10171         }
10172         else {
10173             cp = *(s++);
10174         }
10175
10176         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10177          * digit in that script.  We can skip the rest of this code for this
10178          * character. */
10179         if (UNLIKELY(   zero_of_run > 0
10180                      && cp >= zero_of_run
10181                      && cp - zero_of_run <= 9))
10182         {
10183             continue;
10184         }
10185
10186         /* Find the character's script.  The correct values are hard-coded here
10187          * for small-enough code points. */
10188         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10189                                unlikely to change */
10190             if (       cp > 255
10191                 || (   isALPHA_L1(cp)
10192                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10193             {
10194                 script_of_char = SCX_Latin;
10195             }
10196             else {
10197                 script_of_char = SCX_Common;
10198             }
10199         }
10200         else {
10201             script_of_char = _Perl_SCX_invmap[
10202                                        _invlist_search(PL_SCX_invlist, cp)];
10203         }
10204
10205         /* We arbitrarily accept a single unassigned character, but not in
10206          * combination with anything else, and not a run of them. */
10207         if (   UNLIKELY(script_of_run == SCX_Unknown)
10208             || UNLIKELY(   script_of_run != SCX_INVALID
10209                         && script_of_char == SCX_Unknown))
10210         {
10211             retval = FALSE;
10212             break;
10213         }
10214
10215         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10216                 script_of_run = SCX_Unknown;
10217                 continue;
10218         }
10219
10220         /* We accept 'inherited' script characters currently even at the
10221          * beginning.  (We know that no characters in Inherited are digits, or
10222          * we'd have to check for that) */
10223         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10224             continue;
10225         }
10226
10227         /* If unknown, the run's script is set to the char's */
10228         if (UNLIKELY(script_of_run == SCX_INVALID)) {
10229             script_of_run = script_of_char;
10230         }
10231
10232         /* All decimal digits must be from the same sequence of 10.  Above, we
10233          * handled any ASCII digits without descending to here.  We also
10234          * handled the case where we already knew what digit sequence is the
10235          * one to use, and the character is in that sequence.  Now that we know
10236          * the script, we can use script_zeros[] to directly find which
10237          * sequence the script uses, except in a few cases it returns 0 */
10238         if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
10239             zero_of_run = script_zeros[script_of_char];
10240         }
10241
10242         /* Now we can see if the script of the character is the same as that of
10243          * the run */
10244         if (LIKELY(script_of_char == script_of_run)) {
10245             /* By far the most common case */
10246             goto scripts_match;
10247         }
10248
10249         /* Here, the scripts of the run and the current character don't match
10250          * exactly.  The run could so far have been entirely characters from
10251          * Common.  It's now time to change its script to that of this
10252          * non-Common character */
10253         if (script_of_run == SCX_Common) {
10254
10255             /* But Common contains several sets of digits.  Only the '0' set
10256              * can be part of another script. */
10257             if (zero_of_run > 0 && zero_of_run != '0') {
10258                 retval = FALSE;
10259                 break;
10260             }
10261
10262             script_of_run = script_of_char;
10263             goto scripts_match;
10264         }
10265
10266         /* Here, the script of the run isn't Common.  But characters in Common
10267          * match any script */
10268         if (script_of_char == SCX_Common) {
10269             goto scripts_match;
10270         }
10271
10272 #ifndef HAS_SCX_AUX_TABLES
10273
10274         /* Too early a Unicode version to have a code point belonging to more
10275          * than one script, so, if the scripts don't exactly match, fail */
10276         retval = FALSE;
10277         break;
10278
10279 #else
10280
10281         /* Here there is no exact match between the character's script and the
10282          * run's.  And we've handled the special cases of scripts Unknown,
10283          * Inherited, and Common.
10284          *
10285          * Negative script numbers signify that the value may be any of several
10286          * scripts, and we need to look at auxiliary information to make our
10287          * deterimination.  But if both are non-negative, we can fail now */
10288         if (LIKELY(script_of_char >= 0)) {
10289             const SCX_enum * search_in;
10290             PERL_UINT_FAST8_T search_in_len;
10291             PERL_UINT_FAST8_T i;
10292
10293             if (LIKELY(script_of_run >= 0)) {
10294                 retval = FALSE;
10295                 break;
10296             }
10297
10298             /* Use the previously constructed set of possible scripts, if any.
10299              * */
10300             if (intersection) {
10301                 search_in = intersection;
10302                 search_in_len = intersection_len;
10303             }
10304             else {
10305                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10306                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10307             }
10308
10309             for (i = 0; i < search_in_len; i++) {
10310                 if (search_in[i] == script_of_char) {
10311                     script_of_run = script_of_char;
10312                     goto scripts_match;
10313                 }
10314             }
10315
10316             retval = FALSE;
10317             break;
10318         }
10319         else if (LIKELY(script_of_run >= 0)) {
10320             /* script of character could be one of several, but run is a single
10321              * script */
10322             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10323             const PERL_UINT_FAST8_T search_in_len
10324                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10325             PERL_UINT_FAST8_T i;
10326
10327             for (i = 0; i < search_in_len; i++) {
10328                 if (search_in[i] == script_of_run) {
10329                     script_of_char = script_of_run;
10330                     goto scripts_match;
10331                 }
10332             }
10333
10334             retval = FALSE;
10335             break;
10336         }
10337         else {
10338             /* Both run and char could be in one of several scripts.  If the
10339              * intersection is empty, then this character isn't in this script
10340              * run.  Otherwise, we need to calculate the intersection to use
10341              * for future iterations of the loop, unless we are already at the
10342              * final character */
10343             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10344             const PERL_UINT_FAST8_T char_len
10345                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10346             const SCX_enum * search_run;
10347             PERL_UINT_FAST8_T run_len;
10348
10349             SCX_enum * new_overlap = NULL;
10350             PERL_UINT_FAST8_T i, j;
10351
10352             if (intersection) {
10353                 search_run = intersection;
10354                 run_len = intersection_len;
10355             }
10356             else {
10357                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10358                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10359             }
10360
10361             intersection_len = 0;
10362
10363             for (i = 0; i < run_len; i++) {
10364                 for (j = 0; j < char_len; j++) {
10365                     if (search_run[i] == search_char[j]) {
10366
10367                         /* Here, the script at i,j matches.  That means this
10368                          * character is in the run.  But continue on to find
10369                          * the complete intersection, for the next loop
10370                          * iteration, and for the digit check after it.
10371                          *
10372                          * On the first found common script, we malloc space
10373                          * for the intersection list for the worst case of the
10374                          * intersection, which is the minimum of the number of
10375                          * scripts remaining in each set. */
10376                         if (intersection_len == 0) {
10377                             Newx(new_overlap,
10378                                  MIN(run_len - i, char_len - j),
10379                                  SCX_enum);
10380                         }
10381                         new_overlap[intersection_len++] = search_run[i];
10382                     }
10383                 }
10384             }
10385
10386             /* Here we've looked through everything.  If they have no scripts
10387              * in common, not a run */
10388             if (intersection_len == 0) {
10389                 retval = FALSE;
10390                 break;
10391             }
10392
10393             /* If there is only a single script in common, set to that.
10394              * Otherwise, use the intersection going forward */
10395             Safefree(intersection);
10396             if (intersection_len == 1) {
10397                 script_of_run = script_of_char = new_overlap[0];
10398                 Safefree(new_overlap);
10399             }
10400             else {
10401                 intersection = new_overlap;
10402             }
10403         }
10404
10405 #endif
10406
10407   scripts_match:
10408
10409         /* Here, the script of the character is compatible with that of the
10410          * run.  Either they match exactly, or one or both can be any of
10411          * several scripts, and the intersection is not empty.  If the
10412          * character is not a decimal digit, we are done with it.  Otherwise,
10413          * it could still fail if it is from a different set of 10 than seen
10414          * already (or we may not have seen any, and we need to set the
10415          * sequence).  If we have determined a single script and that script
10416          * only has one set of digits (almost all scripts are like that), then
10417          * this isn't a problem, as any digit must come from the same sequence.
10418          * The only scripts that have multiple sequences have been constructed
10419          * to be 0 in 'script_zeros[]'.
10420          *
10421          * Here we check if it is a digit. */
10422         if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
10423             && (   (          zero_of_run == 0
10424                     || (  (   script_of_char >= 0
10425                            && script_zeros[script_of_char] == 0)
10426                         ||    intersection))))
10427         {
10428             SSize_t range_zero_index;
10429             range_zero_index = _invlist_search(decimals_invlist, cp);
10430             if (   LIKELY(range_zero_index >= 0)
10431                 && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
10432             {
10433                 UV range_zero = decimals_array[range_zero_index];
10434                 if (zero_of_run) {
10435                     if (zero_of_run != range_zero) {
10436                         retval = FALSE;
10437                         break;
10438                     }
10439                 }
10440                 else {
10441                     zero_of_run = range_zero;
10442                 }
10443             }
10444         }
10445     } /* end of looping through CLOSESR text */
10446
10447     Safefree(intersection);
10448     return retval;
10449 }
10450
10451 #endif /* ifndef PERL_IN_XSUB_RE */
10452
10453 /*
10454  * ex: set ts=8 sts=4 sw=4 et:
10455  */