regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char non_utf8_target_but_utf8_required[]
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /*
 111  * Forwards.
 112  */
 113
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115
 116 #define HOPc(pos,off) \
 117         (char *)(reginfo->is_utf8_target \
 118             ? reghop3((U8*)pos, off, \
 119                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 120             : (U8*)(pos + off))
 121
 122 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 123 #define HOPBACK3(pos, off, lim) \
 124         (reginfo->is_utf8_target                          \
 125             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 126             : (pos - off >= lim)                                 \
 127                 ? (U8*)pos - off                                 \
 128                 : NULL)
 129
 130 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 131
 132 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 133 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 134
 135 /* lim must be +ve. Returns NULL on overshoot */
 136 #define HOPMAYBE3(pos,off,lim) \
 137         (reginfo->is_utf8_target                        \
 138             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 139             : ((U8*)pos + off <= lim)                   \
 140                 ? (U8*)pos + off                        \
 141                 : NULL)
 142
 143 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 144  * off must be >=0; args should be vars rather than expressions */
 145 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 146     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 147     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 148 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 149
 150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 152     : (U8*)(pos + off))
 153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 154
 155 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 156 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 157
 158 /* for use after a quantifier and before an EXACT-like node -- japhy */
 159 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 160  *
 161  * NOTE that *nothing* that affects backtracking should be in here, specifically
 162  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 163  * node that is in between two EXACT like nodes when ascertaining what the required
 164  * "follow" character is. This should probably be moved to regex compile time
 165  * although it may be done at run time beause of the REF possibility - more
 166  * investigation required. -- demerphq
 167 */
 168 #define JUMPABLE(rn) (                                                             \
 169     OP(rn) == OPEN ||                                                              \
 170     (OP(rn) == CLOSE &&                                                            \
 171      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 172     OP(rn) == EVAL ||                                                              \
 173     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 174     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 175     OP(rn) == KEEPS ||                                                             \
 176     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 177 )
 178 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 179
 180 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 181
 182 /*
 183   Search for mandatory following text node; for lookahead, the text must
 184   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 185 */
 186 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 187     while (JUMPABLE(rn)) { \
 188         const OPCODE type = OP(rn); \
 189         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 190             rn = NEXTOPER(NEXTOPER(rn)); \
 191         else if (type == PLUS) \
 192             rn = NEXTOPER(rn); \
 193         else if (type == IFMATCH) \
 194             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 195         else rn += NEXT_OFF(rn); \
 196     } \
 197 } STMT_END
 198
 199 #define SLAB_FIRST(s) (&(s)->states[0])
 200 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 201
 202 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 203 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 204 static regmatch_state * S_push_slab(pTHX);
 205
 206 #define REGCP_PAREN_ELEMS 3
 207 #define REGCP_OTHER_ELEMS 3
 208 #define REGCP_FRAME_ELEMS 1
 209 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 210  * are needed for the regexp context stack bookkeeping. */
 211
 212 STATIC CHECKPOINT
 213 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 214 {
 215     const int retval = PL_savestack_ix;
 216     const int paren_elems_to_push =
 217                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 218     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 219     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 220     I32 p;
 221     GET_RE_DEBUG_FLAGS_DECL;
 222
 223     PERL_ARGS_ASSERT_REGCPPUSH;
 224
 225     if (paren_elems_to_push < 0)
 226         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 227                    (int)paren_elems_to_push, (int)maxopenparen,
 228                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 229
 230     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 231         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 232                    " out of range (%lu-%ld)",
 233                    total_elems,
 234                    (unsigned long)maxopenparen,
 235                    (long)parenfloor);
 236
 237     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 238
 239     DEBUG_BUFFERS_r(
 240         if ((int)maxopenparen > (int)parenfloor)
 241             Perl_re_exec_indentf( aTHX_
 242                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 243                 depth,
 244                 PTR2UV(rex),
 245                 PTR2UV(rex->offs)
 246             );
 247     );
 248     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 249 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 250         SSPUSHIV(rex->offs[p].end);
 251         SSPUSHIV(rex->offs[p].start);
 252         SSPUSHINT(rex->offs[p].start_tmp);
 253         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 254             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 255             depth,
 256             (UV)p,
 257             (IV)rex->offs[p].start,
 258             (IV)rex->offs[p].start_tmp,
 259             (IV)rex->offs[p].end
 260         ));
 261     }
 262 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 263     SSPUSHINT(maxopenparen);
 264     SSPUSHINT(rex->lastparen);
 265     SSPUSHINT(rex->lastcloseparen);
 266     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 267
 268     return retval;
 269 }
 270
 271 /* These are needed since we do not localize EVAL nodes: */
 272 #define REGCP_SET(cp)                                           \
 273     DEBUG_STATE_r(                                              \
 274         Perl_re_exec_indentf( aTHX_                             \
 275             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 276             depth, (IV)PL_savestack_ix                          \
 277         )                                                       \
 278     );                                                          \
 279     cp = PL_savestack_ix
 280
 281 #define REGCP_UNWIND(cp)                                        \
 282     DEBUG_STATE_r(                                              \
 283         if (cp != PL_savestack_ix)                              \
 284             Perl_re_exec_indentf( aTHX_                         \
 285                 "Clearing an EVAL scope, savestack=%"           \
 286                 IVdf "..%" IVdf "\n",                           \
 287                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 288             )                                                   \
 289     );                                                          \
 290     regcpblow(cp)
 291
 292 /* set the start and end positions of capture ix */
 293 #define CLOSE_CAPTURE(ix, s, e)                                            \
 294     rex->offs[ix].start = s;                                               \
 295     rex->offs[ix].end = e;                                                 \
 296     if (ix > rex->lastparen)                                               \
 297         rex->lastparen = ix;                                               \
 298     rex->lastcloseparen = ix;                                              \
 299     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 300         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 301         depth,                                                             \
 302         PTR2UV(rex),                                                       \
 303         PTR2UV(rex->offs),                                                 \
 304         (UV)ix,                                                            \
 305         (IV)rex->offs[ix].start,                                           \
 306         (IV)rex->offs[ix].end,                                             \
 307         (UV)rex->lastparen                                                 \
 308     ))
 309
 310 #define UNWIND_PAREN(lp, lcp)               \
 311     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 312         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 313         depth,                              \
 314         PTR2UV(rex),                        \
 315         PTR2UV(rex->offs),                  \
 316         (UV)(lp),                           \
 317         (UV)(rex->lastparen),               \
 318         (UV)(lcp)                           \
 319     ));                                     \
 320     for (n = rex->lastparen; n > lp; n--)   \
 321         rex->offs[n].end = -1;              \
 322     rex->lastparen = n;                     \
 323     rex->lastcloseparen = lcp;
 324
 325
 326 STATIC void
 327 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 328 {
 329     UV i;
 330     U32 paren;
 331     GET_RE_DEBUG_FLAGS_DECL;
 332
 333     PERL_ARGS_ASSERT_REGCPPOP;
 334
 335     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 336     i = SSPOPUV;
 337     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 338     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 339     rex->lastcloseparen = SSPOPINT;
 340     rex->lastparen = SSPOPINT;
 341     *maxopenparen_p = SSPOPINT;
 342
 343     i -= REGCP_OTHER_ELEMS;
 344     /* Now restore the parentheses context. */
 345     DEBUG_BUFFERS_r(
 346         if (i || rex->lastparen + 1 <= rex->nparens)
 347             Perl_re_exec_indentf( aTHX_
 348                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 349                 depth,
 350                 PTR2UV(rex),
 351                 PTR2UV(rex->offs)
 352             );
 353     );
 354     paren = *maxopenparen_p;
 355     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 356         SSize_t tmps;
 357         rex->offs[paren].start_tmp = SSPOPINT;
 358         rex->offs[paren].start = SSPOPIV;
 359         tmps = SSPOPIV;
 360         if (paren <= rex->lastparen)
 361             rex->offs[paren].end = tmps;
 362         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 363             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 364             depth,
 365             (UV)paren,
 366             (IV)rex->offs[paren].start,
 367             (IV)rex->offs[paren].start_tmp,
 368             (IV)rex->offs[paren].end,
 369             (paren > rex->lastparen ? "(skipped)" : ""));
 370         );
 371         paren--;
 372     }
 373 #if 1
 374     /* It would seem that the similar code in regtry()
 375      * already takes care of this, and in fact it is in
 376      * a better location to since this code can #if 0-ed out
 377      * but the code in regtry() is needed or otherwise tests
 378      * requiring null fields (pat.t#187 and split.t#{13,14}
 379      * (as of patchlevel 7877)  will fail.  Then again,
 380      * this code seems to be necessary or otherwise
 381      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 382      * --jhi updated by dapm */
 383     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 384         if (i > *maxopenparen_p)
 385             rex->offs[i].start = -1;
 386         rex->offs[i].end = -1;
 387         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 388             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 389             depth,
 390             (UV)i,
 391             (i > *maxopenparen_p) ? "-1" : "  "
 392         ));
 393     }
 394 #endif
 395 }
 396
 397 /* restore the parens and associated vars at savestack position ix,
 398  * but without popping the stack */
 399
 400 STATIC void
 401 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 402 {
 403     I32 tmpix = PL_savestack_ix;
 404     PERL_ARGS_ASSERT_REGCP_RESTORE;
 405
 406     PL_savestack_ix = ix;
 407     regcppop(rex, maxopenparen_p);
 408     PL_savestack_ix = tmpix;
 409 }
 410
 411 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 412
 413 #ifndef PERL_IN_XSUB_RE
 414
 415 bool
 416 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 417 {
 418     /* Returns a boolean as to whether or not 'character' is a member of the
 419      * Posix character class given by 'classnum' that should be equivalent to a
 420      * value in the typedef '_char_class_number'.
 421      *
 422      * Ideally this could be replaced by a just an array of function pointers
 423      * to the C library functions that implement the macros this calls.
 424      * However, to compile, the precise function signatures are required, and
 425      * these may vary from platform to to platform.  To avoid having to figure
 426      * out what those all are on each platform, I (khw) am using this method,
 427      * which adds an extra layer of function call overhead (unless the C
 428      * optimizer strips it away).  But we don't particularly care about
 429      * performance with locales anyway. */
 430
 431     switch ((_char_class_number) classnum) {
 432         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 433         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 434         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 435         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 436         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 437                                         || isUPPER_LC(character);
 438         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 439         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 440         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 441         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 442         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 443         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 444         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 445         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 446         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 447         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 448         default:    /* VERTSPACE should never occur in locales */
 449             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 450     }
 451
 452     NOT_REACHED; /* NOTREACHED */
 453     return FALSE;
 454 }
 455
 456 #endif
 457
 458 PERL_STATIC_INLINE I32
 459 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
 460 {
 461     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
 462      * folded.  Works on all folds representable without UTF-8, except for
 463      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
 464      * check that the strings each have at least 'len' characters.
 465      *
 466      * There is almost an identical API function where s2 need not be folded:
 467      * Perl_foldEQ_latin1() */
 468
 469     const U8 *a = (const U8 *)s1;
 470     const U8 *b = (const U8 *)s2;
 471
 472     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
 473
 474     assert(len >= 0);
 475
 476     while (len--) {
 477         assert(! isUPPER_L1(*b));
 478         if (toLOWER_L1(*a) != *b) {
 479             return 0;
 480         }
 481         a++, b++;
 482     }
 483     return 1;
 484 }
 485
 486 STATIC bool
 487 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 488 {
 489     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 490      * 'character' is a member of the Posix character class given by 'classnum'
 491      * that should be equivalent to a value in the typedef
 492      * '_char_class_number'.
 493      *
 494      * This just calls isFOO_lc on the code point for the character if it is in
 495      * the range 0-255.  Outside that range, all characters use Unicode
 496      * rules, ignoring any locale.  So use the Unicode function if this class
 497      * requires an inversion list, and use the Unicode macro otherwise. */
 498
 499     dVAR;
 500
 501     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 502
 503     if (UTF8_IS_INVARIANT(*character)) {
 504         return isFOO_lc(classnum, *character);
 505     }
 506     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 507         return isFOO_lc(classnum,
 508                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 509     }
 510
 511     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 512
 513     switch ((_char_class_number) classnum) {
 514         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 515         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 516         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 517         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 518         default:
 519             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 520                                         utf8_to_uvchr_buf(character, e, NULL));
 521     }
 522
 523     return FALSE; /* Things like CNTRL are always below 256 */
 524 }
 525
 526 STATIC U8 *
 527 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 528 {
 529     /* Returns the position of the first byte in the sequence between 's' and
 530      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 531      * */
 532
 533     PERL_ARGS_ASSERT_FIND_SPAN_END;
 534
 535     assert(send >= s);
 536
 537     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 538                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 539                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 540     {
 541         PERL_UINTMAX_T span_word;
 542
 543         /* Process per-byte until reach word boundary.  XXX This loop could be
 544          * eliminated if we knew that this platform had fast unaligned reads */
 545         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 546             if (*s != span_byte) {
 547                 return s;
 548             }
 549             s++;
 550         }
 551
 552         /* Create a word filled with the bytes we are spanning */
 553         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 554
 555         /* Process per-word as long as we have at least a full word left */
 556         do {
 557
 558             /* Keep going if the whole word is composed of 'span_byte's */
 559             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 560                 s += PERL_WORDSIZE;
 561                 continue;
 562             }
 563
 564             /* Here, at least one byte in the word isn't 'span_byte'. */
 565
 566 #ifdef EBCDIC
 567
 568             break;
 569
 570 #else
 571
 572             /* This xor leaves 1 bits only in those non-matching bytes */
 573             span_word ^= * (PERL_UINTMAX_T *) s;
 574
 575             /* Make sure the upper bit of each non-matching byte is set.  This
 576              * makes each such byte look like an ASCII platform variant byte */
 577             span_word |= span_word << 1;
 578             span_word |= span_word << 2;
 579             span_word |= span_word << 4;
 580
 581             /* That reduces the problem to what this function solves */
 582             return s + _variant_byte_number(span_word);
 583
 584 #endif
 585
 586         } while (s + PERL_WORDSIZE <= send);
 587     }
 588
 589     /* Process the straggler bytes beyond the final word boundary */
 590     while (s < send) {
 591         if (*s != span_byte) {
 592             return s;
 593         }
 594         s++;
 595     }
 596
 597     return s;
 598 }
 599
 600 STATIC U8 *
 601 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 602 {
 603     /* Returns the position of the first byte in the sequence between 's'
 604      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 605      * returns 'send' if none found.  It uses word-level operations instead of
 606      * byte to speed up the process */
 607
 608     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 609
 610     assert(send >= s);
 611     assert((byte & mask) == byte);
 612
 613 #ifndef EBCDIC
 614
 615     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 616                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 617                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 618     {
 619         PERL_UINTMAX_T word, mask_word;
 620
 621         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 622             if (((*s) & mask) == byte) {
 623                 return s;
 624             }
 625             s++;
 626         }
 627
 628         word      = PERL_COUNT_MULTIPLIER * byte;
 629         mask_word = PERL_COUNT_MULTIPLIER * mask;
 630
 631         do {
 632             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 633
 634             /* If 'masked' contains bytes with the bit pattern of 'byte' within
 635              * it, xoring with 'word' will leave each of the 8 bits in such
 636              * bytes be 0, and no byte containing any other bit pattern will be
 637              * 0. */
 638             masked ^= word;
 639
 640             /* This causes the most significant bit to be set to 1 for any
 641              * bytes in the word that aren't completely 0 */
 642             masked |= masked << 1;
 643             masked |= masked << 2;
 644             masked |= masked << 4;
 645
 646             /* The msbits are the same as what marks a byte as variant, so we
 647              * can use this mask.  If all msbits are 1, the word doesn't
 648              * contain 'byte' */
 649             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 650                 s += PERL_WORDSIZE;
 651                 continue;
 652             }
 653
 654             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 655              * and any that are, are 0.  Complement and re-AND to swap that */
 656             masked = ~ masked;
 657             masked &= PERL_VARIANTS_WORD_MASK;
 658
 659             /* This reduces the problem to that solved by this function */
 660             s += _variant_byte_number(masked);
 661             return s;
 662
 663         } while (s + PERL_WORDSIZE <= send);
 664     }
 665
 666 #endif
 667
 668     while (s < send) {
 669         if (((*s) & mask) == byte) {
 670             return s;
 671         }
 672         s++;
 673     }
 674
 675     return s;
 676 }
 677
 678 STATIC U8 *
 679 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 680 {
 681     /* Returns the position of the first byte in the sequence between 's' and
 682      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 683      * 'span_byte' should have been ANDed with 'mask' in the call of this
 684      * function.  Returns 'send' if none found.  Works like find_span_end(),
 685      * except for the AND */
 686
 687     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 688
 689     assert(send >= s);
 690     assert((span_byte & mask) == span_byte);
 691
 692     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 693                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 694                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 695     {
 696         PERL_UINTMAX_T span_word, mask_word;
 697
 698         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 699             if (((*s) & mask) != span_byte) {
 700                 return s;
 701             }
 702             s++;
 703         }
 704
 705         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 706         mask_word = PERL_COUNT_MULTIPLIER * mask;
 707
 708         do {
 709             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 710
 711             if (masked == span_word) {
 712                 s += PERL_WORDSIZE;
 713                 continue;
 714             }
 715
 716 #ifdef EBCDIC
 717
 718             break;
 719
 720 #else
 721
 722             masked ^= span_word;
 723             masked |= masked << 1;
 724             masked |= masked << 2;
 725             masked |= masked << 4;
 726             return s + _variant_byte_number(masked);
 727
 728 #endif
 729
 730         } while (s + PERL_WORDSIZE <= send);
 731     }
 732
 733     while (s < send) {
 734         if (((*s) & mask) != span_byte) {
 735             return s;
 736         }
 737         s++;
 738     }
 739
 740     return s;
 741 }
 742
 743 /*
 744  * pregexec and friends
 745  */
 746
 747 #ifndef PERL_IN_XSUB_RE
 748 /*
 749  - pregexec - match a regexp against a string
 750  */
 751 I32
 752 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 753          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 754 /* stringarg: the point in the string at which to begin matching */
 755 /* strend:    pointer to null at end of string */
 756 /* strbeg:    real beginning of string */
 757 /* minend:    end of match must be >= minend bytes after stringarg. */
 758 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 759  *            itself is accessed via the pointers above */
 760 /* nosave:    For optimizations. */
 761 {
 762     PERL_ARGS_ASSERT_PREGEXEC;
 763
 764     return
 765         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 766                       nosave ? 0 : REXEC_COPY_STR);
 767 }
 768 #endif
 769
 770
 771
 772 /* re_intuit_start():
 773  *
 774  * Based on some optimiser hints, try to find the earliest position in the
 775  * string where the regex could match.
 776  *
 777  *   rx:     the regex to match against
 778  *   sv:     the SV being matched: only used for utf8 flag; the string
 779  *           itself is accessed via the pointers below. Note that on
 780  *           something like an overloaded SV, SvPOK(sv) may be false
 781  *           and the string pointers may point to something unrelated to
 782  *           the SV itself.
 783  *   strbeg: real beginning of string
 784  *   strpos: the point in the string at which to begin matching
 785  *   strend: pointer to the byte following the last char of the string
 786  *   flags   currently unused; set to 0
 787  *   data:   currently unused; set to NULL
 788  *
 789  * The basic idea of re_intuit_start() is to use some known information
 790  * about the pattern, namely:
 791  *
 792  *   a) the longest known anchored substring (i.e. one that's at a
 793  *      constant offset from the beginning of the pattern; but not
 794  *      necessarily at a fixed offset from the beginning of the
 795  *      string);
 796  *   b) the longest floating substring (i.e. one that's not at a constant
 797  *      offset from the beginning of the pattern);
 798  *   c) Whether the pattern is anchored to the string; either
 799  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 800  *      or anchored to pos(): /\G/;
 801  *   d) A start class: a real or synthetic character class which
 802  *      represents which characters are legal at the start of the pattern;
 803  *
 804  * to either quickly reject the match, or to find the earliest position
 805  * within the string at which the pattern might match, thus avoiding
 806  * running the full NFA engine at those earlier locations, only to
 807  * eventually fail and retry further along.
 808  *
 809  * Returns NULL if the pattern can't match, or returns the address within
 810  * the string which is the earliest place the match could occur.
 811  *
 812  * The longest of the anchored and floating substrings is called 'check'
 813  * and is checked first. The other is called 'other' and is checked
 814  * second. The 'other' substring may not be present.  For example,
 815  *
 816  *    /(abc|xyz)ABC\d{0,3}DEFG/
 817  *
 818  * will have
 819  *
 820  *   check substr (float)    = "DEFG", offset 6..9 chars
 821  *   other substr (anchored) = "ABC",  offset 3..3 chars
 822  *   stclass = [ax]
 823  *
 824  * Be aware that during the course of this function, sometimes 'anchored'
 825  * refers to a substring being anchored relative to the start of the
 826  * pattern, and sometimes to the pattern itself being anchored relative to
 827  * the string. For example:
 828  *
 829  *   /\dabc/:   "abc" is anchored to the pattern;
 830  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 831  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 832  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 833  *                    but the pattern is anchored to the string.
 834  */
 835
 836 char *
 837 Perl_re_intuit_start(pTHX_
 838                     REGEXP * const rx,
 839                     SV *sv,
 840                     const char * const strbeg,
 841                     char *strpos,
 842                     char *strend,
 843                     const U32 flags,
 844                     re_scream_pos_data *data)
 845 {
 846     struct regexp *const prog = ReANY(rx);
 847     SSize_t start_shift = prog->check_offset_min;
 848     /* Should be nonnegative! */
 849     SSize_t end_shift   = 0;
 850     /* current lowest pos in string where the regex can start matching */
 851     char *rx_origin = strpos;
 852     SV *check;
 853     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 854     U8   other_ix = 1 - prog->substrs->check_ix;
 855     bool ml_anch = 0;
 856     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 857     char *check_at = NULL;              /* check substr found at this pos */
 858     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 859     RXi_GET_DECL(prog,progi);
 860     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 861     regmatch_info *const reginfo = &reginfo_buf;
 862     GET_RE_DEBUG_FLAGS_DECL;
 863
 864     PERL_ARGS_ASSERT_RE_INTUIT_START;
 865     PERL_UNUSED_ARG(flags);
 866     PERL_UNUSED_ARG(data);
 867
 868     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 869                 "Intuit: trying to determine minimum start position...\n"));
 870
 871     /* for now, assume that all substr offsets are positive. If at some point
 872      * in the future someone wants to do clever things with lookbehind and
 873      * -ve offsets, they'll need to fix up any code in this function
 874      * which uses these offsets. See the thread beginning
 875      * <20140113145929.GF27210@iabyn.com>
 876      */
 877     assert(prog->substrs->data[0].min_offset >= 0);
 878     assert(prog->substrs->data[0].max_offset >= 0);
 879     assert(prog->substrs->data[1].min_offset >= 0);
 880     assert(prog->substrs->data[1].max_offset >= 0);
 881     assert(prog->substrs->data[2].min_offset >= 0);
 882     assert(prog->substrs->data[2].max_offset >= 0);
 883
 884     /* for now, assume that if both present, that the floating substring
 885      * doesn't start before the anchored substring.
 886      * If you break this assumption (e.g. doing better optimisations
 887      * with lookahead/behind), then you'll need to audit the code in this
 888      * function carefully first
 889      */
 890     assert(
 891             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 892               && (prog->float_utf8    || prog->float_substr))
 893            || (prog->float_min_offset >= prog->anchored_offset));
 894
 895     /* byte rather than char calculation for efficiency. It fails
 896      * to quickly reject some cases that can't match, but will reject
 897      * them later after doing full char arithmetic */
 898     if (prog->minlen > strend - strpos) {
 899         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 900                               "  String too short...\n"));
 901         goto fail;
 902     }
 903
 904     RXp_MATCH_UTF8_set(prog, utf8_target);
 905     reginfo->is_utf8_target = cBOOL(utf8_target);
 906     reginfo->info_aux = NULL;
 907     reginfo->strbeg = strbeg;
 908     reginfo->strend = strend;
 909     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 910     reginfo->intuit = 1;
 911     /* not actually used within intuit, but zero for safety anyway */
 912     reginfo->poscache_maxiter = 0;
 913
 914     if (utf8_target) {
 915         if ((!prog->anchored_utf8 && prog->anchored_substr)
 916                 || (!prog->float_utf8 && prog->float_substr))
 917             to_utf8_substr(prog);
 918         check = prog->check_utf8;
 919     } else {
 920         if (!prog->check_substr && prog->check_utf8) {
 921             if (! to_byte_substr(prog)) {
 922                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 923             }
 924         }
 925         check = prog->check_substr;
 926     }
 927
 928     /* dump the various substring data */
 929     DEBUG_OPTIMISE_MORE_r({
 930         int i;
 931         for (i=0; i<=2; i++) {
 932             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 933                                   : prog->substrs->data[i].substr);
 934             if (!sv)
 935                 continue;
 936
 937             Perl_re_printf( aTHX_
 938                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 939                 " useful=%" IVdf " utf8=%d [%s]\n",
 940                 i,
 941                 (IV)prog->substrs->data[i].min_offset,
 942                 (IV)prog->substrs->data[i].max_offset,
 943                 (IV)prog->substrs->data[i].end_shift,
 944                 BmUSEFUL(sv),
 945                 utf8_target ? 1 : 0,
 946                 SvPEEK(sv));
 947         }
 948     });
 949
 950     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 951
 952         /* ml_anch: check after \n?
 953          *
 954          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 955          * with /.*.../, these flags will have been added by the
 956          * compiler:
 957          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 958          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 959          */
 960         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 961                    && !(prog->intflags & PREGf_IMPLICIT);
 962
 963         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 964             /* we are only allowed to match at BOS or \G */
 965
 966             /* trivially reject if there's a BOS anchor and we're not at BOS.
 967              *
 968              * Note that we don't try to do a similar quick reject for
 969              * \G, since generally the caller will have calculated strpos
 970              * based on pos() and gofs, so the string is already correctly
 971              * anchored by definition; and handling the exceptions would
 972              * be too fiddly (e.g. REXEC_IGNOREPOS).
 973              */
 974             if (   strpos != strbeg
 975                 && (prog->intflags & PREGf_ANCH_SBOL))
 976             {
 977                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 978                                 "  Not at start...\n"));
 979                 goto fail;
 980             }
 981
 982             /* in the presence of an anchor, the anchored (relative to the
 983              * start of the regex) substr must also be anchored relative
 984              * to strpos. So quickly reject if substr isn't found there.
 985              * This works for \G too, because the caller will already have
 986              * subtracted gofs from pos, and gofs is the offset from the
 987              * \G to the start of the regex. For example, in /.abc\Gdef/,
 988              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 989              * caller will have set strpos=pos()-4; we look for the substr
 990              * at position pos()-4+1, which lines up with the "a" */
 991
 992             if (prog->check_offset_min == prog->check_offset_max) {
 993                 /* Substring at constant offset from beg-of-str... */
 994                 SSize_t slen = SvCUR(check);
 995                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 996
 997                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 998                     "  Looking for check substr at fixed offset %" IVdf "...\n",
 999                     (IV)prog->check_offset_min));
1000
1001                 if (SvTAIL(check)) {
1002                     /* In this case, the regex is anchored at the end too.
1003                      * Unless it's a multiline match, the lengths must match
1004                      * exactly, give or take a \n.  NB: slen >= 1 since
1005                      * the last char of check is \n */
1006                     if (!multiline
1007                         && (   strend - s > slen
1008                             || strend - s < slen - 1
1009                             || (strend - s == slen && strend[-1] != '\n')))
1010                     {
1011                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1012                                             "  String too long...\n"));
1013                         goto fail_finish;
1014                     }
1015                     /* Now should match s[0..slen-2] */
1016                     slen--;
1017                 }
1018                 if (slen && (strend - s < slen
1019                     || *SvPVX_const(check) != *s
1020                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1021                 {
1022                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1023                                     "  String not equal...\n"));
1024                     goto fail_finish;
1025                 }
1026
1027                 check_at = s;
1028                 goto success_at_start;
1029             }
1030         }
1031     }
1032
1033     end_shift = prog->check_end_shift;
1034
1035 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1036     if (end_shift < 0)
1037         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1038                    (IV)end_shift, RX_PRECOMP(rx));
1039 #endif
1040
1041   restart:
1042
1043     /* This is the (re)entry point of the main loop in this function.
1044      * The goal of this loop is to:
1045      * 1) find the "check" substring in the region rx_origin..strend
1046      *    (adjusted by start_shift / end_shift). If not found, reject
1047      *    immediately.
1048      * 2) If it exists, look for the "other" substr too if defined; for
1049      *    example, if the check substr maps to the anchored substr, then
1050      *    check the floating substr, and vice-versa. If not found, go
1051      *    back to (1) with rx_origin suitably incremented.
1052      * 3) If we find an rx_origin position that doesn't contradict
1053      *    either of the substrings, then check the possible additional
1054      *    constraints on rx_origin of /^.../m or a known start class.
1055      *    If these fail, then depending on which constraints fail, jump
1056      *    back to here, or to various other re-entry points further along
1057      *    that skip some of the first steps.
1058      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1059      *    substring. If the start position was determined to be at the
1060      *    beginning of the string  - so, not rejected, but not optimised,
1061      *    since we have to run regmatch from position 0 - decrement the
1062      *    BmUSEFUL() count. Otherwise increment it.
1063      */
1064
1065
1066     /* first, look for the 'check' substring */
1067
1068     {
1069         U8* start_point;
1070         U8* end_point;
1071
1072         DEBUG_OPTIMISE_MORE_r({
1073             Perl_re_printf( aTHX_
1074                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1075                 " Start shift: %" IVdf " End shift %" IVdf
1076                 " Real end Shift: %" IVdf "\n",
1077                 (IV)(rx_origin - strbeg),
1078                 (IV)prog->check_offset_min,
1079                 (IV)start_shift,
1080                 (IV)end_shift,
1081                 (IV)prog->check_end_shift);
1082         });
1083
1084         end_point = HOPBACK3(strend, end_shift, rx_origin);
1085         if (!end_point)
1086             goto fail_finish;
1087         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1088         if (!start_point)
1089             goto fail_finish;
1090
1091
1092         /* If the regex is absolutely anchored to either the start of the
1093          * string (SBOL) or to pos() (ANCH_GPOS), then
1094          * check_offset_max represents an upper bound on the string where
1095          * the substr could start. For the ANCH_GPOS case, we assume that
1096          * the caller of intuit will have already set strpos to
1097          * pos()-gofs, so in this case strpos + offset_max will still be
1098          * an upper bound on the substr.
1099          */
1100         if (!ml_anch
1101             && prog->intflags & PREGf_ANCH
1102             && prog->check_offset_max != SSize_t_MAX)
1103         {
1104             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1105             const char * const anchor =
1106                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1107             SSize_t targ_len = (char*)end_point - anchor;
1108
1109             if (check_len > targ_len) {
1110                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1111                               "Target string too short to match required substring...\n"));
1112                 goto fail_finish;
1113             }
1114
1115             /* do a bytes rather than chars comparison. It's conservative;
1116              * so it skips doing the HOP if the result can't possibly end
1117              * up earlier than the old value of end_point.
1118              */
1119             assert(anchor + check_len <= (char *)end_point);
1120             if (prog->check_offset_max + check_len < targ_len) {
1121                 end_point = HOP3lim((U8*)anchor,
1122                                 prog->check_offset_max,
1123                                 end_point - check_len
1124                             )
1125                             + check_len;
1126                 if (end_point < start_point)
1127                     goto fail_finish;
1128             }
1129         }
1130
1131         check_at = fbm_instr( start_point, end_point,
1132                       check, multiline ? FBMrf_MULTILINE : 0);
1133
1134         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1135             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1136             (IV)((char*)start_point - strbeg),
1137             (IV)((char*)end_point   - strbeg),
1138             (IV)(check_at ? check_at - strbeg : -1)
1139         ));
1140
1141         /* Update the count-of-usability, remove useless subpatterns,
1142             unshift s.  */
1143
1144         DEBUG_EXECUTE_r({
1145             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1146                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1147             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1148                               (check_at ? "Found" : "Did not find"),
1149                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1150                     ? "anchored" : "floating"),
1151                 quoted,
1152                 RE_SV_TAIL(check),
1153                 (check_at ? " at offset " : "...\n") );
1154         });
1155
1156         if (!check_at)
1157             goto fail_finish;
1158         /* set rx_origin to the minimum position where the regex could start
1159          * matching, given the constraint of the just-matched check substring.
1160          * But don't set it lower than previously.
1161          */
1162
1163         if (check_at - rx_origin > prog->check_offset_max)
1164             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1165         /* Finish the diagnostic message */
1166         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1167             "%ld (rx_origin now %" IVdf ")...\n",
1168             (long)(check_at - strbeg),
1169             (IV)(rx_origin - strbeg)
1170         ));
1171     }
1172
1173
1174     /* now look for the 'other' substring if defined */
1175
1176     if (prog->substrs->data[other_ix].utf8_substr
1177         || prog->substrs->data[other_ix].substr)
1178     {
1179         /* Take into account the "other" substring. */
1180         char *last, *last1;
1181         char *s;
1182         SV* must;
1183         struct reg_substr_datum *other;
1184
1185       do_other_substr:
1186         other = &prog->substrs->data[other_ix];
1187         if (!utf8_target && !other->substr) {
1188             if (!to_byte_substr(prog)) {
1189                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1190             }
1191         }
1192
1193         /* if "other" is anchored:
1194          * we've previously found a floating substr starting at check_at.
1195          * This means that the regex origin must lie somewhere
1196          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1197          * and max:                 HOP3(check_at, -check_offset_min)
1198          * (except that min will be >= strpos)
1199          * So the fixed  substr must lie somewhere between
1200          *  HOP3(min, anchored_offset)
1201          *  HOP3(max, anchored_offset) + SvCUR(substr)
1202          */
1203
1204         /* if "other" is floating
1205          * Calculate last1, the absolute latest point where the
1206          * floating substr could start in the string, ignoring any
1207          * constraints from the earlier fixed match. It is calculated
1208          * as follows:
1209          *
1210          * strend - prog->minlen (in chars) is the absolute latest
1211          * position within the string where the origin of the regex
1212          * could appear. The latest start point for the floating
1213          * substr is float_min_offset(*) on from the start of the
1214          * regex.  last1 simply combines thee two offsets.
1215          *
1216          * (*) You might think the latest start point should be
1217          * float_max_offset from the regex origin, and technically
1218          * you'd be correct. However, consider
1219          *    /a\d{2,4}bcd\w/
1220          * Here, float min, max are 3,5 and minlen is 7.
1221          * This can match either
1222          *    /a\d\dbcd\w/
1223          *    /a\d\d\dbcd\w/
1224          *    /a\d\d\d\dbcd\w/
1225          * In the first case, the regex matches minlen chars; in the
1226          * second, minlen+1, in the third, minlen+2.
1227          * In the first case, the floating offset is 3 (which equals
1228          * float_min), in the second, 4, and in the third, 5 (which
1229          * equals float_max). In all cases, the floating string bcd
1230          * can never start more than 4 chars from the end of the
1231          * string, which equals minlen - float_min. As the substring
1232          * starts to match more than float_min from the start of the
1233          * regex, it makes the regex match more than minlen chars,
1234          * and the two cancel each other out. So we can always use
1235          * float_min - minlen, rather than float_max - minlen for the
1236          * latest position in the string.
1237          *
1238          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1239          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1240          */
1241
1242         assert(prog->minlen >= other->min_offset);
1243         last1 = HOP3c(strend,
1244                         other->min_offset - prog->minlen, strbeg);
1245
1246         if (other_ix) {/* i.e. if (other-is-float) */
1247             /* last is the latest point where the floating substr could
1248              * start, *given* any constraints from the earlier fixed
1249              * match. This constraint is that the floating string starts
1250              * <= float_max_offset chars from the regex origin (rx_origin).
1251              * If this value is less than last1, use it instead.
1252              */
1253             assert(rx_origin <= last1);
1254             last =
1255                 /* this condition handles the offset==infinity case, and
1256                  * is a short-cut otherwise. Although it's comparing a
1257                  * byte offset to a char length, it does so in a safe way,
1258                  * since 1 char always occupies 1 or more bytes,
1259                  * so if a string range is  (last1 - rx_origin) bytes,
1260                  * it will be less than or equal to  (last1 - rx_origin)
1261                  * chars; meaning it errs towards doing the accurate HOP3
1262                  * rather than just using last1 as a short-cut */
1263                 (last1 - rx_origin) < other->max_offset
1264                     ? last1
1265                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1266         }
1267         else {
1268             assert(strpos + start_shift <= check_at);
1269             last = HOP4c(check_at, other->min_offset - start_shift,
1270                         strbeg, strend);
1271         }
1272
1273         s = HOP3c(rx_origin, other->min_offset, strend);
1274         if (s < other_last)     /* These positions already checked */
1275             s = other_last;
1276
1277         must = utf8_target ? other->utf8_substr : other->substr;
1278         assert(SvPOK(must));
1279         {
1280             char *from = s;
1281             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1282
1283             if (to > strend)
1284                 to = strend;
1285             if (from > to) {
1286                 s = NULL;
1287                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1288                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1289                     (IV)(from - strbeg),
1290                     (IV)(to   - strbeg)
1291                 ));
1292             }
1293             else {
1294                 s = fbm_instr(
1295                     (unsigned char*)from,
1296                     (unsigned char*)to,
1297                     must,
1298                     multiline ? FBMrf_MULTILINE : 0
1299                 );
1300                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1301                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1302                     (IV)(from - strbeg),
1303                     (IV)(to   - strbeg),
1304                     (IV)(s ? s - strbeg : -1)
1305                 ));
1306             }
1307         }
1308
1309         DEBUG_EXECUTE_r({
1310             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1311                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1312             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1313                 s ? "Found" : "Contradicts",
1314                 other_ix ? "floating" : "anchored",
1315                 quoted, RE_SV_TAIL(must));
1316         });
1317
1318
1319         if (!s) {
1320             /* last1 is latest possible substr location. If we didn't
1321              * find it before there, we never will */
1322             if (last >= last1) {
1323                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1324                                         "; giving up...\n"));
1325                 goto fail_finish;
1326             }
1327
1328             /* try to find the check substr again at a later
1329              * position. Maybe next time we'll find the "other" substr
1330              * in range too */
1331             other_last = HOP3c(last, 1, strend) /* highest failure */;
1332             rx_origin =
1333                 other_ix /* i.e. if other-is-float */
1334                     ? HOP3c(rx_origin, 1, strend)
1335                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1336             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1337                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1338                 (other_ix ? "floating" : "anchored"),
1339                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1340                 (IV)(rx_origin - strbeg)
1341             ));
1342             goto restart;
1343         }
1344         else {
1345             if (other_ix) { /* if (other-is-float) */
1346                 /* other_last is set to s, not s+1, since its possible for
1347                  * a floating substr to fail first time, then succeed
1348                  * second time at the same floating position; e.g.:
1349                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1350                  * The first time round, anchored and float match at
1351                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1352                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1353                  */
1354                 other_last = s;
1355             }
1356             else {
1357                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1358                 other_last = HOP3c(s, 1, strend);
1359             }
1360             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1361                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1362                   (long)(s - strbeg),
1363                 (IV)(rx_origin - strbeg)
1364               ));
1365
1366         }
1367     }
1368     else {
1369         DEBUG_OPTIMISE_MORE_r(
1370             Perl_re_printf( aTHX_
1371                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1372                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1373                 " strend:%" IVdf "\n",
1374                 (IV)prog->check_offset_min,
1375                 (IV)prog->check_offset_max,
1376                 (IV)(check_at-strbeg),
1377                 (IV)(rx_origin-strbeg),
1378                 (IV)(rx_origin-check_at),
1379                 (IV)(strend-strbeg)
1380             )
1381         );
1382     }
1383
1384   postprocess_substr_matches:
1385
1386     /* handle the extra constraint of /^.../m if present */
1387
1388     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1389         char *s;
1390
1391         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1392                         "  looking for /^/m anchor"));
1393
1394         /* we have failed the constraint of a \n before rx_origin.
1395          * Find the next \n, if any, even if it's beyond the current
1396          * anchored and/or floating substrings. Whether we should be
1397          * scanning ahead for the next \n or the next substr is debatable.
1398          * On the one hand you'd expect rare substrings to appear less
1399          * often than \n's. On the other hand, searching for \n means
1400          * we're effectively flipping between check_substr and "\n" on each
1401          * iteration as the current "rarest" string candidate, which
1402          * means for example that we'll quickly reject the whole string if
1403          * hasn't got a \n, rather than trying every substr position
1404          * first
1405          */
1406
1407         s = HOP3c(strend, - prog->minlen, strpos);
1408         if (s <= rx_origin ||
1409             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1410         {
1411             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1412                             "  Did not find /%s^%s/m...\n",
1413                             PL_colors[0], PL_colors[1]));
1414             goto fail_finish;
1415         }
1416
1417         /* earliest possible origin is 1 char after the \n.
1418          * (since *rx_origin == '\n', it's safe to ++ here rather than
1419          * HOP(rx_origin, 1)) */
1420         rx_origin++;
1421
1422         if (prog->substrs->check_ix == 0  /* check is anchored */
1423             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1424         {
1425             /* Position contradicts check-string; either because
1426              * check was anchored (and thus has no wiggle room),
1427              * or check was float and rx_origin is above the float range */
1428             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1429                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1430                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1431             goto restart;
1432         }
1433
1434         /* if we get here, the check substr must have been float,
1435          * is in range, and we may or may not have had an anchored
1436          * "other" substr which still contradicts */
1437         assert(prog->substrs->check_ix); /* check is float */
1438
1439         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1440             /* whoops, the anchored "other" substr exists, so we still
1441              * contradict. On the other hand, the float "check" substr
1442              * didn't contradict, so just retry the anchored "other"
1443              * substr */
1444             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1445                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1446                 PL_colors[0], PL_colors[1],
1447                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1448                 (IV)(rx_origin - strbeg)
1449             ));
1450             goto do_other_substr;
1451         }
1452
1453         /* success: we don't contradict the found floating substring
1454          * (and there's no anchored substr). */
1455         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1456             "  Found /%s^%s/m with rx_origin %ld...\n",
1457             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1458     }
1459     else {
1460         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1461             "  (multiline anchor test skipped)\n"));
1462     }
1463
1464   success_at_start:
1465
1466
1467     /* if we have a starting character class, then test that extra constraint.
1468      * (trie stclasses are too expensive to use here, we are better off to
1469      * leave it to regmatch itself) */
1470
1471     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1472         const U8* const str = (U8*)STRING(progi->regstclass);
1473
1474         /* XXX this value could be pre-computed */
1475         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1476                     ?  (reginfo->is_utf8_pat
1477                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1478                         : STR_LEN(progi->regstclass))
1479                     : 1);
1480         char * endpos;
1481         char *s;
1482         /* latest pos that a matching float substr constrains rx start to */
1483         char *rx_max_float = NULL;
1484
1485         /* if the current rx_origin is anchored, either by satisfying an
1486          * anchored substring constraint, or a /^.../m constraint, then we
1487          * can reject the current origin if the start class isn't found
1488          * at the current position. If we have a float-only match, then
1489          * rx_origin is constrained to a range; so look for the start class
1490          * in that range. if neither, then look for the start class in the
1491          * whole rest of the string */
1492
1493         /* XXX DAPM it's not clear what the minlen test is for, and why
1494          * it's not used in the floating case. Nothing in the test suite
1495          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1496          * Here are some old comments, which may or may not be correct:
1497          *
1498          *   minlen == 0 is possible if regstclass is \b or \B,
1499          *   and the fixed substr is ''$.
1500          *   Since minlen is already taken into account, rx_origin+1 is
1501          *   before strend; accidentally, minlen >= 1 guaranties no false
1502          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1503          *   0) below assumes that regstclass does not come from lookahead...
1504          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1505          *   This leaves EXACTF-ish only, which are dealt with in
1506          *   find_byclass().
1507          */
1508
1509         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1510             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1511         else if (prog->float_substr || prog->float_utf8) {
1512             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1513             endpos = HOP3clim(rx_max_float, cl_l, strend);
1514         }
1515         else
1516             endpos= strend;
1517
1518         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1519             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1520             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1521               (IV)start_shift, (IV)(check_at - strbeg),
1522               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1523
1524         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1525                             reginfo);
1526         if (!s) {
1527             if (endpos == strend) {
1528                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1529                                 "  Could not match STCLASS...\n") );
1530                 goto fail;
1531             }
1532             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1533                                "  This position contradicts STCLASS...\n") );
1534             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1535                         && !(prog->intflags & PREGf_IMPLICIT))
1536                 goto fail;
1537
1538             /* Contradict one of substrings */
1539             if (prog->anchored_substr || prog->anchored_utf8) {
1540                 if (prog->substrs->check_ix == 1) { /* check is float */
1541                     /* Have both, check_string is floating */
1542                     assert(rx_origin + start_shift <= check_at);
1543                     if (rx_origin + start_shift != check_at) {
1544                         /* not at latest position float substr could match:
1545                          * Recheck anchored substring, but not floating.
1546                          * The condition above is in bytes rather than
1547                          * chars for efficiency. It's conservative, in
1548                          * that it errs on the side of doing 'goto
1549                          * do_other_substr'. In this case, at worst,
1550                          * an extra anchored search may get done, but in
1551                          * practice the extra fbm_instr() is likely to
1552                          * get skipped anyway. */
1553                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1554                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1555                             (long)(other_last - strbeg),
1556                             (IV)(rx_origin - strbeg)
1557                         ));
1558                         goto do_other_substr;
1559                     }
1560                 }
1561             }
1562             else {
1563                 /* float-only */
1564
1565                 if (ml_anch) {
1566                     /* In the presence of ml_anch, we might be able to
1567                      * find another \n without breaking the current float
1568                      * constraint. */
1569
1570                     /* strictly speaking this should be HOP3c(..., 1, ...),
1571                      * but since we goto a block of code that's going to
1572                      * search for the next \n if any, its safe here */
1573                     rx_origin++;
1574                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1575                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1576                               PL_colors[0], PL_colors[1],
1577                               (long)(rx_origin - strbeg)) );
1578                     goto postprocess_substr_matches;
1579                 }
1580
1581                 /* strictly speaking this can never be true; but might
1582                  * be if we ever allow intuit without substrings */
1583                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1584                     goto fail;
1585
1586                 rx_origin = rx_max_float;
1587             }
1588
1589             /* at this point, any matching substrings have been
1590              * contradicted. Start again... */
1591
1592             rx_origin = HOP3c(rx_origin, 1, strend);
1593
1594             /* uses bytes rather than char calculations for efficiency.
1595              * It's conservative: it errs on the side of doing 'goto restart',
1596              * where there is code that does a proper char-based test */
1597             if (rx_origin + start_shift + end_shift > strend) {
1598                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1599                                        "  Could not match STCLASS...\n") );
1600                 goto fail;
1601             }
1602             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1603                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1604                 (prog->substrs->check_ix ? "floating" : "anchored"),
1605                 (long)(rx_origin + start_shift - strbeg),
1606                 (IV)(rx_origin - strbeg)
1607             ));
1608             goto restart;
1609         }
1610
1611         /* Success !!! */
1612
1613         if (rx_origin != s) {
1614             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1615                         "  By STCLASS: moving %ld --> %ld\n",
1616                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1617                    );
1618         }
1619         else {
1620             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1621                                   "  Does not contradict STCLASS...\n");
1622                    );
1623         }
1624     }
1625
1626     /* Decide whether using the substrings helped */
1627
1628     if (rx_origin != strpos) {
1629         /* Fixed substring is found far enough so that the match
1630            cannot start at strpos. */
1631
1632         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1633         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1634     }
1635     else {
1636         /* The found rx_origin position does not prohibit matching at
1637          * strpos, so calling intuit didn't gain us anything. Decrement
1638          * the BmUSEFUL() count on the check substring, and if we reach
1639          * zero, free it.  */
1640         if (!(prog->intflags & PREGf_NAUGHTY)
1641             && (utf8_target ? (
1642                 prog->check_utf8                /* Could be deleted already */
1643                 && --BmUSEFUL(prog->check_utf8) < 0
1644                 && (prog->check_utf8 == prog->float_utf8)
1645             ) : (
1646                 prog->check_substr              /* Could be deleted already */
1647                 && --BmUSEFUL(prog->check_substr) < 0
1648                 && (prog->check_substr == prog->float_substr)
1649             )))
1650         {
1651             /* If flags & SOMETHING - do not do it many times on the same match */
1652             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1653             /* XXX Does the destruction order has to change with utf8_target? */
1654             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1655             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1656             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1657             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1658             check = NULL;                       /* abort */
1659             /* XXXX This is a remnant of the old implementation.  It
1660                     looks wasteful, since now INTUIT can use many
1661                     other heuristics. */
1662             prog->extflags &= ~RXf_USE_INTUIT;
1663         }
1664     }
1665
1666     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1667             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1668              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1669
1670     return rx_origin;
1671
1672   fail_finish:                          /* Substring not found */
1673     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1674         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1675   fail:
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1677                           PL_colors[4], PL_colors[5]));
1678     return NULL;
1679 }
1680
1681
1682 #define DECL_TRIE_TYPE(scan) \
1683     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1684                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1685                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1686                     trie_type = ((scan->flags == EXACT)                             \
1687                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1688                                  : (scan->flags == EXACTL)                          \
1689                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1690                                     : (scan->flags == EXACTFAA)                     \
1691                                       ? (utf8_target                                \
1692                                          ? trie_utf8_exactfa_fold                   \
1693                                          : trie_latin_utf8_exactfa_fold)            \
1694                                       : (scan->flags == EXACTFLU8                   \
1695                                          ? (utf8_target                             \
1696                                            ? trie_flu8                              \
1697                                            : trie_flu8_latin)                       \
1698                                          : (utf8_target                             \
1699                                            ? trie_utf8_fold                         \
1700                                            : trie_latin_utf8_fold)))
1701
1702 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1703  * 'foldbuf+sizeof(foldbuf)' */
1704 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1705 STMT_START {                                                                        \
1706     STRLEN skiplen;                                                                 \
1707     U8 flags = FOLD_FLAGS_FULL;                                                     \
1708     switch (trie_type) {                                                            \
1709     case trie_flu8:                                                                 \
1710         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1711         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1712             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1713         }                                                                           \
1714         goto do_trie_utf8_fold;                                                     \
1715     case trie_utf8_exactfa_fold:                                                    \
1716         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1717         /* FALLTHROUGH */                                                           \
1718     case trie_utf8_fold:                                                            \
1719       do_trie_utf8_fold:                                                            \
1720         if ( foldlen>0 ) {                                                          \
1721             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1722             foldlen -= len;                                                         \
1723             uscan += len;                                                           \
1724             len=0;                                                                  \
1725         } else {                                                                    \
1726             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1727                                                                             flags); \
1728             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1729             skiplen = UVCHR_SKIP( uvc );                                            \
1730             foldlen -= skiplen;                                                     \
1731             uscan = foldbuf + skiplen;                                              \
1732         }                                                                           \
1733         break;                                                                      \
1734     case trie_flu8_latin:                                                           \
1735         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1736         goto do_trie_latin_utf8_fold;                                               \
1737     case trie_latin_utf8_exactfa_fold:                                              \
1738         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1739         /* FALLTHROUGH */                                                           \
1740     case trie_latin_utf8_fold:                                                      \
1741       do_trie_latin_utf8_fold:                                                      \
1742         if ( foldlen>0 ) {                                                          \
1743             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1744             foldlen -= len;                                                         \
1745             uscan += len;                                                           \
1746             len=0;                                                                  \
1747         } else {                                                                    \
1748             len = 1;                                                                \
1749             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1750             skiplen = UVCHR_SKIP( uvc );                                            \
1751             foldlen -= skiplen;                                                     \
1752             uscan = foldbuf + skiplen;                                              \
1753         }                                                                           \
1754         break;                                                                      \
1755     case trie_utf8l:                                                                \
1756         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1757         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1758             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1759         }                                                                           \
1760         /* FALLTHROUGH */                                                           \
1761     case trie_utf8:                                                                 \
1762         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1763         break;                                                                      \
1764     case trie_plain:                                                                \
1765         uvc = (UV)*uc;                                                              \
1766         len = 1;                                                                    \
1767     }                                                                               \
1768     if (uvc < 256) {                                                                \
1769         charid = trie->charmap[ uvc ];                                              \
1770     }                                                                               \
1771     else {                                                                          \
1772         charid = 0;                                                                 \
1773         if (widecharmap) {                                                          \
1774             SV** const svpp = hv_fetch(widecharmap,                                 \
1775                         (char*)&uvc, sizeof(UV), 0);                                \
1776             if (svpp)                                                               \
1777                 charid = (U16)SvIV(*svpp);                                          \
1778         }                                                                           \
1779     }                                                                               \
1780 } STMT_END
1781
1782 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1783     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1784                 startpos, doutf8, depth)
1785
1786 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1787     STMT_START {                                            \
1788         while (s < strend) {                                \
1789             CODE                                            \
1790             s += ((UTF8)                                    \
1791                   ? UTF8_SAFE_SKIP(s, reginfo->strend)      \
1792                   : 1);                                     \
1793         }                                                   \
1794     } STMT_END
1795
1796 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1797     STMT_START {                                            \
1798         while (s < strend) {                                \
1799             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1800         }                                                   \
1801     } STMT_END
1802
1803 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1804     if (COND) {                                                \
1805         FBC_CHECK_AND_TRY                                      \
1806         s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
1807         previous_occurrence_end = s;                           \
1808     }                                                          \
1809     else {                                                     \
1810         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1811     }
1812
1813 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1814     if (utf8_target) {                                         \
1815         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1816     }                                                          \
1817     else {                                                     \
1818         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1819     }
1820
1821 /* We keep track of where the next character should start after an occurrence
1822  * of the one we're looking for.  Knowing that, we can see right away if the
1823  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1824  * don't accept the 2nd and succeeding adjacent occurrences */
1825 #define FBC_CHECK_AND_TRY                                           \
1826         if (   (   doevery                                          \
1827                 || s != previous_occurrence_end)                    \
1828             && (   reginfo->intuit                                  \
1829                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1830         {                                                           \
1831             goto got_it;                                            \
1832         }
1833
1834
1835 /* This differs from the above macros in that it calls a function which returns
1836  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1837  * there is no such occurrence. */
1838 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1839     while (s < strend) {                                    \
1840         s = (f);                                            \
1841         if (s >= strend) {                                  \
1842             break;                                          \
1843         }                                                   \
1844                                                             \
1845         FBC_CHECK_AND_TRY                                   \
1846         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1847         previous_occurrence_end = s;                        \
1848     }
1849
1850 /* This differs from the above macros in that it is passed a single byte that
1851  * is known to begin the next occurrence of the thing being looked for in 's'.
1852  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1853  * at that position. */
1854 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
1855     while (s < strend) {                                    \
1856         s = (char *) memchr(s, byte, strend -s);            \
1857         if (s == NULL) {                                    \
1858             s = (char *) strend;                            \
1859             break;                                          \
1860         }                                                   \
1861                                                             \
1862         if (COND) {                                         \
1863             FBC_CHECK_AND_TRY                               \
1864             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1865             previous_occurrence_end = s;                    \
1866         }                                                   \
1867         else {                                              \
1868             s += UTF8SKIP(s);                               \
1869         }                                                   \
1870     }
1871
1872 /* The three macros below are slightly different versions of the same logic.
1873  *
1874  * The first is for /a and /aa when the target string is UTF-8.  This can only
1875  * match ascii, but it must advance based on UTF-8.   The other two handle the
1876  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1877  * for the boundary (or non-boundary) between a word and non-word character.
1878  * The utf8 and non-utf8 cases have the same logic, but the details must be
1879  * different.  Find the "wordness" of the character just prior to this one, and
1880  * compare it with the wordness of this one.  If they differ, we have a
1881  * boundary.  At the beginning of the string, pretend that the previous
1882  * character was a new-line.
1883  *
1884  * All these macros uncleanly have side-effects with each other and outside
1885  * variables.  So far it's been too much trouble to clean-up
1886  *
1887  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1888  *               a word character or not.
1889  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1890  *               word/non-word
1891  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1892  *
1893  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1894  * are looking for a boundary or for a non-boundary.  If we are looking for a
1895  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1896  * see if this tentative match actually works, and if so, to quit the loop
1897  * here.  And vice-versa if we are looking for a non-boundary.
1898  *
1899  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1900  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1901  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1902  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1903  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1904  * complement.  But in that branch we complement tmp, meaning that at the
1905  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1906  * which means at the top of the loop in the next iteration, it is
1907  * TEST_NON_UTF8(s-1) */
1908 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1909     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1910     tmp = TEST_NON_UTF8(tmp);                                                  \
1911     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1912         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1913             tmp = !tmp;                                                        \
1914             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1915         }                                                                      \
1916         else {                                                                 \
1917             IF_FAIL;                                                           \
1918         }                                                                      \
1919     );                                                                         \
1920
1921 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1922  * TEST_UTF8 is a macro that for the same input code points returns identically
1923  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1924 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1925     if (s == reginfo->strbeg) {                                                \
1926         tmp = '\n';                                                            \
1927     }                                                                          \
1928     else { /* Back-up to the start of the previous character */                \
1929         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1930         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1931                                                        0, UTF8_ALLOW_DEFAULT); \
1932     }                                                                          \
1933     tmp = TEST_UV(tmp);                                                        \
1934     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1935         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1936             tmp = !tmp;                                                        \
1937             IF_SUCCESS;                                                        \
1938         }                                                                      \
1939         else {                                                                 \
1940             IF_FAIL;                                                           \
1941         }                                                                      \
1942     );
1943
1944 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1945  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1946  * macros below */
1947 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1948     if (utf8_target) {                                                         \
1949         UTF8_CODE                                                              \
1950     }                                                                          \
1951     else {  /* Not utf8 */                                                     \
1952         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1953         tmp = TEST_NON_UTF8(tmp);                                              \
1954         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
1955             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1956                 IF_SUCCESS;                                                    \
1957                 tmp = !tmp;                                                    \
1958             }                                                                  \
1959             else {                                                             \
1960                 IF_FAIL;                                                       \
1961             }                                                                  \
1962         );                                                                     \
1963     }                                                                          \
1964     /* Here, things have been set up by the previous code so that tmp is the   \
1965      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1966      * utf8ness of the target).  We also have to check if this matches against \
1967      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1968      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1969      * string */                                                               \
1970     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1971         IF_SUCCESS;                                                            \
1972     }                                                                          \
1973     else {                                                                     \
1974         IF_FAIL;                                                               \
1975     }
1976
1977 /* This is the macro to use when we want to see if something that looks like it
1978  * could match, actually does, and if so exits the loop.  It needs to be used
1979  * only for bounds checking macros, as it allows for matching beyond the end of
1980  * string (which should be zero length without having to look at the string
1981  * contents) */
1982 #define REXEC_FBC_TRYIT                                                     \
1983     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
1984         goto got_it
1985
1986 /* The only difference between the BOUND and NBOUND cases is that
1987  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1988  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1989  * with the other one being empty (PLACEHOLDER is defined as empty).
1990  *
1991  * The TEST_FOO parameters are for operating on different forms of input, but
1992  * all should be ones that return identically for the same underlying code
1993  * points */
1994 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1995     FBC_BOUND_COMMON(                                                          \
1996           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1997           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1998
1999 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2000     FBC_BOUND_COMMON(                                                          \
2001             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2002             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2003
2004 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2005     FBC_BOUND_COMMON(                                                          \
2006           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2007           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2008
2009 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2010     FBC_BOUND_COMMON(                                                          \
2011             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2012             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2013
2014 #ifdef DEBUGGING
2015 static IV
2016 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2017   IV cp_out = _invlist_search(invlist, cp_in);
2018   assert(cp_out >= 0);
2019   return cp_out;
2020 }
2021 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2022         invmap[S_get_break_val_cp_checked(invlist, cp)]
2023 #else
2024 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2025         invmap[_invlist_search(invlist, cp)]
2026 #endif
2027
2028 /* Takes a pointer to an inversion list, a pointer to its corresponding
2029  * inversion map, and a code point, and returns the code point's value
2030  * according to the two arrays.  It assumes that all code points have a value.
2031  * This is used as the base macro for macros for particular properties */
2032 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2033         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2034
2035 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2036  * of a code point, returning the value for the first code point in the string.
2037  * And it takes the particular macro name that finds the desired value given a
2038  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2039 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2040              (__ASSERT_(pos < strend)                                          \
2041                  /* Note assumes is valid UTF-8 */                             \
2042              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2043
2044 /* Returns the GCB value for the input code point */
2045 #define getGCB_VAL_CP(cp)                                                      \
2046           _generic_GET_BREAK_VAL_CP(                                           \
2047                                     PL_GCB_invlist,                            \
2048                                     _Perl_GCB_invmap,                          \
2049                                     (cp))
2050
2051 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2052  * bounded by pos and strend */
2053 #define getGCB_VAL_UTF8(pos, strend)                                           \
2054     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2055
2056 /* Returns the LB value for the input code point */
2057 #define getLB_VAL_CP(cp)                                                       \
2058           _generic_GET_BREAK_VAL_CP(                                           \
2059                                     PL_LB_invlist,                             \
2060                                     _Perl_LB_invmap,                           \
2061                                     (cp))
2062
2063 /* Returns the LB value for the first code point in the UTF-8 encoded string
2064  * bounded by pos and strend */
2065 #define getLB_VAL_UTF8(pos, strend)                                            \
2066     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2067
2068
2069 /* Returns the SB value for the input code point */
2070 #define getSB_VAL_CP(cp)                                                       \
2071           _generic_GET_BREAK_VAL_CP(                                           \
2072                                     PL_SB_invlist,                             \
2073                                     _Perl_SB_invmap,                     \
2074                                     (cp))
2075
2076 /* Returns the SB value for the first code point in the UTF-8 encoded string
2077  * bounded by pos and strend */
2078 #define getSB_VAL_UTF8(pos, strend)                                            \
2079     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2080
2081 /* Returns the WB value for the input code point */
2082 #define getWB_VAL_CP(cp)                                                       \
2083           _generic_GET_BREAK_VAL_CP(                                           \
2084                                     PL_WB_invlist,                             \
2085                                     _Perl_WB_invmap,                         \
2086                                     (cp))
2087
2088 /* Returns the WB value for the first code point in the UTF-8 encoded string
2089  * bounded by pos and strend */
2090 #define getWB_VAL_UTF8(pos, strend)                                            \
2091     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2092
2093 /* We know what class REx starts with.  Try to find this position... */
2094 /* if reginfo->intuit, its a dryrun */
2095 /* annoyingly all the vars in this routine have different names from their counterparts
2096    in regmatch. /grrr */
2097 STATIC char *
2098 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2099     const char *strend, regmatch_info *reginfo)
2100 {
2101     dVAR;
2102
2103     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2104     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2105
2106     char *pat_string;   /* The pattern's exactish string */
2107     char *pat_end;          /* ptr to end char of pat_string */
2108     re_fold_t folder;   /* Function for computing non-utf8 folds */
2109     const U8 *fold_array;   /* array for folding ords < 256 */
2110     STRLEN ln;
2111     STRLEN lnc;
2112     U8 c1;
2113     U8 c2;
2114     char *e = NULL;
2115
2116     /* In some cases we accept only the first occurence of 'x' in a sequence of
2117      * them.  This variable points to just beyond the end of the previous
2118      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2119      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2120      * hop back.) */
2121     char * previous_occurrence_end = 0;
2122
2123     I32 tmp;            /* Scratch variable */
2124     const bool utf8_target = reginfo->is_utf8_target;
2125     UV utf8_fold_flags = 0;
2126     const bool is_utf8_pat = reginfo->is_utf8_pat;
2127     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2128                                    with a result inverts that result, as 0^1 =
2129                                    1 and 1^1 = 0 */
2130     _char_class_number classnum;
2131
2132     RXi_GET_DECL(prog,progi);
2133
2134     PERL_ARGS_ASSERT_FIND_BYCLASS;
2135
2136     /* We know what class it must start with. */
2137     switch (OP(c)) {
2138     case ANYOFPOSIXL:
2139     case ANYOFL:
2140         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2141
2142         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2143             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2144         }
2145
2146         /* FALLTHROUGH */
2147     case ANYOFD:
2148     case ANYOF:
2149         if (utf8_target) {
2150             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2151                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2152         }
2153         else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2154             /* We know that s is in the bitmap range since the target isn't
2155              * UTF-8, so what happens for out-of-range values is not relevant,
2156              * so exclude that from the flags */
2157             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2158         }
2159         else {
2160             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2161         }
2162         break;
2163
2164     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2165         /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
2166          * use 0 */
2167         REXEC_FBC_FIND_NEXT_SCAN(0,
2168          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2169                                    (U8) ARG(c), FLAGS(c)));
2170         break;
2171
2172     case NANYOFM:   /* UTF-8ness does matter because can match UTF-8 variants.
2173                      */
2174         REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
2175          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2176                                    (U8) ARG(c), FLAGS(c)));
2177         break;
2178
2179     case ANYOFH:
2180         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2181             U8 first_byte = FLAGS(c);
2182
2183             if (first_byte) {   /* We know what the first byte of any matched
2184                                    string should be */
2185                 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2186                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2187             }
2188             else {
2189                 REXEC_FBC_CLASS_SCAN(TRUE,
2190                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2191             }
2192         }
2193         break;
2194
2195     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2196         assert(! is_utf8_pat);
2197         /* FALLTHROUGH */
2198     case EXACTFAA:
2199         if (is_utf8_pat) {
2200             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2201                              |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
2202             goto do_exactf_utf8;
2203         }
2204         else if (utf8_target) {
2205
2206             /* Here, and elsewhere in this file, the reason we can't consider a
2207              * non-UTF-8 pattern already folded in the presence of a UTF-8
2208              * target is because any MICRO SIGN in the pattern won't be folded.
2209              * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
2210              * can consider a non-UTF-8 pattern folded when matching a
2211              * non-UTF-8 target */
2212             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2213             goto do_exactf_utf8;
2214         }
2215
2216         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2217          * which these functions don't handle anyway */
2218         fold_array = PL_fold_latin1;
2219         folder = foldEQ_latin1_s2_folded;
2220         goto do_exactf_non_utf8;
2221
2222     case EXACTF:   /* This node only generated for non-utf8 patterns */
2223         assert(! is_utf8_pat);
2224         if (utf8_target) {
2225             goto do_exactf_utf8;
2226         }
2227         fold_array = PL_fold;
2228         folder = foldEQ;
2229         goto do_exactf_non_utf8;
2230
2231     case EXACTFL:
2232         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2233         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2234             utf8_fold_flags = FOLDEQ_LOCALE;
2235             goto do_exactf_utf8;
2236         }
2237         fold_array = PL_fold_locale;
2238         folder = foldEQ_locale;
2239         goto do_exactf_non_utf8;
2240
2241     case EXACTFUP:      /* Problematic even though pattern isn't UTF-8.  Use
2242                            full functionality normally not done except for
2243                            UTF-8 */
2244         assert(! is_utf8_pat);
2245         goto do_exactf_utf8;
2246
2247     case EXACTFLU8:
2248             if (! utf8_target) {    /* All code points in this node require
2249                                        UTF-8 to express.  */
2250                 break;
2251             }
2252             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2253                                              | FOLDEQ_S2_FOLDS_SANE;
2254             goto do_exactf_utf8;
2255
2256     case EXACTFU_ONLY8:
2257         if (! utf8_target) {
2258             break;
2259         }
2260         assert(is_utf8_pat);
2261         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2262         goto do_exactf_utf8;
2263
2264     case EXACTFU:
2265         if (is_utf8_pat || utf8_target) {
2266             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2267             goto do_exactf_utf8;
2268         }
2269
2270         /* Any 'ss' in the pattern should have been replaced by regcomp,
2271          * so we don't have to worry here about this single special case
2272          * in the Latin1 range */
2273         fold_array = PL_fold_latin1;
2274         folder = foldEQ_latin1_s2_folded;
2275
2276         /* FALLTHROUGH */
2277
2278       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2279                            are no glitches with fold-length differences
2280                            between the target string and pattern */
2281
2282         /* The idea in the non-utf8 EXACTF* cases is to first find the
2283          * first character of the EXACTF* node and then, if necessary,
2284          * case-insensitively compare the full text of the node.  c1 is the
2285          * first character.  c2 is its fold.  This logic will not work for
2286          * Unicode semantics and the german sharp ss, which hence should
2287          * not be compiled into a node that gets here. */
2288         pat_string = STRING(c);
2289         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2290
2291         /* We know that we have to match at least 'ln' bytes (which is the
2292          * same as characters, since not utf8).  If we have to match 3
2293          * characters, and there are only 2 availabe, we know without
2294          * trying that it will fail; so don't start a match past the
2295          * required minimum number from the far end */
2296         e = HOP3c(strend, -((SSize_t)ln), s);
2297         if (e < s)
2298             break;
2299
2300         c1 = *pat_string;
2301         c2 = fold_array[c1];
2302         if (c1 == c2) { /* If char and fold are the same */
2303             while (s <= e) {
2304                 s = (char *) memchr(s, c1, e + 1 - s);
2305                 if (s == NULL) {
2306                     break;
2307                 }
2308
2309                 /* Check that the rest of the node matches */
2310                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2311                     && (reginfo->intuit || regtry(reginfo, &s)) )
2312                 {
2313                     goto got_it;
2314                 }
2315                 s++;
2316             }
2317         }
2318         else {
2319             U8 bits_differing = c1 ^ c2;
2320
2321             /* If the folds differ in one bit position only, we can mask to
2322              * match either of them, and can use this faster find method.  Both
2323              * ASCII and EBCDIC tend to have their case folds differ in only
2324              * one position, so this is very likely */
2325             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2326                 bits_differing = ~ bits_differing;
2327                 while (s <= e) {
2328                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2329                                         (c1 & bits_differing), bits_differing);
2330                     if (s > e) {
2331                         break;
2332                     }
2333
2334                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2335                         && (reginfo->intuit || regtry(reginfo, &s)) )
2336                     {
2337                         goto got_it;
2338                     }
2339                     s++;
2340                 }
2341             }
2342             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2343                        should actually happen only in EXACTFL nodes */
2344                 while (s <= e) {
2345                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2346                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2347                         && (reginfo->intuit || regtry(reginfo, &s)) )
2348                     {
2349                         goto got_it;
2350                     }
2351                     s++;
2352                 }
2353             }
2354         }
2355         break;
2356
2357       do_exactf_utf8:
2358       {
2359         unsigned expansion;
2360
2361         /* If one of the operands is in utf8, we can't use the simpler folding
2362          * above, due to the fact that many different characters can have the
2363          * same fold, or portion of a fold, or different- length fold */
2364         pat_string = STRING(c);
2365         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2366         pat_end = pat_string + ln;
2367         lnc = is_utf8_pat       /* length to match in characters */
2368                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2369                 : ln;
2370
2371         /* We have 'lnc' characters to match in the pattern, but because of
2372          * multi-character folding, each character in the target can match
2373          * up to 3 characters (Unicode guarantees it will never exceed
2374          * this) if it is utf8-encoded; and up to 2 if not (based on the
2375          * fact that the Latin 1 folds are already determined, and the
2376          * only multi-char fold in that range is the sharp-s folding to
2377          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2378          * string character.  Adjust lnc accordingly, rounding up, so that
2379          * if we need to match at least 4+1/3 chars, that really is 5. */
2380         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2381         lnc = (lnc + expansion - 1) / expansion;
2382
2383         /* As in the non-UTF8 case, if we have to match 3 characters, and
2384          * only 2 are left, it's guaranteed to fail, so don't start a
2385          * match that would require us to go beyond the end of the string
2386          */
2387         e = HOP3c(strend, -((SSize_t)lnc), s);
2388
2389         /* XXX Note that we could recalculate e to stop the loop earlier,
2390          * as the worst case expansion above will rarely be met, and as we
2391          * go along we would usually find that e moves further to the left.
2392          * This would happen only after we reached the point in the loop
2393          * where if there were no expansion we should fail.  Unclear if
2394          * worth the expense */
2395
2396         while (s <= e) {
2397             char *my_strend= (char *)strend;
2398             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2399                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2400                 && (reginfo->intuit || regtry(reginfo, &s)) )
2401             {
2402                 goto got_it;
2403             }
2404             s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2405         }
2406         break;
2407     }
2408
2409     case BOUNDL:
2410         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2411         if (FLAGS(c) != TRADITIONAL_BOUND) {
2412             if (! IN_UTF8_CTYPE_LOCALE) {
2413                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2414                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2415             }
2416             goto do_boundu;
2417         }
2418
2419         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2420         break;
2421
2422     case NBOUNDL:
2423         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2424         if (FLAGS(c) != TRADITIONAL_BOUND) {
2425             if (! IN_UTF8_CTYPE_LOCALE) {
2426                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2427                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2428             }
2429             goto do_nboundu;
2430         }
2431
2432         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2433         break;
2434
2435     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2436                    meaning */
2437         assert(FLAGS(c) == TRADITIONAL_BOUND);
2438
2439         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2440         break;
2441
2442     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2443                    meaning */
2444         assert(FLAGS(c) == TRADITIONAL_BOUND);
2445
2446         FBC_BOUND_A(isWORDCHAR_A);
2447         break;
2448
2449     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2450                    meaning */
2451         assert(FLAGS(c) == TRADITIONAL_BOUND);
2452
2453         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2454         break;
2455
2456     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2457                    meaning */
2458         assert(FLAGS(c) == TRADITIONAL_BOUND);
2459
2460         FBC_NBOUND_A(isWORDCHAR_A);
2461         break;
2462
2463     case NBOUNDU:
2464         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2465             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2466             break;
2467         }
2468
2469       do_nboundu:
2470
2471         to_complement = 1;
2472         /* FALLTHROUGH */
2473
2474     case BOUNDU:
2475       do_boundu:
2476         switch((bound_type) FLAGS(c)) {
2477             case TRADITIONAL_BOUND:
2478                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2479                 break;
2480             case GCB_BOUND:
2481                 if (s == reginfo->strbeg) {
2482                     if (reginfo->intuit || regtry(reginfo, &s))
2483                     {
2484                         goto got_it;
2485                     }
2486
2487                     /* Didn't match.  Try at the next position (if there is one) */
2488                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2489                     if (UNLIKELY(s >= reginfo->strend)) {
2490                         break;
2491                     }
2492                 }
2493
2494                 if (utf8_target) {
2495                     GCB_enum before = getGCB_VAL_UTF8(
2496                                                reghop3((U8*)s, -1,
2497                                                        (U8*)(reginfo->strbeg)),
2498                                                (U8*) reginfo->strend);
2499                     while (s < strend) {
2500                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2501                                                         (U8*) reginfo->strend);
2502                         if (   (to_complement ^ isGCB(before,
2503                                                       after,
2504                                                       (U8*) reginfo->strbeg,
2505                                                       (U8*) s,
2506                                                       utf8_target))
2507                             && (reginfo->intuit || regtry(reginfo, &s)))
2508                         {
2509                             goto got_it;
2510                         }
2511                         before = after;
2512                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2513                     }
2514                 }
2515                 else {  /* Not utf8.  Everything is a GCB except between CR and
2516                            LF */
2517                     while (s < strend) {
2518                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2519                                               || UCHARAT(s) != '\n'))
2520                             && (reginfo->intuit || regtry(reginfo, &s)))
2521                         {
2522                             goto got_it;
2523                         }
2524                         s++;
2525                     }
2526                 }
2527
2528                 /* And, since this is a bound, it can match after the final
2529                  * character in the string */
2530                 if (   reginfo->intuit
2531                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2532                 {
2533                     goto got_it;
2534                 }
2535                 break;
2536
2537             case LB_BOUND:
2538                 if (s == reginfo->strbeg) {
2539                     if (reginfo->intuit || regtry(reginfo, &s)) {
2540                         goto got_it;
2541                     }
2542                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2543                     if (UNLIKELY(s >= reginfo->strend)) {
2544                         break;
2545                     }
2546                 }
2547
2548                 if (utf8_target) {
2549                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2550                                                                -1,
2551                                                                (U8*)(reginfo->strbeg)),
2552                                                        (U8*) reginfo->strend);
2553                     while (s < strend) {
2554                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2555                         if (to_complement ^ isLB(before,
2556                                                  after,
2557                                                  (U8*) reginfo->strbeg,
2558                                                  (U8*) s,
2559                                                  (U8*) reginfo->strend,
2560                                                  utf8_target)
2561                             && (reginfo->intuit || regtry(reginfo, &s)))
2562                         {
2563                             goto got_it;
2564                         }
2565                         before = after;
2566                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2567                     }
2568                 }
2569                 else {  /* Not utf8. */
2570                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2571                     while (s < strend) {
2572                         LB_enum after = getLB_VAL_CP((U8) *s);
2573                         if (to_complement ^ isLB(before,
2574                                                  after,
2575                                                  (U8*) reginfo->strbeg,
2576                                                  (U8*) s,
2577                                                  (U8*) reginfo->strend,
2578                                                  utf8_target)
2579                             && (reginfo->intuit || regtry(reginfo, &s)))
2580                         {
2581                             goto got_it;
2582                         }
2583                         before = after;
2584                         s++;
2585                     }
2586                 }
2587
2588                 if (   reginfo->intuit
2589                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2590                 {
2591                     goto got_it;
2592                 }
2593
2594                 break;
2595
2596             case SB_BOUND:
2597                 if (s == reginfo->strbeg) {
2598                     if (reginfo->intuit || regtry(reginfo, &s)) {
2599                         goto got_it;
2600                     }
2601                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2602                     if (UNLIKELY(s >= reginfo->strend)) {
2603                         break;
2604                     }
2605                 }
2606
2607                 if (utf8_target) {
2608                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2609                                                         -1,
2610                                                         (U8*)(reginfo->strbeg)),
2611                                                       (U8*) reginfo->strend);
2612                     while (s < strend) {
2613                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2614                                                          (U8*) reginfo->strend);
2615                         if ((to_complement ^ isSB(before,
2616                                                   after,
2617                                                   (U8*) reginfo->strbeg,
2618                                                   (U8*) s,
2619                                                   (U8*) reginfo->strend,
2620                                                   utf8_target))
2621                             && (reginfo->intuit || regtry(reginfo, &s)))
2622                         {
2623                             goto got_it;
2624                         }
2625                         before = after;
2626                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2627                     }
2628                 }
2629                 else {  /* Not utf8. */
2630                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2631                     while (s < strend) {
2632                         SB_enum after = getSB_VAL_CP((U8) *s);
2633                         if ((to_complement ^ isSB(before,
2634                                                   after,
2635                                                   (U8*) reginfo->strbeg,
2636                                                   (U8*) s,
2637                                                   (U8*) reginfo->strend,
2638                                                   utf8_target))
2639                             && (reginfo->intuit || regtry(reginfo, &s)))
2640                         {
2641                             goto got_it;
2642                         }
2643                         before = after;
2644                         s++;
2645                     }
2646                 }
2647
2648                 /* Here are at the final position in the target string.  The SB
2649                  * value is always true here, so matches, depending on other
2650                  * constraints */
2651                 if (   reginfo->intuit
2652                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2653                 {
2654                     goto got_it;
2655                 }
2656
2657                 break;
2658
2659             case WB_BOUND:
2660                 if (s == reginfo->strbeg) {
2661                     if (reginfo->intuit || regtry(reginfo, &s)) {
2662                         goto got_it;
2663                     }
2664                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2665                     if (UNLIKELY(s >= reginfo->strend)) {
2666                         break;
2667                     }
2668                 }
2669
2670                 if (utf8_target) {
2671                     /* We are at a boundary between char_sub_0 and char_sub_1.
2672                      * We also keep track of the value for char_sub_-1 as we
2673                      * loop through the line.   Context may be needed to make a
2674                      * determination, and if so, this can save having to
2675                      * recalculate it */
2676                     WB_enum previous = WB_UNKNOWN;
2677                     WB_enum before = getWB_VAL_UTF8(
2678                                               reghop3((U8*)s,
2679                                                       -1,
2680                                                       (U8*)(reginfo->strbeg)),
2681                                               (U8*) reginfo->strend);
2682                     while (s < strend) {
2683                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2684                                                         (U8*) reginfo->strend);
2685                         if ((to_complement ^ isWB(previous,
2686                                                   before,
2687                                                   after,
2688                                                   (U8*) reginfo->strbeg,
2689                                                   (U8*) s,
2690                                                   (U8*) reginfo->strend,
2691                                                   utf8_target))
2692                             && (reginfo->intuit || regtry(reginfo, &s)))
2693                         {
2694                             goto got_it;
2695                         }
2696                         previous = before;
2697                         before = after;
2698                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2699                     }
2700                 }
2701                 else {  /* Not utf8. */
2702                     WB_enum previous = WB_UNKNOWN;
2703                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2704                     while (s < strend) {
2705                         WB_enum after = getWB_VAL_CP((U8) *s);
2706                         if ((to_complement ^ isWB(previous,
2707                                                   before,
2708                                                   after,
2709                                                   (U8*) reginfo->strbeg,
2710                                                   (U8*) s,
2711                                                   (U8*) reginfo->strend,
2712                                                   utf8_target))
2713                             && (reginfo->intuit || regtry(reginfo, &s)))
2714                         {
2715                             goto got_it;
2716                         }
2717                         previous = before;
2718                         before = after;
2719                         s++;
2720                     }
2721                 }
2722
2723                 if (   reginfo->intuit
2724                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2725                 {
2726                     goto got_it;
2727                 }
2728         }
2729         break;
2730
2731     case LNBREAK:
2732         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2733                         is_LNBREAK_latin1_safe(s, strend)
2734         );
2735         break;
2736
2737     /* The argument to all the POSIX node types is the class number to pass to
2738      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2739
2740     case NPOSIXL:
2741         to_complement = 1;
2742         /* FALLTHROUGH */
2743
2744     case POSIXL:
2745         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2746         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2747                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2748         break;
2749
2750     case NPOSIXD:
2751         to_complement = 1;
2752         /* FALLTHROUGH */
2753
2754     case POSIXD:
2755         if (utf8_target) {
2756             goto posix_utf8;
2757         }
2758         goto posixa;
2759
2760     case NPOSIXA:
2761         if (utf8_target) {
2762             /* The complement of something that matches only ASCII matches all
2763              * non-ASCII, plus everything in ASCII that isn't in the class. */
2764             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2765                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2766             break;
2767         }
2768
2769         to_complement = 1;
2770         goto posixa;
2771
2772     case POSIXA:
2773         /* Don't need to worry about utf8, as it can match only a single
2774          * byte invariant character.  But we do anyway for performance reasons,
2775          * as otherwise we would have to examine all the continuation
2776          * characters */
2777         if (utf8_target) {
2778             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2779             break;
2780         }
2781
2782       posixa:
2783         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2784                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2785         break;
2786
2787     case NPOSIXU:
2788         to_complement = 1;
2789         /* FALLTHROUGH */
2790
2791     case POSIXU:
2792         if (! utf8_target) {
2793             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2794                                  to_complement ^ cBOOL(_generic_isCC(*s,
2795                                                                     FLAGS(c))));
2796         }
2797         else {
2798
2799           posix_utf8:
2800             classnum = (_char_class_number) FLAGS(c);
2801             switch (classnum) {
2802                 default:
2803                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2804                         to_complement ^ cBOOL(_invlist_contains_cp(
2805                                               PL_XPosix_ptrs[classnum],
2806                                               utf8_to_uvchr_buf((U8 *) s,
2807                                                                 (U8 *) strend,
2808                                                                 NULL))));
2809                     break;
2810                 case _CC_ENUM_SPACE:
2811                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2812                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2813                     break;
2814
2815                 case _CC_ENUM_BLANK:
2816                     REXEC_FBC_CLASS_SCAN(1,
2817                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2818                     break;
2819
2820                 case _CC_ENUM_XDIGIT:
2821                     REXEC_FBC_CLASS_SCAN(1,
2822                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2823                     break;
2824
2825                 case _CC_ENUM_VERTSPACE:
2826                     REXEC_FBC_CLASS_SCAN(1,
2827                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2828                     break;
2829
2830                 case _CC_ENUM_CNTRL:
2831                     REXEC_FBC_CLASS_SCAN(1,
2832                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2833                     break;
2834             }
2835         }
2836         break;
2837
2838     case AHOCORASICKC:
2839     case AHOCORASICK:
2840         {
2841             DECL_TRIE_TYPE(c);
2842             /* what trie are we using right now */
2843             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2844             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2845             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2846
2847             const char *last_start = strend - trie->minlen;
2848 #ifdef DEBUGGING
2849             const char *real_start = s;
2850 #endif
2851             STRLEN maxlen = trie->maxlen;
2852             SV *sv_points;
2853             U8 **points; /* map of where we were in the input string
2854                             when reading a given char. For ASCII this
2855                             is unnecessary overhead as the relationship
2856                             is always 1:1, but for Unicode, especially
2857                             case folded Unicode this is not true. */
2858             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2859             U8 *bitmap=NULL;
2860
2861
2862             GET_RE_DEBUG_FLAGS_DECL;
2863
2864             /* We can't just allocate points here. We need to wrap it in
2865              * an SV so it gets freed properly if there is a croak while
2866              * running the match */
2867             ENTER;
2868             SAVETMPS;
2869             sv_points=newSV(maxlen * sizeof(U8 *));
2870             SvCUR_set(sv_points,
2871                 maxlen * sizeof(U8 *));
2872             SvPOK_on(sv_points);
2873             sv_2mortal(sv_points);
2874             points=(U8**)SvPV_nolen(sv_points );
2875             if ( trie_type != trie_utf8_fold
2876                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2877             {
2878                 if (trie->bitmap)
2879                     bitmap=(U8*)trie->bitmap;
2880                 else
2881                     bitmap=(U8*)ANYOF_BITMAP(c);
2882             }
2883             /* this is the Aho-Corasick algorithm modified a touch
2884                to include special handling for long "unknown char" sequences.
2885                The basic idea being that we use AC as long as we are dealing
2886                with a possible matching char, when we encounter an unknown char
2887                (and we have not encountered an accepting state) we scan forward
2888                until we find a legal starting char.
2889                AC matching is basically that of trie matching, except that when
2890                we encounter a failing transition, we fall back to the current
2891                states "fail state", and try the current char again, a process
2892                we repeat until we reach the root state, state 1, or a legal
2893                transition. If we fail on the root state then we can either
2894                terminate if we have reached an accepting state previously, or
2895                restart the entire process from the beginning if we have not.
2896
2897              */
2898             while (s <= last_start) {
2899                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2900                 U8 *uc = (U8*)s;
2901                 U16 charid = 0;
2902                 U32 base = 1;
2903                 U32 state = 1;
2904                 UV uvc = 0;
2905                 STRLEN len = 0;
2906                 STRLEN foldlen = 0;
2907                 U8 *uscan = (U8*)NULL;
2908                 U8 *leftmost = NULL;
2909 #ifdef DEBUGGING
2910                 U32 accepted_word= 0;
2911 #endif
2912                 U32 pointpos = 0;
2913
2914                 while ( state && uc <= (U8*)strend ) {
2915                     int failed=0;
2916                     U32 word = aho->states[ state ].wordnum;
2917
2918                     if( state==1 ) {
2919                         if ( bitmap ) {
2920                             DEBUG_TRIE_EXECUTE_r(
2921                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2922                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2923                                         (char *)uc, utf8_target, 0 );
2924                                     Perl_re_printf( aTHX_
2925                                         " Scanning for legal start char...\n");
2926                                 }
2927                             );
2928                             if (utf8_target) {
2929                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2930                                     uc += UTF8SKIP(uc);
2931                                 }
2932                             } else {
2933                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2934                                     uc++;
2935                                 }
2936                             }
2937                             s= (char *)uc;
2938                         }
2939                         if (uc >(U8*)last_start) break;
2940                     }
2941
2942                     if ( word ) {
2943                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2944                         if (!leftmost || lpos < leftmost) {
2945                             DEBUG_r(accepted_word=word);
2946                             leftmost= lpos;
2947                         }
2948                         if (base==0) break;
2949
2950                     }
2951                     points[pointpos++ % maxlen]= uc;
2952                     if (foldlen || uc < (U8*)strend) {
2953                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2954                                              (U8 *) strend, uscan, len, uvc,
2955                                              charid, foldlen, foldbuf,
2956                                              uniflags);
2957                         DEBUG_TRIE_EXECUTE_r({
2958                             dump_exec_pos( (char *)uc, c, strend,
2959                                         real_start, s, utf8_target, 0);
2960                             Perl_re_printf( aTHX_
2961                                 " Charid:%3u CP:%4" UVxf " ",
2962                                  charid, uvc);
2963                         });
2964                     }
2965                     else {
2966                         len = 0;
2967                         charid = 0;
2968                     }
2969
2970
2971                     do {
2972 #ifdef DEBUGGING
2973                         word = aho->states[ state ].wordnum;
2974 #endif
2975                         base = aho->states[ state ].trans.base;
2976
2977                         DEBUG_TRIE_EXECUTE_r({
2978                             if (failed)
2979                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2980                                     s,   utf8_target, 0 );
2981                             Perl_re_printf( aTHX_
2982                                 "%sState: %4" UVxf ", word=%" UVxf,
2983                                 failed ? " Fail transition to " : "",
2984                                 (UV)state, (UV)word);
2985                         });
2986                         if ( base ) {
2987                             U32 tmp;
2988                             I32 offset;
2989                             if (charid &&
2990                                  ( ((offset = base + charid
2991                                     - 1 - trie->uniquecharcount)) >= 0)
2992                                  && ((U32)offset < trie->lasttrans)
2993                                  && trie->trans[offset].check == state
2994                                  && (tmp=trie->trans[offset].next))
2995                             {
2996                                 DEBUG_TRIE_EXECUTE_r(
2997                                     Perl_re_printf( aTHX_ " - legal\n"));
2998                                 state = tmp;
2999                                 break;
3000                             }
3001                             else {
3002                                 DEBUG_TRIE_EXECUTE_r(
3003                                     Perl_re_printf( aTHX_ " - fail\n"));
3004                                 failed = 1;
3005                                 state = aho->fail[state];
3006                             }
3007                         }
3008                         else {
3009                             /* we must be accepting here */
3010                             DEBUG_TRIE_EXECUTE_r(
3011                                     Perl_re_printf( aTHX_ " - accepting\n"));
3012                             failed = 1;
3013                             break;
3014                         }
3015                     } while(state);
3016                     uc += len;
3017                     if (failed) {
3018                         if (leftmost)
3019                             break;
3020                         if (!state) state = 1;
3021                     }
3022                 }
3023                 if ( aho->states[ state ].wordnum ) {
3024                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3025                     if (!leftmost || lpos < leftmost) {
3026                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3027                         leftmost = lpos;
3028                     }
3029                 }
3030                 if (leftmost) {
3031                     s = (char*)leftmost;
3032                     DEBUG_TRIE_EXECUTE_r({
3033                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3034                             (UV)accepted_word, (IV)(s - real_start)
3035                         );
3036                     });
3037                     if (reginfo->intuit || regtry(reginfo, &s)) {
3038                         FREETMPS;
3039                         LEAVE;
3040                         goto got_it;
3041                     }
3042                     if (s < reginfo->strend) {
3043                         s = HOPc(s,1);
3044                     }
3045                     DEBUG_TRIE_EXECUTE_r({
3046                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3047                     });
3048                 } else {
3049                     DEBUG_TRIE_EXECUTE_r(
3050                         Perl_re_printf( aTHX_ "No match.\n"));
3051                     break;
3052                 }
3053             }
3054             FREETMPS;
3055             LEAVE;
3056         }
3057         break;
3058     default:
3059         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3060     }
3061     return 0;
3062   got_it:
3063     return s;
3064 }
3065
3066 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3067  * flags have same meanings as with regexec_flags() */
3068
3069 static void
3070 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3071                             char *strbeg,
3072                             char *strend,
3073                             SV *sv,
3074                             U32 flags,
3075                             bool utf8_target)
3076 {
3077     struct regexp *const prog = ReANY(rx);
3078
3079     if (flags & REXEC_COPY_STR) {
3080 #ifdef PERL_ANY_COW
3081         if (SvCANCOW(sv)) {
3082             DEBUG_C(Perl_re_printf( aTHX_
3083                               "Copy on write: regexp capture, type %d\n",
3084                                     (int) SvTYPE(sv)));
3085             /* Create a new COW SV to share the match string and store
3086              * in saved_copy, unless the current COW SV in saved_copy
3087              * is valid and suitable for our purpose */
3088             if ((   prog->saved_copy
3089                  && SvIsCOW(prog->saved_copy)
3090                  && SvPOKp(prog->saved_copy)
3091                  && SvIsCOW(sv)
3092                  && SvPOKp(sv)
3093                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3094             {
3095                 /* just reuse saved_copy SV */
3096                 if (RXp_MATCH_COPIED(prog)) {
3097                     Safefree(prog->subbeg);
3098                     RXp_MATCH_COPIED_off(prog);
3099                 }
3100             }
3101             else {
3102                 /* create new COW SV to share string */
3103                 RXp_MATCH_COPY_FREE(prog);
3104                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3105             }
3106             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3107             assert (SvPOKp(prog->saved_copy));
3108             prog->sublen  = strend - strbeg;
3109             prog->suboffset = 0;
3110             prog->subcoffset = 0;
3111         } else
3112 #endif
3113         {
3114             SSize_t min = 0;
3115             SSize_t max = strend - strbeg;
3116             SSize_t sublen;
3117
3118             if (    (flags & REXEC_COPY_SKIP_POST)
3119                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3120                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3121             ) { /* don't copy $' part of string */
3122                 U32 n = 0;
3123                 max = -1;
3124                 /* calculate the right-most part of the string covered
3125                  * by a capture. Due to lookahead, this may be to
3126                  * the right of $&, so we have to scan all captures */
3127                 while (n <= prog->lastparen) {
3128                     if (prog->offs[n].end > max)
3129                         max = prog->offs[n].end;
3130                     n++;
3131                 }
3132                 if (max == -1)
3133                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3134                             ? prog->offs[0].start
3135                             : 0;
3136                 assert(max >= 0 && max <= strend - strbeg);
3137             }
3138
3139             if (    (flags & REXEC_COPY_SKIP_PRE)
3140                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3141                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3142             ) { /* don't copy $` part of string */
3143                 U32 n = 0;
3144                 min = max;
3145                 /* calculate the left-most part of the string covered
3146                  * by a capture. Due to lookbehind, this may be to
3147                  * the left of $&, so we have to scan all captures */
3148                 while (min && n <= prog->lastparen) {
3149                     if (   prog->offs[n].start != -1
3150                         && prog->offs[n].start < min)
3151                     {
3152                         min = prog->offs[n].start;
3153                     }
3154                     n++;
3155                 }
3156                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3157                     && min >  prog->offs[0].end
3158                 )
3159                     min = prog->offs[0].end;
3160
3161             }
3162
3163             assert(min >= 0 && min <= max && min <= strend - strbeg);
3164             sublen = max - min;
3165
3166             if (RXp_MATCH_COPIED(prog)) {
3167                 if (sublen > prog->sublen)
3168                     prog->subbeg =
3169                             (char*)saferealloc(prog->subbeg, sublen+1);
3170             }
3171             else
3172                 prog->subbeg = (char*)safemalloc(sublen+1);
3173             Copy(strbeg + min, prog->subbeg, sublen, char);
3174             prog->subbeg[sublen] = '\0';
3175             prog->suboffset = min;
3176             prog->sublen = sublen;
3177             RXp_MATCH_COPIED_on(prog);
3178         }
3179         prog->subcoffset = prog->suboffset;
3180         if (prog->suboffset && utf8_target) {
3181             /* Convert byte offset to chars.
3182              * XXX ideally should only compute this if @-/@+
3183              * has been seen, a la PL_sawampersand ??? */
3184
3185             /* If there's a direct correspondence between the
3186              * string which we're matching and the original SV,
3187              * then we can use the utf8 len cache associated with
3188              * the SV. In particular, it means that under //g,
3189              * sv_pos_b2u() will use the previously cached
3190              * position to speed up working out the new length of
3191              * subcoffset, rather than counting from the start of
3192              * the string each time. This stops
3193              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3194              * from going quadratic */
3195             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3196                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3197                                                 SV_GMAGIC|SV_CONST_RETURN);
3198             else
3199                 prog->subcoffset = utf8_length((U8*)strbeg,
3200                                     (U8*)(strbeg+prog->suboffset));
3201         }
3202     }
3203     else {
3204         RXp_MATCH_COPY_FREE(prog);
3205         prog->subbeg = strbeg;
3206         prog->suboffset = 0;
3207         prog->subcoffset = 0;
3208         prog->sublen = strend - strbeg;
3209     }
3210 }
3211
3212
3213
3214
3215 /*
3216  - regexec_flags - match a regexp against a string
3217  */
3218 I32
3219 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3220               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3221 /* stringarg: the point in the string at which to begin matching */
3222 /* strend:    pointer to null at end of string */
3223 /* strbeg:    real beginning of string */
3224 /* minend:    end of match must be >= minend bytes after stringarg. */
3225 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3226  *            itself is accessed via the pointers above */
3227 /* data:      May be used for some additional optimizations.
3228               Currently unused. */
3229 /* flags:     For optimizations. See REXEC_* in regexp.h */
3230
3231 {
3232     struct regexp *const prog = ReANY(rx);
3233     char *s;
3234     regnode *c;
3235     char *startpos;
3236     SSize_t minlen;             /* must match at least this many chars */
3237     SSize_t dontbother = 0;     /* how many characters not to try at end */
3238     const bool utf8_target = cBOOL(DO_UTF8(sv));
3239     I32 multiline;
3240     RXi_GET_DECL(prog,progi);
3241     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3242     regmatch_info *const reginfo = &reginfo_buf;
3243     regexp_paren_pair *swap = NULL;
3244     I32 oldsave;
3245     GET_RE_DEBUG_FLAGS_DECL;
3246
3247     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3248     PERL_UNUSED_ARG(data);
3249
3250     /* Be paranoid... */
3251     if (prog == NULL) {
3252         Perl_croak(aTHX_ "NULL regexp parameter");
3253     }
3254
3255     DEBUG_EXECUTE_r(
3256         debug_start_match(rx, utf8_target, stringarg, strend,
3257         "Matching");
3258     );
3259
3260     startpos = stringarg;
3261
3262     /* set these early as they may be used by the HOP macros below */
3263     reginfo->strbeg = strbeg;
3264     reginfo->strend = strend;
3265     reginfo->is_utf8_target = cBOOL(utf8_target);
3266
3267     if (prog->intflags & PREGf_GPOS_SEEN) {
3268         MAGIC *mg;
3269
3270         /* set reginfo->ganch, the position where \G can match */
3271
3272         reginfo->ganch =
3273             (flags & REXEC_IGNOREPOS)
3274             ? stringarg /* use start pos rather than pos() */
3275             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3276               /* Defined pos(): */
3277             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3278             : strbeg; /* pos() not defined; use start of string */
3279
3280         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3281             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3282
3283         /* in the presence of \G, we may need to start looking earlier in
3284          * the string than the suggested start point of stringarg:
3285          * if prog->gofs is set, then that's a known, fixed minimum
3286          * offset, such as
3287          * /..\G/:   gofs = 2
3288          * /ab|c\G/: gofs = 1
3289          * or if the minimum offset isn't known, then we have to go back
3290          * to the start of the string, e.g. /w+\G/
3291          */
3292
3293         if (prog->intflags & PREGf_ANCH_GPOS) {
3294             if (prog->gofs) {
3295                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3296                 if (!startpos ||
3297                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3298                 {
3299                     DEBUG_r(Perl_re_printf( aTHX_
3300                             "fail: ganch-gofs before earliest possible start\n"));
3301                     return 0;
3302                 }
3303             }
3304             else
3305                 startpos = reginfo->ganch;
3306         }
3307         else if (prog->gofs) {
3308             startpos = HOPBACKc(startpos, prog->gofs);
3309             if (!startpos)
3310                 startpos = strbeg;
3311         }
3312         else if (prog->intflags & PREGf_GPOS_FLOAT)
3313             startpos = strbeg;
3314     }
3315
3316     minlen = prog->minlen;
3317     if ((startpos + minlen) > strend || startpos < strbeg) {
3318         DEBUG_r(Perl_re_printf( aTHX_
3319                     "Regex match can't succeed, so not even tried\n"));
3320         return 0;
3321     }
3322
3323     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3324      * which will call destuctors to reset PL_regmatch_state, free higher
3325      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3326      * regmatch_info_aux_eval */
3327
3328     oldsave = PL_savestack_ix;
3329
3330     s = startpos;
3331
3332     if ((prog->extflags & RXf_USE_INTUIT)
3333         && !(flags & REXEC_CHECKED))
3334     {
3335         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3336                                     flags, NULL);
3337         if (!s)
3338             return 0;
3339
3340         if (prog->extflags & RXf_CHECK_ALL) {
3341             /* we can match based purely on the result of INTUIT.
3342              * Set up captures etc just for $& and $-[0]
3343              * (an intuit-only match wont have $1,$2,..) */
3344             assert(!prog->nparens);
3345
3346             /* s/// doesn't like it if $& is earlier than where we asked it to
3347              * start searching (which can happen on something like /.\G/) */
3348             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3349                     && (s < stringarg))
3350             {
3351                 /* this should only be possible under \G */
3352                 assert(prog->intflags & PREGf_GPOS_SEEN);
3353                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3354                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3355                 goto phooey;
3356             }
3357
3358             /* match via INTUIT shouldn't have any captures.
3359              * Let @-, @+, $^N know */
3360             prog->lastparen = prog->lastcloseparen = 0;
3361             RXp_MATCH_UTF8_set(prog, utf8_target);
3362             prog->offs[0].start = s - strbeg;
3363             prog->offs[0].end = utf8_target
3364                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3365                 : s - strbeg + prog->minlenret;
3366             if ( !(flags & REXEC_NOT_FIRST) )
3367                 S_reg_set_capture_string(aTHX_ rx,
3368                                         strbeg, strend,
3369                                         sv, flags, utf8_target);
3370
3371             return 1;
3372         }
3373     }
3374
3375     multiline = prog->extflags & RXf_PMf_MULTILINE;
3376
3377     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3378         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3379                               "String too short [regexec_flags]...\n"));
3380         goto phooey;
3381     }
3382
3383     /* Check validity of program. */
3384     if (UCHARAT(progi->program) != REG_MAGIC) {
3385         Perl_croak(aTHX_ "corrupted regexp program");
3386     }
3387
3388     RXp_MATCH_TAINTED_off(prog);
3389     RXp_MATCH_UTF8_set(prog, utf8_target);
3390
3391     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3392     reginfo->intuit = 0;
3393     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3394     reginfo->warned = FALSE;
3395     reginfo->sv = sv;
3396     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3397     /* see how far we have to get to not match where we matched before */
3398     reginfo->till = stringarg + minend;
3399
3400     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3401         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3402            S_cleanup_regmatch_info_aux has executed (registered by
3403            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3404            magic belonging to this SV.
3405            Not newSVsv, either, as it does not COW.
3406         */
3407         reginfo->sv = newSV(0);
3408         SvSetSV_nosteal(reginfo->sv, sv);
3409         SAVEFREESV(reginfo->sv);
3410     }
3411
3412     /* reserve next 2 or 3 slots in PL_regmatch_state:
3413      * slot N+0: may currently be in use: skip it
3414      * slot N+1: use for regmatch_info_aux struct
3415      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3416      * slot N+3: ready for use by regmatch()
3417      */
3418
3419     {
3420         regmatch_state *old_regmatch_state;
3421         regmatch_slab  *old_regmatch_slab;
3422         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3423
3424         /* on first ever match, allocate first slab */
3425         if (!PL_regmatch_slab) {
3426             Newx(PL_regmatch_slab, 1, regmatch_slab);
3427             PL_regmatch_slab->prev = NULL;
3428             PL_regmatch_slab->next = NULL;
3429             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3430         }
3431
3432         old_regmatch_state = PL_regmatch_state;
3433         old_regmatch_slab  = PL_regmatch_slab;
3434
3435         for (i=0; i <= max; i++) {
3436             if (i == 1)
3437                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3438             else if (i ==2)
3439                 reginfo->info_aux_eval =
3440                 reginfo->info_aux->info_aux_eval =
3441                             &(PL_regmatch_state->u.info_aux_eval);
3442
3443             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3444                 PL_regmatch_state = S_push_slab(aTHX);
3445         }
3446
3447         /* note initial PL_regmatch_state position; at end of match we'll
3448          * pop back to there and free any higher slabs */
3449
3450         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3451         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3452         reginfo->info_aux->poscache = NULL;
3453
3454         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3455
3456         if ((prog->extflags & RXf_EVAL_SEEN))
3457             S_setup_eval_state(aTHX_ reginfo);
3458         else
3459             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3460     }
3461
3462     /* If there is a "must appear" string, look for it. */
3463
3464     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3465         /* We have to be careful. If the previous successful match
3466            was from this regex we don't want a subsequent partially
3467            successful match to clobber the old results.
3468            So when we detect this possibility we add a swap buffer
3469            to the re, and switch the buffer each match. If we fail,
3470            we switch it back; otherwise we leave it swapped.
3471         */
3472         swap = prog->offs;
3473         /* avoid leak if we die, or clean up anyway if match completes */
3474         SAVEFREEPV(swap);
3475         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3476         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3477             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3478             0,
3479             PTR2UV(prog),
3480             PTR2UV(swap),
3481             PTR2UV(prog->offs)
3482         ));
3483     }
3484
3485     if (prog->recurse_locinput)
3486         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3487
3488     /* Simplest case: anchored match need be tried only once, or with
3489      * MBOL, only at the beginning of each line.
3490      *
3491      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3492      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3493      * match at the start of the string then it won't match anywhere else
3494      * either; while with /.*.../, if it doesn't match at the beginning,
3495      * the earliest it could match is at the start of the next line */
3496
3497     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3498         char *end;
3499
3500         if (regtry(reginfo, &s))
3501             goto got_it;
3502
3503         if (!(prog->intflags & PREGf_ANCH_MBOL))
3504             goto phooey;
3505
3506         /* didn't match at start, try at other newline positions */
3507
3508         if (minlen)
3509             dontbother = minlen - 1;
3510         end = HOP3c(strend, -dontbother, strbeg) - 1;
3511
3512         /* skip to next newline */
3513
3514         while (s <= end) { /* note it could be possible to match at the end of the string */
3515             /* NB: newlines are the same in unicode as they are in latin */
3516             if (*s++ != '\n')
3517                 continue;
3518             if (prog->check_substr || prog->check_utf8) {
3519             /* note that with PREGf_IMPLICIT, intuit can only fail
3520              * or return the start position, so it's of limited utility.
3521              * Nevertheless, I made the decision that the potential for
3522              * quick fail was still worth it - DAPM */
3523                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3524                 if (!s)
3525                     goto phooey;
3526             }
3527             if (regtry(reginfo, &s))
3528                 goto got_it;
3529         }
3530         goto phooey;
3531     } /* end anchored search */
3532
3533     if (prog->intflags & PREGf_ANCH_GPOS)
3534     {
3535         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3536         assert(prog->intflags & PREGf_GPOS_SEEN);
3537         /* For anchored \G, the only position it can match from is
3538          * (ganch-gofs); we already set startpos to this above; if intuit
3539          * moved us on from there, we can't possibly succeed */
3540         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3541         if (s == startpos && regtry(reginfo, &s))
3542             goto got_it;
3543         goto phooey;
3544     }
3545
3546     /* Messy cases:  unanchored match. */
3547     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3548         /* we have /x+whatever/ */
3549         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3550         char ch;
3551 #ifdef DEBUGGING
3552         int did_match = 0;
3553 #endif
3554         if (utf8_target) {
3555             if (! prog->anchored_utf8) {
3556                 to_utf8_substr(prog);
3557             }
3558             ch = SvPVX_const(prog->anchored_utf8)[0];
3559             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3560                 if (*s == ch) {
3561                     DEBUG_EXECUTE_r( did_match = 1 );
3562                     if (regtry(reginfo, &s)) goto got_it;
3563                     s += UTF8_SAFE_SKIP(s, strend);
3564                     while (s < strend && *s == ch)
3565                         s += UTF8SKIP(s);
3566                 }
3567             );
3568
3569         }
3570         else {
3571             if (! prog->anchored_substr) {
3572                 if (! to_byte_substr(prog)) {
3573                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3574                 }
3575             }
3576             ch = SvPVX_const(prog->anchored_substr)[0];
3577             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3578                 if (*s == ch) {
3579                     DEBUG_EXECUTE_r( did_match = 1 );
3580                     if (regtry(reginfo, &s)) goto got_it;
3581                     s++;
3582                     while (s < strend && *s == ch)
3583                         s++;
3584                 }
3585             );
3586         }
3587         DEBUG_EXECUTE_r(if (!did_match)
3588                 Perl_re_printf( aTHX_
3589                                   "Did not find anchored character...\n")
3590                );
3591     }
3592     else if (prog->anchored_substr != NULL
3593               || prog->anchored_utf8 != NULL
3594               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3595                   && prog->float_max_offset < strend - s)) {
3596         SV *must;
3597         SSize_t back_max;
3598         SSize_t back_min;
3599         char *last;
3600         char *last1;            /* Last position checked before */
3601 #ifdef DEBUGGING
3602         int did_match = 0;
3603 #endif
3604         if (prog->anchored_substr || prog->anchored_utf8) {
3605             if (utf8_target) {
3606                 if (! prog->anchored_utf8) {
3607                     to_utf8_substr(prog);
3608                 }
3609                 must = prog->anchored_utf8;
3610             }
3611             else {
3612                 if (! prog->anchored_substr) {
3613                     if (! to_byte_substr(prog)) {
3614                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3615                     }
3616                 }
3617                 must = prog->anchored_substr;
3618             }
3619             back_max = back_min = prog->anchored_offset;
3620         } else {
3621             if (utf8_target) {
3622                 if (! prog->float_utf8) {
3623                     to_utf8_substr(prog);
3624                 }
3625                 must = prog->float_utf8;
3626             }
3627             else {
3628                 if (! prog->float_substr) {
3629                     if (! to_byte_substr(prog)) {
3630                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3631                     }
3632                 }
3633                 must = prog->float_substr;
3634             }
3635             back_max = prog->float_max_offset;
3636             back_min = prog->float_min_offset;
3637         }
3638
3639         if (back_min<0) {
3640             last = strend;
3641         } else {
3642             last = HOP3c(strend,        /* Cannot start after this */
3643                   -(SSize_t)(CHR_SVLEN(must)
3644                          - (SvTAIL(must) != 0) + back_min), strbeg);
3645         }
3646         if (s > reginfo->strbeg)
3647             last1 = HOPc(s, -1);
3648         else
3649             last1 = s - 1;      /* bogus */
3650
3651         /* XXXX check_substr already used to find "s", can optimize if
3652            check_substr==must. */
3653         dontbother = 0;
3654         strend = HOPc(strend, -dontbother);
3655         while ( (s <= last) &&
3656                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3657                                   (unsigned char*)strend, must,
3658                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3659             DEBUG_EXECUTE_r( did_match = 1 );
3660             if (HOPc(s, -back_max) > last1) {
3661                 last1 = HOPc(s, -back_min);
3662                 s = HOPc(s, -back_max);
3663             }
3664             else {
3665                 char * const t = (last1 >= reginfo->strbeg)
3666                                     ? HOPc(last1, 1) : last1 + 1;
3667
3668                 last1 = HOPc(s, -back_min);
3669                 s = t;
3670             }
3671             if (utf8_target) {
3672                 while (s <= last1) {
3673                     if (regtry(reginfo, &s))
3674                         goto got_it;
3675                     if (s >= last1) {
3676                         s++; /* to break out of outer loop */
3677                         break;
3678                     }
3679                     s += UTF8SKIP(s);
3680                 }
3681             }
3682             else {
3683                 while (s <= last1) {
3684                     if (regtry(reginfo, &s))
3685                         goto got_it;
3686                     s++;
3687                 }
3688             }
3689         }
3690         DEBUG_EXECUTE_r(if (!did_match) {
3691             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3692                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3693             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3694                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3695                                ? "anchored" : "floating"),
3696                 quoted, RE_SV_TAIL(must));
3697         });
3698         goto phooey;
3699     }
3700     else if ( (c = progi->regstclass) ) {
3701         if (minlen) {
3702             const OPCODE op = OP(progi->regstclass);
3703             /* don't bother with what can't match */
3704             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3705                 strend = HOPc(strend, -(minlen - 1));
3706         }
3707         DEBUG_EXECUTE_r({
3708             SV * const prop = sv_newmortal();
3709             regprop(prog, prop, c, reginfo, NULL);
3710             {
3711                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3712                     s,strend-s,PL_dump_re_max_len);
3713                 Perl_re_printf( aTHX_
3714                     "Matching stclass %.*s against %s (%d bytes)\n",
3715                     (int)SvCUR(prop), SvPVX_const(prop),
3716                      quoted, (int)(strend - s));
3717             }
3718         });
3719         if (find_byclass(prog, c, s, strend, reginfo))
3720             goto got_it;
3721         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3722     }
3723     else {
3724         dontbother = 0;
3725         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3726             /* Trim the end. */
3727             char *last= NULL;
3728             SV* float_real;
3729             STRLEN len;
3730             const char *little;
3731
3732             if (utf8_target) {
3733                 if (! prog->float_utf8) {
3734                     to_utf8_substr(prog);
3735                 }
3736                 float_real = prog->float_utf8;
3737             }
3738             else {
3739                 if (! prog->float_substr) {
3740                     if (! to_byte_substr(prog)) {
3741                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3742                     }
3743                 }
3744                 float_real = prog->float_substr;
3745             }
3746
3747             little = SvPV_const(float_real, len);
3748             if (SvTAIL(float_real)) {
3749                     /* This means that float_real contains an artificial \n on
3750                      * the end due to the presence of something like this:
3751                      * /foo$/ where we can match both "foo" and "foo\n" at the
3752                      * end of the string.  So we have to compare the end of the
3753                      * string first against the float_real without the \n and
3754                      * then against the full float_real with the string.  We
3755                      * have to watch out for cases where the string might be
3756                      * smaller than the float_real or the float_real without
3757                      * the \n. */
3758                     char *checkpos= strend - len;
3759                     DEBUG_OPTIMISE_r(
3760                         Perl_re_printf( aTHX_
3761                             "%sChecking for float_real.%s\n",
3762                             PL_colors[4], PL_colors[5]));
3763                     if (checkpos + 1 < strbeg) {
3764                         /* can't match, even if we remove the trailing \n
3765                          * string is too short to match */
3766                         DEBUG_EXECUTE_r(
3767                             Perl_re_printf( aTHX_
3768                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3769                                 PL_colors[4], PL_colors[5]));
3770                         goto phooey;
3771                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3772                         /* can match, the end of the string matches without the
3773                          * "\n" */
3774                         last = checkpos + 1;
3775                     } else if (checkpos < strbeg) {
3776                         /* cant match, string is too short when the "\n" is
3777                          * included */
3778                         DEBUG_EXECUTE_r(
3779                             Perl_re_printf( aTHX_
3780                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3781                                 PL_colors[4], PL_colors[5]));
3782                         goto phooey;
3783                     } else if (!multiline) {
3784                         /* non multiline match, so compare with the "\n" at the
3785                          * end of the string */
3786                         if (memEQ(checkpos, little, len)) {
3787                             last= checkpos;
3788                         } else {
3789                             DEBUG_EXECUTE_r(
3790                                 Perl_re_printf( aTHX_
3791                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3792                                     PL_colors[4], PL_colors[5]));
3793                             goto phooey;
3794                         }
3795                     } else {
3796                         /* multiline match, so we have to search for a place
3797                          * where the full string is located */
3798                         goto find_last;
3799                     }
3800             } else {
3801                   find_last:
3802                     if (len)
3803                         last = rninstr(s, strend, little, little + len);
3804                     else
3805                         last = strend;  /* matching "$" */
3806             }
3807             if (!last) {
3808                 /* at one point this block contained a comment which was
3809                  * probably incorrect, which said that this was a "should not
3810                  * happen" case.  Even if it was true when it was written I am
3811                  * pretty sure it is not anymore, so I have removed the comment
3812                  * and replaced it with this one. Yves */
3813                 DEBUG_EXECUTE_r(
3814                     Perl_re_printf( aTHX_
3815                         "%sString does not contain required substring, cannot match.%s\n",
3816                         PL_colors[4], PL_colors[5]
3817                     ));
3818                 goto phooey;
3819             }
3820             dontbother = strend - last + prog->float_min_offset;
3821         }
3822         if (minlen && (dontbother < minlen))
3823             dontbother = minlen - 1;
3824         strend -= dontbother;              /* this one's always in bytes! */
3825         /* We don't know much -- general case. */
3826         if (utf8_target) {
3827             for (;;) {
3828                 if (regtry(reginfo, &s))
3829                     goto got_it;
3830                 if (s >= strend)
3831                     break;
3832                 s += UTF8SKIP(s);
3833             };
3834         }
3835         else {
3836             do {
3837                 if (regtry(reginfo, &s))
3838                     goto got_it;
3839             } while (s++ < strend);
3840         }
3841     }
3842
3843     /* Failure. */
3844     goto phooey;
3845
3846   got_it:
3847     /* s/// doesn't like it if $& is earlier than where we asked it to
3848      * start searching (which can happen on something like /.\G/) */
3849     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3850             && (prog->offs[0].start < stringarg - strbeg))
3851     {
3852         /* this should only be possible under \G */
3853         assert(prog->intflags & PREGf_GPOS_SEEN);
3854         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3855             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3856         goto phooey;
3857     }
3858
3859     /* clean up; this will trigger destructors that will free all slabs
3860      * above the current one, and cleanup the regmatch_info_aux
3861      * and regmatch_info_aux_eval sructs */
3862
3863     LEAVE_SCOPE(oldsave);
3864
3865     if (RXp_PAREN_NAMES(prog))
3866         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3867
3868     /* make sure $`, $&, $', and $digit will work later */
3869     if ( !(flags & REXEC_NOT_FIRST) )
3870         S_reg_set_capture_string(aTHX_ rx,
3871                                     strbeg, reginfo->strend,
3872                                     sv, flags, utf8_target);
3873
3874     return 1;
3875
3876   phooey:
3877     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3878                           PL_colors[4], PL_colors[5]));
3879
3880     if (swap) {
3881         /* we failed :-( roll it back.
3882          * Since the swap buffer will be freed on scope exit which follows
3883          * shortly, restore the old captures by copying 'swap's original
3884          * data to the new offs buffer
3885          */
3886         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3887             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
3888             0,
3889             PTR2UV(prog),
3890             PTR2UV(prog->offs),
3891             PTR2UV(swap)
3892         ));
3893
3894         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
3895     }
3896
3897     /* clean up; this will trigger destructors that will free all slabs
3898      * above the current one, and cleanup the regmatch_info_aux
3899      * and regmatch_info_aux_eval sructs */
3900
3901     LEAVE_SCOPE(oldsave);
3902
3903     return 0;
3904 }
3905
3906
3907 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3908  * Do inc before dec, in case old and new rex are the same */
3909 #define SET_reg_curpm(Re2)                          \
3910     if (reginfo->info_aux_eval) {                   \
3911         (void)ReREFCNT_inc(Re2);                    \
3912         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3913         PM_SETRE((PL_reg_curpm), (Re2));            \
3914     }
3915
3916
3917 /*
3918  - regtry - try match at specific point
3919  */
3920 STATIC bool                     /* 0 failure, 1 success */
3921 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3922 {
3923     CHECKPOINT lastcp;
3924     REGEXP *const rx = reginfo->prog;
3925     regexp *const prog = ReANY(rx);
3926     SSize_t result;
3927 #ifdef DEBUGGING
3928     U32 depth = 0; /* used by REGCP_SET */
3929 #endif
3930     RXi_GET_DECL(prog,progi);
3931     GET_RE_DEBUG_FLAGS_DECL;
3932
3933     PERL_ARGS_ASSERT_REGTRY;
3934
3935     reginfo->cutpoint=NULL;
3936
3937     prog->offs[0].start = *startposp - reginfo->strbeg;
3938     prog->lastparen = 0;
3939     prog->lastcloseparen = 0;
3940
3941     /* XXXX What this code is doing here?!!!  There should be no need
3942        to do this again and again, prog->lastparen should take care of
3943        this!  --ilya*/
3944
3945     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3946      * Actually, the code in regcppop() (which Ilya may be meaning by
3947      * prog->lastparen), is not needed at all by the test suite
3948      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3949      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3950      * Meanwhile, this code *is* needed for the
3951      * above-mentioned test suite tests to succeed.  The common theme
3952      * on those tests seems to be returning null fields from matches.
3953      * --jhi updated by dapm */
3954
3955     /* After encountering a variant of the issue mentioned above I think
3956      * the point Ilya was making is that if we properly unwind whenever
3957      * we set lastparen to a smaller value then we should not need to do
3958      * this every time, only when needed. So if we have tests that fail if
3959      * we remove this, then it suggests somewhere else we are improperly
3960      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3961      * places it is called, and related regcp() routines. - Yves */
3962 #if 1
3963     if (prog->nparens) {
3964         regexp_paren_pair *pp = prog->offs;
3965         I32 i;
3966         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3967             ++pp;
3968             pp->start = -1;
3969             pp->end = -1;
3970         }
3971     }
3972 #endif
3973     REGCP_SET(lastcp);
3974     result = regmatch(reginfo, *startposp, progi->program + 1);
3975     if (result != -1) {
3976         prog->offs[0].end = result;
3977         return 1;
3978     }
3979     if (reginfo->cutpoint)
3980         *startposp= reginfo->cutpoint;
3981     REGCP_UNWIND(lastcp);
3982     return 0;
3983 }
3984
3985 /* this is used to determine how far from the left messages like
3986    'failed...' are printed in regexec.c. It should be set such that
3987    messages are inline with the regop output that created them.
3988 */
3989 #define REPORT_CODE_OFF 29
3990 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3991 #ifdef DEBUGGING
3992 int
3993 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3994 {
3995     va_list ap;
3996     int result;
3997     PerlIO *f= Perl_debug_log;
3998     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3999     va_start(ap, depth);
4000     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4001     result = PerlIO_vprintf(f, fmt, ap);
4002     va_end(ap);
4003     return result;
4004 }
4005 #endif /* DEBUGGING */
4006
4007 /* grab a new slab and return the first slot in it */
4008
4009 STATIC regmatch_state *
4010 S_push_slab(pTHX)
4011 {
4012     regmatch_slab *s = PL_regmatch_slab->next;
4013     if (!s) {
4014         Newx(s, 1, regmatch_slab);
4015         s->prev = PL_regmatch_slab;
4016         s->next = NULL;
4017         PL_regmatch_slab->next = s;
4018     }
4019     PL_regmatch_slab = s;
4020     return SLAB_FIRST(s);
4021 }
4022
4023 #ifdef DEBUGGING
4024
4025 STATIC void
4026 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4027     const char *start, const char *end, const char *blurb)
4028 {
4029     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4030
4031     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4032
4033     if (!PL_colorset)
4034             reginitcolors();
4035     {
4036         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4037             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4038
4039         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4040             start, end - start, PL_dump_re_max_len);
4041
4042         Perl_re_printf( aTHX_
4043             "%s%s REx%s %s against %s\n",
4044                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4045
4046         if (utf8_target||utf8_pat)
4047             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4048                 utf8_pat ? "pattern" : "",
4049                 utf8_pat && utf8_target ? " and " : "",
4050                 utf8_target ? "string" : ""
4051             );
4052     }
4053 }
4054
4055 STATIC void
4056 S_dump_exec_pos(pTHX_ const char *locinput,
4057                       const regnode *scan,
4058                       const char *loc_regeol,
4059                       const char *loc_bostr,
4060                       const char *loc_reg_starttry,
4061                       const bool utf8_target,
4062                       const U32 depth
4063                 )
4064 {
4065     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4066     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4067     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4068     /* The part of the string before starttry has one color
4069        (pref0_len chars), between starttry and current
4070        position another one (pref_len - pref0_len chars),
4071        after the current position the third one.
4072        We assume that pref0_len <= pref_len, otherwise we
4073        decrease pref0_len.  */
4074     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4075         ? (5 + taill) - l : locinput - loc_bostr;
4076     int pref0_len;
4077
4078     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4079
4080     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4081         pref_len++;
4082     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4083     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4084         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4085               ? (5 + taill) - pref_len : loc_regeol - locinput);
4086     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4087         l--;
4088     if (pref0_len < 0)
4089         pref0_len = 0;
4090     if (pref0_len > pref_len)
4091         pref0_len = pref_len;
4092     {
4093         const int is_uni = utf8_target ? 1 : 0;
4094
4095         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4096             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4097
4098         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4099                     (locinput - pref_len + pref0_len),
4100                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4101
4102         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4103                     locinput, loc_regeol - locinput, 10, 0, 1);
4104
4105         const STRLEN tlen=len0+len1+len2;
4106         Perl_re_printf( aTHX_
4107                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4108                     (IV)(locinput - loc_bostr),
4109                     len0, s0,
4110                     len1, s1,
4111                     (docolor ? "" : "> <"),
4112                     len2, s2,
4113                     (int)(tlen > 19 ? 0 :  19 - tlen),
4114                     "",
4115                     depth);
4116     }
4117 }
4118
4119 #endif
4120
4121 /* reg_check_named_buff_matched()
4122  * Checks to see if a named buffer has matched. The data array of
4123  * buffer numbers corresponding to the buffer is expected to reside
4124  * in the regexp->data->data array in the slot stored in the ARG() of
4125  * node involved. Note that this routine doesn't actually care about the
4126  * name, that information is not preserved from compilation to execution.
4127  * Returns the index of the leftmost defined buffer with the given name
4128  * or 0 if non of the buffers matched.
4129  */
4130 STATIC I32
4131 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4132 {
4133     I32 n;
4134     RXi_GET_DECL(rex,rexi);
4135     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4136     I32 *nums=(I32*)SvPVX(sv_dat);
4137
4138     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4139
4140     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4141         if ((I32)rex->lastparen >= nums[n] &&
4142             rex->offs[nums[n]].end != -1)
4143         {
4144             return nums[n];
4145         }
4146     }
4147     return 0;
4148 }
4149
4150 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4151 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4152 #define CHRTEST_NOT_A_CP_1 -999
4153 #define CHRTEST_NOT_A_CP_2 -998
4154
4155 static bool
4156 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4157         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4158 {
4159     /* This function determines if there are zero, one, two, or more characters
4160      * that match the first character of the passed-in EXACTish node
4161      * <text_node>, and if there are one or two, it returns them in the
4162      * passed-in pointers.
4163      *
4164      * If it determines that no possible character in the target string can
4165      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4166      * the first character in <text_node> requires UTF-8 to represent, and the
4167      * target string isn't in UTF-8.)
4168      *
4169      * If there are more than two characters that could match the beginning of
4170      * <text_node>, or if more context is required to determine a match or not,
4171      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4172      *
4173      * The motiviation behind this function is to allow the caller to set up
4174      * tight loops for matching.  If <text_node> is of type EXACT, there is
4175      * only one possible character that can match its first character, and so
4176      * the situation is quite simple.  But things get much more complicated if
4177      * folding is involved.  It may be that the first character of an EXACTFish
4178      * node doesn't participate in any possible fold, e.g., punctuation, so it
4179      * can be matched only by itself.  The vast majority of characters that are
4180      * in folds match just two things, their lower and upper-case equivalents.
4181      * But not all are like that; some have multiple possible matches, or match
4182      * sequences of more than one character.  This function sorts all that out.
4183      *
4184      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4185      * loop of trying to match A*, we know we can't exit where the thing
4186      * following it isn't a B.  And something can't be a B unless it is the
4187      * beginning of B.  By putting a quick test for that beginning in a tight
4188      * loop, we can rule out things that can't possibly be B without having to
4189      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4190      * character, we can make a tight loop matching A*, using the outputs of
4191      * this function.
4192      *
4193      * If the target string to match isn't in UTF-8, and there aren't
4194      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4195      * the one or two possible octets (which are characters in this situation)
4196      * that can match.  In all cases, if there is only one character that can
4197      * match, *<c1p> and *<c2p> will be identical.
4198      *
4199      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4200      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4201      * can match the beginning of <text_node>.  They should be declared with at
4202      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4203      * undefined what these contain.)  If one or both of the buffers are
4204      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4205      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4206      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4207      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4208      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4209
4210     const bool utf8_target = reginfo->is_utf8_target;
4211
4212     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4213     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4214     bool use_chrtest_void = FALSE;
4215     const bool is_utf8_pat = reginfo->is_utf8_pat;
4216
4217     /* Used when we have both utf8 input and utf8 output, to avoid converting
4218      * to/from code points */
4219     bool utf8_has_been_setup = FALSE;
4220
4221     dVAR;
4222
4223     U8 *pat = (U8*)STRING(text_node);
4224     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4225
4226     if (   OP(text_node) == EXACT
4227         || OP(text_node) == EXACT_ONLY8
4228         || OP(text_node) == EXACTL)
4229     {
4230
4231         /* In an exact node, only one thing can be matched, that first
4232          * character.  If both the pat and the target are UTF-8, we can just
4233          * copy the input to the output, avoiding finding the code point of
4234          * that character */
4235         if (!is_utf8_pat) {
4236             assert(OP(text_node) != EXACT_ONLY8);
4237             c2 = c1 = *pat;
4238         }
4239         else if (utf8_target) {
4240             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4241             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4242             utf8_has_been_setup = TRUE;
4243         }
4244         else if (OP(text_node) == EXACT_ONLY8) {
4245             return FALSE;   /* Can only match UTF-8 target */
4246         }
4247         else {
4248             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4249         }
4250     }
4251     else { /* an EXACTFish node */
4252         U8 *pat_end = pat + STR_LEN(text_node);
4253
4254         /* An EXACTFL node has at least some characters unfolded, because what
4255          * they match is not known until now.  So, now is the time to fold
4256          * the first few of them, as many as are needed to determine 'c1' and
4257          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4258          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4259          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4260          * need to fold as many characters as a single character can fold to,
4261          * so that later we can check if the first ones are such a multi-char
4262          * fold.  But, in such a pattern only locale-problematic characters
4263          * aren't folded, so we can skip this completely if the first character
4264          * in the node isn't one of the tricky ones */
4265         if (OP(text_node) == EXACTFL) {
4266
4267             if (! is_utf8_pat) {
4268                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4269                 {
4270                     folded[0] = folded[1] = 's';
4271                     pat = folded;
4272                     pat_end = folded + 2;
4273                 }
4274             }
4275             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4276                 U8 *s = pat;
4277                 U8 *d = folded;
4278                 int i;
4279
4280                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4281                     if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
4282                         *(d++) = (U8) toFOLD_LC(*s);
4283                         s++;
4284                     }
4285                     else {
4286                         STRLEN len;
4287                         _toFOLD_utf8_flags(s,
4288                                            pat_end,
4289                                            d,
4290                                            &len,
4291                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4292                         d += len;
4293                         s += UTF8SKIP(s);
4294                     }
4295                 }
4296
4297                 pat = folded;
4298                 pat_end = d;
4299             }
4300         }
4301
4302         if (    ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4303              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4304         {
4305             /* Multi-character folds require more context to sort out.  Also
4306              * PL_utf8_foldclosures used below doesn't handle them, so have to
4307              * be handled outside this routine */
4308             use_chrtest_void = TRUE;
4309         }
4310         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4311             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4312
4313             if (   UNLIKELY(PL_in_utf8_turkic_locale)
4314                 && OP(text_node) == EXACTFL
4315                 && UNLIKELY(   c1 == 'i' || c1 == 'I'
4316                             || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
4317                             || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
4318             {   /* Hard-coded Turkish locale rules for these 4 characters
4319                    override normal rules */
4320                 if (c1 == 'i') {
4321                     c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4322                 }
4323                 else if (c1 == 'I') {
4324                     c2 = LATIN_SMALL_LETTER_DOTLESS_I;
4325                 }
4326                 else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
4327                     c2 = 'i';
4328                 }
4329                 else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
4330                     c2 = 'I';
4331                 }
4332             }
4333             else if (c1 > 255) {
4334                 const unsigned int * remaining_folds;
4335                 unsigned int first_fold;
4336
4337                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4338                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4339                 Size_t folds_count = _inverse_folds(c1, &first_fold,
4340                                                        &remaining_folds);
4341                 if (folds_count == 0) {
4342                     c2 = c1;    /* there is only a single character that could
4343                                    match */
4344                 }
4345                 else if (folds_count != 1) {
4346                     /* If there aren't exactly two folds to this (itself and
4347                      * another), it is outside the scope of this function */
4348                     use_chrtest_void = TRUE;
4349                 }
4350                 else {  /* There are two.  We already have one, get the other */
4351                     c2 = first_fold;
4352
4353                     /* Folds that cross the 255/256 boundary are forbidden if
4354                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4355                      * ASCIII.  The only other match to c1 is c2, and since c1
4356                      * is above 255, c2 better be as well under these
4357                      * circumstances.  If it isn't, it means the only legal
4358                      * match of c1 is itself. */
4359                     if (    c2 < 256
4360                         && (   (   OP(text_node) == EXACTFL
4361                                 && ! IN_UTF8_CTYPE_LOCALE)
4362                             || ((     OP(text_node) == EXACTFAA
4363                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4364                                 && (isASCII(c1) || isASCII(c2)))))
4365                     {
4366                         c2 = c1;
4367                     }
4368                 }
4369             }
4370             else /* Here, c1 is <= 255 */
4371                 if (   utf8_target
4372                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4373                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4374                     && (   (   OP(text_node) != EXACTFAA
4375                             && OP(text_node) != EXACTFAA_NO_TRIE)
4376                         ||   ! isASCII(c1)))
4377             {
4378                 /* Here, there could be something above Latin1 in the target
4379                  * which folds to this character in the pattern.  All such
4380                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4381                  * than two characters involved in their folds, so are outside
4382                  * the scope of this function */
4383                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4384                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4385                 }
4386                 else {
4387                     use_chrtest_void = TRUE;
4388                 }
4389             }
4390             else { /* Here nothing above Latin1 can fold to the pattern
4391                       character */
4392                 switch (OP(text_node)) {
4393
4394                     case EXACTFL:   /* /l rules */
4395                         c2 = PL_fold_locale[c1];
4396                         break;
4397
4398                     case EXACTF:   /* This node only generated for non-utf8
4399                                     patterns */
4400                         assert(! is_utf8_pat);
4401                         if (! utf8_target) {    /* /d rules */
4402                             c2 = PL_fold[c1];
4403                             break;
4404                         }
4405                         /* FALLTHROUGH */
4406                         /* /u rules for all these.  This happens to work for
4407                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4408                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4409                                                 non-utf8 patterns */
4410                         assert(! is_utf8_pat);
4411                         /* FALLTHROUGH */
4412                     case EXACTFAA:
4413                     case EXACTFUP:
4414                     case EXACTFU:
4415                         c2 = PL_fold_latin1[c1];
4416                         break;
4417                     case EXACTFU_ONLY8:
4418                         return FALSE;
4419                         NOT_REACHED; /* NOTREACHED */
4420
4421                     default:
4422                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4423                         NOT_REACHED; /* NOTREACHED */
4424                 }
4425             }
4426         }
4427     }
4428
4429     /* Here have figured things out.  Set up the returns */
4430     if (use_chrtest_void) {
4431         *c2p = *c1p = CHRTEST_VOID;
4432     }
4433     else if (utf8_target) {
4434         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4435             uvchr_to_utf8(c1_utf8, c1);
4436             uvchr_to_utf8(c2_utf8, c2);
4437         }
4438
4439         /* Invariants are stored in both the utf8 and byte outputs; Use
4440          * negative numbers otherwise for the byte ones.  Make sure that the
4441          * byte ones are the same iff the utf8 ones are the same */
4442         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4443         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4444                 ? *c2_utf8
4445                 : (c1 == c2)
4446                   ? CHRTEST_NOT_A_CP_1
4447                   : CHRTEST_NOT_A_CP_2;
4448     }
4449     else if (c1 > 255) {
4450        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4451                            can represent */
4452            return FALSE;
4453        }
4454
4455        *c1p = *c2p = c2;    /* c2 is the only representable value */
4456     }
4457     else {  /* c1 is representable; see about c2 */
4458        *c1p = c1;
4459        *c2p = (c2 < 256) ? c2 : c1;
4460     }
4461
4462     return TRUE;
4463 }
4464
4465 STATIC bool
4466 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4467 {
4468     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4469      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4470
4471     PERL_ARGS_ASSERT_ISGCB;
4472
4473     switch (GCB_table[before][after]) {
4474         case GCB_BREAKABLE:
4475             return TRUE;
4476
4477         case GCB_NOBREAK:
4478             return FALSE;
4479
4480         case GCB_RI_then_RI:
4481             {
4482                 int RI_count = 1;
4483                 U8 * temp_pos = (U8 *) curpos;
4484
4485                 /* Do not break within emoji flag sequences. That is, do not
4486                  * break between regional indicator (RI) symbols if there is an
4487                  * odd number of RI characters before the break point.
4488                  *  GB12   sot (RI RI)* RI × RI
4489                  *  GB13 [^RI] (RI RI)* RI × RI */
4490
4491                 while (backup_one_GCB(strbeg,
4492                                     &temp_pos,
4493                                     utf8_target) == GCB_Regional_Indicator)
4494                 {
4495                     RI_count++;
4496                 }
4497
4498                 return RI_count % 2 != 1;
4499             }
4500
4501         case GCB_EX_then_EM:
4502
4503             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4504             {
4505                 U8 * temp_pos = (U8 *) curpos;
4506                 GCB_enum prev;
4507
4508                 do {
4509                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4510                 }
4511                 while (prev == GCB_Extend);
4512
4513                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4514             }
4515
4516         case GCB_Maybe_Emoji_NonBreak:
4517
4518             {
4519
4520             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4521               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4522               */
4523                 U8 * temp_pos = (U8 *) curpos;
4524                 GCB_enum prev;
4525
4526                 do {
4527                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4528                 }
4529                 while (prev == GCB_Extend);
4530
4531                 return prev != GCB_XPG_XX;
4532             }
4533
4534         default:
4535             break;
4536     }
4537
4538 #ifdef DEBUGGING
4539     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4540                                   before, after, GCB_table[before][after]);
4541     assert(0);
4542 #endif
4543     return TRUE;
4544 }
4545
4546 STATIC GCB_enum
4547 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4548 {
4549     dVAR;
4550     GCB_enum gcb;
4551
4552     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4553
4554     if (*curpos < strbeg) {
4555         return GCB_EDGE;
4556     }
4557
4558     if (utf8_target) {
4559         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4560         U8 * prev_prev_char_pos;
4561
4562         if (! prev_char_pos) {
4563             return GCB_EDGE;
4564         }
4565
4566         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4567             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4568             *curpos = prev_char_pos;
4569             prev_char_pos = prev_prev_char_pos;
4570         }
4571         else {
4572             *curpos = (U8 *) strbeg;
4573             return GCB_EDGE;
4574         }
4575     }
4576     else {
4577         if (*curpos - 2 < strbeg) {
4578             *curpos = (U8 *) strbeg;
4579             return GCB_EDGE;
4580         }
4581         (*curpos)--;
4582         gcb = getGCB_VAL_CP(*(*curpos - 1));
4583     }
4584
4585     return gcb;
4586 }
4587
4588 /* Combining marks attach to most classes that precede them, but this defines
4589  * the exceptions (from TR14) */
4590 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4591                                      || prev == LB_Mandatory_Break      \
4592                                      || prev == LB_Carriage_Return      \
4593                                      || prev == LB_Line_Feed            \
4594                                      || prev == LB_Next_Line            \
4595                                      || prev == LB_Space                \
4596                                      || prev == LB_ZWSpace))
4597
4598 STATIC bool
4599 S_isLB(pTHX_ LB_enum before,
4600              LB_enum after,
4601              const U8 * const strbeg,
4602              const U8 * const curpos,
4603              const U8 * const strend,
4604              const bool utf8_target)
4605 {
4606     U8 * temp_pos = (U8 *) curpos;
4607     LB_enum prev = before;
4608
4609     /* Is the boundary between 'before' and 'after' line-breakable?
4610      * Most of this is just a table lookup of a generated table from Unicode
4611      * rules.  But some rules require context to decide, and so have to be
4612      * implemented in code */
4613
4614     PERL_ARGS_ASSERT_ISLB;
4615
4616     /* Rule numbers in the comments below are as of Unicode 9.0 */
4617
4618   redo:
4619     before = prev;
4620     switch (LB_table[before][after]) {
4621         case LB_BREAKABLE:
4622             return TRUE;
4623
4624         case LB_NOBREAK:
4625         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4626             return FALSE;
4627
4628         case LB_SP_foo + LB_BREAKABLE:
4629         case LB_SP_foo + LB_NOBREAK:
4630         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4631
4632             /* When we have something following a SP, we have to look at the
4633              * context in order to know what to do.
4634              *
4635              * SP SP should not reach here because LB7: Do not break before
4636              * spaces.  (For two spaces in a row there is nothing that
4637              * overrides that) */
4638             assert(after != LB_Space);
4639
4640             /* Here we have a space followed by a non-space.  Mostly this is a
4641              * case of LB18: "Break after spaces".  But there are complications
4642              * as the handling of spaces is somewhat tricky.  They are in a
4643              * number of rules, which have to be applied in priority order, but
4644              * something earlier in the string can cause a rule to be skipped
4645              * and a lower priority rule invoked.  A prime example is LB7 which
4646              * says don't break before a space.  But rule LB8 (lower priority)
4647              * says that the first break opportunity after a ZW is after any
4648              * span of spaces immediately after it.  If a ZW comes before a SP
4649              * in the input, rule LB8 applies, and not LB7.  Other such rules
4650              * involve combining marks which are rules 9 and 10, but they may
4651              * override higher priority rules if they come earlier in the
4652              * string.  Since we're doing random access into the middle of the
4653              * string, we have to look for rules that should get applied based
4654              * on both string position and priority.  Combining marks do not
4655              * attach to either ZW nor SP, so we don't have to consider them
4656              * until later.
4657              *
4658              * To check for LB8, we have to find the first non-space character
4659              * before this span of spaces */
4660             do {
4661                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4662             }
4663             while (prev == LB_Space);
4664
4665             /* LB8 Break before any character following a zero-width space,
4666              * even if one or more spaces intervene.
4667              *      ZW SP* ÷
4668              * So if we have a ZW just before this span, and to get here this
4669              * is the final space in the span. */
4670             if (prev == LB_ZWSpace) {
4671                 return TRUE;
4672             }
4673
4674             /* Here, not ZW SP+.  There are several rules that have higher
4675              * priority than LB18 and can be resolved now, as they don't depend
4676              * on anything earlier in the string (except ZW, which we have
4677              * already handled).  One of these rules is LB11 Do not break
4678              * before Word joiner, but we have specially encoded that in the
4679              * lookup table so it is caught by the single test below which
4680              * catches the other ones. */
4681             if (LB_table[LB_Space][after] - LB_SP_foo
4682                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4683             {
4684                 return FALSE;
4685             }
4686
4687             /* If we get here, we have to XXX consider combining marks. */
4688             if (prev == LB_Combining_Mark) {
4689
4690                 /* What happens with these depends on the character they
4691                  * follow.  */
4692                 do {
4693                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4694                 }
4695                 while (prev == LB_Combining_Mark);
4696
4697                 /* Most times these attach to and inherit the characteristics
4698                  * of that character, but not always, and when not, they are to
4699                  * be treated as AL by rule LB10. */
4700                 if (! LB_CM_ATTACHES_TO(prev)) {
4701                     prev = LB_Alphabetic;
4702                 }
4703             }
4704
4705             /* Here, we have the character preceding the span of spaces all set
4706              * up.  We follow LB18: "Break after spaces" unless the table shows
4707              * that is overriden */
4708             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4709
4710         case LB_CM_ZWJ_foo:
4711
4712             /* We don't know how to treat the CM except by looking at the first
4713              * non-CM character preceding it.  ZWJ is treated as CM */
4714             do {
4715                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4716             }
4717             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4718
4719             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4720              * attatches to it, then it inherits the behavior of 'prev'.  If it
4721              * doesn't attach, it is to be treated as an AL */
4722             if (! LB_CM_ATTACHES_TO(prev)) {
4723                 prev = LB_Alphabetic;
4724             }
4725
4726             goto redo;
4727
4728         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4729         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4730
4731             /* LB21a Don't break after Hebrew + Hyphen.
4732              * HL (HY | BA) × */
4733
4734             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4735                                                           == LB_Hebrew_Letter)
4736             {
4737                 return FALSE;
4738             }
4739
4740             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4741
4742         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4743         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4744
4745             /* LB25a (PR | PO) × ( OP | HY )? NU */
4746             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4747                 return FALSE;
4748             }
4749
4750             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4751                                                                 == LB_BREAKABLE;
4752
4753         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4754         case LB_SY_or_IS_then_various + LB_NOBREAK:
4755         {
4756             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4757
4758             LB_enum temp = prev;
4759             do {
4760                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4761             }
4762             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4763             if (temp == LB_Numeric) {
4764                 return FALSE;
4765             }
4766
4767             return LB_table[prev][after] - LB_SY_or_IS_then_various
4768                                                                == LB_BREAKABLE;
4769         }
4770
4771         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4772         case LB_various_then_PO_or_PR + LB_NOBREAK:
4773         {
4774             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4775
4776             LB_enum temp = prev;
4777             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4778             {
4779                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4780             }
4781             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4782                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4783             }
4784             if (temp == LB_Numeric) {
4785                 return FALSE;
4786             }
4787             return LB_various_then_PO_or_PR;
4788         }
4789
4790         case LB_RI_then_RI + LB_NOBREAK:
4791         case LB_RI_then_RI + LB_BREAKABLE:
4792             {
4793                 int RI_count = 1;
4794
4795                 /* LB30a Break between two regional indicator symbols if and
4796                  * only if there are an even number of regional indicators
4797                  * preceding the position of the break.
4798                  *
4799                  *    sot (RI RI)* RI × RI
4800                  *  [^RI] (RI RI)* RI × RI */
4801
4802                 while (backup_one_LB(strbeg,
4803                                      &temp_pos,
4804                                      utf8_target) == LB_Regional_Indicator)
4805                 {
4806                     RI_count++;
4807                 }
4808
4809                 return RI_count % 2 == 0;
4810             }
4811
4812         default:
4813             break;
4814     }
4815
4816 #ifdef DEBUGGING
4817     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4818                                   before, after, LB_table[before][after]);
4819     assert(0);
4820 #endif
4821     return TRUE;
4822 }
4823
4824 STATIC LB_enum
4825 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4826 {
4827     dVAR;
4828
4829     LB_enum lb;
4830
4831     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4832
4833     if (*curpos >= strend) {
4834         return LB_EDGE;
4835     }
4836
4837     if (utf8_target) {
4838         *curpos += UTF8SKIP(*curpos);
4839         if (*curpos >= strend) {
4840             return LB_EDGE;
4841         }
4842         lb = getLB_VAL_UTF8(*curpos, strend);
4843     }
4844     else {
4845         (*curpos)++;
4846         if (*curpos >= strend) {
4847             return LB_EDGE;
4848         }
4849         lb = getLB_VAL_CP(**curpos);
4850     }
4851
4852     return lb;
4853 }
4854
4855 STATIC LB_enum
4856 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4857 {
4858     dVAR;
4859     LB_enum lb;
4860
4861     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4862
4863     if (*curpos < strbeg) {
4864         return LB_EDGE;
4865     }
4866
4867     if (utf8_target) {
4868         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4869         U8 * prev_prev_char_pos;
4870
4871         if (! prev_char_pos) {
4872             return LB_EDGE;
4873         }
4874
4875         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4876             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4877             *curpos = prev_char_pos;
4878             prev_char_pos = prev_prev_char_pos;
4879         }
4880         else {
4881             *curpos = (U8 *) strbeg;
4882             return LB_EDGE;
4883         }
4884     }
4885     else {
4886         if (*curpos - 2 < strbeg) {
4887             *curpos = (U8 *) strbeg;
4888             return LB_EDGE;
4889         }
4890         (*curpos)--;
4891         lb = getLB_VAL_CP(*(*curpos - 1));
4892     }
4893
4894     return lb;
4895 }
4896
4897 STATIC bool
4898 S_isSB(pTHX_ SB_enum before,
4899              SB_enum after,
4900              const U8 * const strbeg,
4901              const U8 * const curpos,
4902              const U8 * const strend,
4903              const bool utf8_target)
4904 {
4905     /* returns a boolean indicating if there is a Sentence Boundary Break
4906      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4907
4908     U8 * lpos = (U8 *) curpos;
4909     bool has_para_sep = FALSE;
4910     bool has_sp = FALSE;
4911
4912     PERL_ARGS_ASSERT_ISSB;
4913
4914     /* Break at the start and end of text.
4915         SB1.  sot  ÷
4916         SB2.  ÷  eot
4917       But unstated in Unicode is don't break if the text is empty */
4918     if (before == SB_EDGE || after == SB_EDGE) {
4919         return before != after;
4920     }
4921
4922     /* SB 3: Do not break within CRLF. */
4923     if (before == SB_CR && after == SB_LF) {
4924         return FALSE;
4925     }
4926
4927     /* Break after paragraph separators.  CR and LF are considered
4928      * so because Unicode views text as like word processing text where there
4929      * are no newlines except between paragraphs, and the word processor takes
4930      * care of wrapping without there being hard line-breaks in the text *./
4931        SB4.  Sep | CR | LF  ÷ */
4932     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4933         return TRUE;
4934     }
4935
4936     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4937      * (See Section 6.2, Replacing Ignore Rules.)
4938         SB5.  X (Extend | Format)*  →  X */
4939     if (after == SB_Extend || after == SB_Format) {
4940
4941         /* Implied is that the these characters attach to everything
4942          * immediately prior to them except for those separator-type
4943          * characters.  And the rules earlier have already handled the case
4944          * when one of those immediately precedes the extend char */
4945         return FALSE;
4946     }
4947
4948     if (before == SB_Extend || before == SB_Format) {
4949         U8 * temp_pos = lpos;
4950         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4951         if (   backup != SB_EDGE
4952             && backup != SB_Sep
4953             && backup != SB_CR
4954             && backup != SB_LF)
4955         {
4956             before = backup;
4957             lpos = temp_pos;
4958         }
4959
4960         /* Here, both 'before' and 'backup' are these types; implied is that we
4961          * don't break between them */
4962         if (backup == SB_Extend || backup == SB_Format) {
4963             return FALSE;
4964         }
4965     }
4966
4967     /* Do not break after ambiguous terminators like period, if they are
4968      * immediately followed by a number or lowercase letter, if they are
4969      * between uppercase letters, if the first following letter (optionally
4970      * after certain punctuation) is lowercase, or if they are followed by
4971      * "continuation" punctuation such as comma, colon, or semicolon. For
4972      * example, a period may be an abbreviation or numeric period, and thus may
4973      * not mark the end of a sentence.
4974
4975      * SB6. ATerm  ×  Numeric */
4976     if (before == SB_ATerm && after == SB_Numeric) {
4977         return FALSE;
4978     }
4979
4980     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4981     if (before == SB_ATerm && after == SB_Upper) {
4982         U8 * temp_pos = lpos;
4983         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4984         if (backup == SB_Upper || backup == SB_Lower) {
4985             return FALSE;
4986         }
4987     }
4988
4989     /* The remaining rules that aren't the final one, all require an STerm or
4990      * an ATerm after having backed up over some Close* Sp*, and in one case an
4991      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4992      * So do that backup now, setting flags if either Sp or a paragraph
4993      * separator are found */
4994
4995     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4996         has_para_sep = TRUE;
4997         before = backup_one_SB(strbeg, &lpos, utf8_target);
4998     }
4999
5000     if (before == SB_Sp) {
5001         has_sp = TRUE;
5002         do {
5003             before = backup_one_SB(strbeg, &lpos, utf8_target);
5004         }
5005         while (before == SB_Sp);
5006     }
5007
5008     while (before == SB_Close) {
5009         before = backup_one_SB(strbeg, &lpos, utf8_target);
5010     }
5011
5012     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5013      * most cases an STerm */
5014     if (before == SB_STerm || before == SB_ATerm) {
5015
5016         /* So, here the lhs matches
5017          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5018          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5019          * The rules that apply here are:
5020          *
5021          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5022                                            | LF | STerm | ATerm) )* Lower
5023            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5024            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5025            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5026            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5027          */
5028
5029         /* And all but SB11 forbid having seen a paragraph separator */
5030         if (! has_para_sep) {
5031             if (before == SB_ATerm) {          /* SB8 */
5032                 U8 * rpos = (U8 *) curpos;
5033                 SB_enum later = after;
5034
5035                 while (    later != SB_OLetter
5036                         && later != SB_Upper
5037                         && later != SB_Lower
5038                         && later != SB_Sep
5039                         && later != SB_CR
5040                         && later != SB_LF
5041                         && later != SB_STerm
5042                         && later != SB_ATerm
5043                         && later != SB_EDGE)
5044                 {
5045                     later = advance_one_SB(&rpos, strend, utf8_target);
5046                 }
5047                 if (later == SB_Lower) {
5048                     return FALSE;
5049                 }
5050             }
5051
5052             if (   after == SB_SContinue    /* SB8a */
5053                 || after == SB_STerm
5054                 || after == SB_ATerm)
5055             {
5056                 return FALSE;
5057             }
5058
5059             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5060                 if (   after == SB_Close
5061                     || after == SB_Sp
5062                     || after == SB_Sep
5063                     || after == SB_CR
5064                     || after == SB_LF)
5065                 {
5066                     return FALSE;
5067                 }
5068             }
5069
5070             /* SB10.  This and SB9 could probably be combined some way, but khw
5071              * has decided to follow the Unicode rule book precisely for
5072              * simplified maintenance */
5073             if (   after == SB_Sp
5074                 || after == SB_Sep
5075                 || after == SB_CR
5076                 || after == SB_LF)
5077             {
5078                 return FALSE;
5079             }
5080         }
5081
5082         /* SB11.  */
5083         return TRUE;
5084     }
5085
5086     /* Otherwise, do not break.
5087     SB12.  Any  ×  Any */
5088
5089     return FALSE;
5090 }
5091
5092 STATIC SB_enum
5093 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5094 {
5095     dVAR;
5096     SB_enum sb;
5097
5098     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5099
5100     if (*curpos >= strend) {
5101         return SB_EDGE;
5102     }
5103
5104     if (utf8_target) {
5105         do {
5106             *curpos += UTF8SKIP(*curpos);
5107             if (*curpos >= strend) {
5108                 return SB_EDGE;
5109             }
5110             sb = getSB_VAL_UTF8(*curpos, strend);
5111         } while (sb == SB_Extend || sb == SB_Format);
5112     }
5113     else {
5114         do {
5115             (*curpos)++;
5116             if (*curpos >= strend) {
5117                 return SB_EDGE;
5118             }
5119             sb = getSB_VAL_CP(**curpos);
5120         } while (sb == SB_Extend || sb == SB_Format);
5121     }
5122
5123     return sb;
5124 }
5125
5126 STATIC SB_enum
5127 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5128 {
5129     dVAR;
5130     SB_enum sb;
5131
5132     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5133
5134     if (*curpos < strbeg) {
5135         return SB_EDGE;
5136     }
5137
5138     if (utf8_target) {
5139         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5140         if (! prev_char_pos) {
5141             return SB_EDGE;
5142         }
5143
5144         /* Back up over Extend and Format.  curpos is always just to the right
5145          * of the characater whose value we are getting */
5146         do {
5147             U8 * prev_prev_char_pos;
5148             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5149                                                                       strbeg)))
5150             {
5151                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5152                 *curpos = prev_char_pos;
5153                 prev_char_pos = prev_prev_char_pos;
5154             }
5155             else {
5156                 *curpos = (U8 *) strbeg;
5157                 return SB_EDGE;
5158             }
5159         } while (sb == SB_Extend || sb == SB_Format);
5160     }
5161     else {
5162         do {
5163             if (*curpos - 2 < strbeg) {
5164                 *curpos = (U8 *) strbeg;
5165                 return SB_EDGE;
5166             }
5167             (*curpos)--;
5168             sb = getSB_VAL_CP(*(*curpos - 1));
5169         } while (sb == SB_Extend || sb == SB_Format);
5170     }
5171
5172     return sb;
5173 }
5174
5175 STATIC bool
5176 S_isWB(pTHX_ WB_enum previous,
5177              WB_enum before,
5178              WB_enum after,
5179              const U8 * const strbeg,
5180              const U8 * const curpos,
5181              const U8 * const strend,
5182              const bool utf8_target)
5183 {
5184     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5185      *  a Unicode word break, using their published algorithm, but tailored for
5186      *  Perl by treating spans of white space as one unit.  Context may be
5187      *  needed to make this determination.  If the value for the character
5188      *  before 'before' is known, it is passed as 'previous'; otherwise that
5189      *  should be set to WB_UNKNOWN.  The other input parameters give the
5190      *  boundaries and current position in the matching of the string.  That
5191      *  is, 'curpos' marks the position where the character whose wb value is
5192      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5193
5194     U8 * before_pos = (U8 *) curpos;
5195     U8 * after_pos = (U8 *) curpos;
5196     WB_enum prev = before;
5197     WB_enum next;
5198
5199     PERL_ARGS_ASSERT_ISWB;
5200
5201     /* Rule numbers in the comments below are as of Unicode 9.0 */
5202
5203   redo:
5204     before = prev;
5205     switch (WB_table[before][after]) {
5206         case WB_BREAKABLE:
5207             return TRUE;
5208
5209         case WB_NOBREAK:
5210             return FALSE;
5211
5212         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5213             next = advance_one_WB(&after_pos, strend, utf8_target,
5214                                  FALSE /* Don't skip Extend nor Format */ );
5215             /* A space immediately preceeding an Extend or Format is attached
5216              * to by them, and hence gets separated from previous spaces.
5217              * Otherwise don't break between horizontal white space */
5218             return next == WB_Extend || next == WB_Format;
5219
5220         /* WB4 Ignore Format and Extend characters, except when they appear at
5221          * the beginning of a region of text.  This code currently isn't
5222          * general purpose, but it works as the rules are currently and likely
5223          * to be laid out.  The reason it works is that when 'they appear at
5224          * the beginning of a region of text', the rule is to break before
5225          * them, just like any other character.  Therefore, the default rule
5226          * applies and we don't have to look in more depth.  Should this ever
5227          * change, we would have to have 2 'case' statements, like in the rules
5228          * below, and backup a single character (not spacing over the extend
5229          * ones) and then see if that is one of the region-end characters and
5230          * go from there */
5231         case WB_Ex_or_FO_or_ZWJ_then_foo:
5232             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5233             goto redo;
5234
5235         case WB_DQ_then_HL + WB_BREAKABLE:
5236         case WB_DQ_then_HL + WB_NOBREAK:
5237
5238             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5239
5240             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5241                                                             == WB_Hebrew_Letter)
5242             {
5243                 return FALSE;
5244             }
5245
5246              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5247
5248         case WB_HL_then_DQ + WB_BREAKABLE:
5249         case WB_HL_then_DQ + WB_NOBREAK:
5250
5251             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5252
5253             if (advance_one_WB(&after_pos, strend, utf8_target,
5254                                        TRUE /* Do skip Extend and Format */ )
5255                                                             == WB_Hebrew_Letter)
5256             {
5257                 return FALSE;
5258             }
5259
5260             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5261
5262         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5263         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5264
5265             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5266              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5267
5268             next = advance_one_WB(&after_pos, strend, utf8_target,
5269                                        TRUE /* Do skip Extend and Format */ );
5270
5271             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5272             {
5273                 return FALSE;
5274             }
5275
5276             return WB_table[before][after]
5277                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5278
5279         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5280         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5281
5282             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5283              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5284
5285             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5286             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5287             {
5288                 return FALSE;
5289             }
5290
5291             return WB_table[before][after]
5292                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5293
5294         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5295         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5296
5297             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5298              * */
5299
5300             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5301                                                             == WB_Numeric)
5302             {
5303                 return FALSE;
5304             }
5305
5306             return WB_table[before][after]
5307                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5308
5309         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5310         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5311
5312             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5313
5314             if (advance_one_WB(&after_pos, strend, utf8_target,
5315                                        TRUE /* Do skip Extend and Format */ )
5316                                                             == WB_Numeric)
5317             {
5318                 return FALSE;
5319             }
5320
5321             return WB_table[before][after]
5322                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5323
5324         case WB_RI_then_RI + WB_NOBREAK:
5325         case WB_RI_then_RI + WB_BREAKABLE:
5326             {
5327                 int RI_count = 1;
5328
5329                 /* Do not break within emoji flag sequences. That is, do not
5330                  * break between regional indicator (RI) symbols if there is an
5331                  * odd number of RI characters before the potential break
5332                  * point.
5333                  *
5334                  * WB15   sot (RI RI)* RI × RI
5335                  * WB16 [^RI] (RI RI)* RI × RI */
5336
5337                 while (backup_one_WB(&previous,
5338                                      strbeg,
5339                                      &before_pos,
5340                                      utf8_target) == WB_Regional_Indicator)
5341                 {
5342                     RI_count++;
5343                 }
5344
5345                 return RI_count % 2 != 1;
5346             }
5347
5348         default:
5349             break;
5350     }
5351
5352 #ifdef DEBUGGING
5353     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5354                                   before, after, WB_table[before][after]);
5355     assert(0);
5356 #endif
5357     return TRUE;
5358 }
5359
5360 STATIC WB_enum
5361 S_advance_one_WB(pTHX_ U8 ** curpos,
5362                        const U8 * const strend,
5363                        const bool utf8_target,
5364                        const bool skip_Extend_Format)
5365 {
5366     dVAR;
5367     WB_enum wb;
5368
5369     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5370
5371     if (*curpos >= strend) {
5372         return WB_EDGE;
5373     }
5374
5375     if (utf8_target) {
5376
5377         /* Advance over Extend and Format */
5378         do {
5379             *curpos += UTF8SKIP(*curpos);
5380             if (*curpos >= strend) {
5381                 return WB_EDGE;
5382             }
5383             wb = getWB_VAL_UTF8(*curpos, strend);
5384         } while (    skip_Extend_Format
5385                  && (wb == WB_Extend || wb == WB_Format));
5386     }
5387     else {
5388         do {
5389             (*curpos)++;
5390             if (*curpos >= strend) {
5391                 return WB_EDGE;
5392             }
5393             wb = getWB_VAL_CP(**curpos);
5394         } while (    skip_Extend_Format
5395                  && (wb == WB_Extend || wb == WB_Format));
5396     }
5397
5398     return wb;
5399 }
5400
5401 STATIC WB_enum
5402 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5403 {
5404     dVAR;
5405     WB_enum wb;
5406
5407     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5408
5409     /* If we know what the previous character's break value is, don't have
5410         * to look it up */
5411     if (*previous != WB_UNKNOWN) {
5412         wb = *previous;
5413
5414         /* But we need to move backwards by one */
5415         if (utf8_target) {
5416             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5417             if (! *curpos) {
5418                 *previous = WB_EDGE;
5419                 *curpos = (U8 *) strbeg;
5420             }
5421             else {
5422                 *previous = WB_UNKNOWN;
5423             }
5424         }
5425         else {
5426             (*curpos)--;
5427             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5428         }
5429
5430         /* And we always back up over these three types */
5431         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5432             return wb;
5433         }
5434     }
5435
5436     if (*curpos < strbeg) {
5437         return WB_EDGE;
5438     }
5439
5440     if (utf8_target) {
5441         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5442         if (! prev_char_pos) {
5443             return WB_EDGE;
5444         }
5445
5446         /* Back up over Extend and Format.  curpos is always just to the right
5447          * of the characater whose value we are getting */
5448         do {
5449             U8 * prev_prev_char_pos;
5450             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5451                                                    -1,
5452                                                    strbeg)))
5453             {
5454                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5455                 *curpos = prev_char_pos;
5456                 prev_char_pos = prev_prev_char_pos;
5457             }
5458             else {
5459                 *curpos = (U8 *) strbeg;
5460                 return WB_EDGE;
5461             }
5462         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5463     }
5464     else {
5465         do {
5466             if (*curpos - 2 < strbeg) {
5467                 *curpos = (U8 *) strbeg;
5468                 return WB_EDGE;
5469             }
5470             (*curpos)--;
5471             wb = getWB_VAL_CP(*(*curpos - 1));
5472         } while (wb == WB_Extend || wb == WB_Format);
5473     }
5474
5475     return wb;
5476 }
5477
5478 /* Macros for regmatch(), using its internal variables */
5479 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
5480 #define NEXTCHR_IS_EOS (nextchr < 0)
5481
5482 #define SET_nextchr \
5483     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
5484
5485 #define SET_locinput(p) \
5486     locinput = (p);  \
5487     SET_nextchr
5488
5489 #define sayYES goto yes
5490 #define sayNO goto no
5491 #define sayNO_SILENT goto no_silent
5492
5493 /* we dont use STMT_START/END here because it leads to
5494    "unreachable code" warnings, which are bogus, but distracting. */
5495 #define CACHEsayNO \
5496     if (ST.cache_mask) \
5497        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
5498     sayNO
5499
5500 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5501 (                                                           \
5502     (   ( st )                                         ) && \
5503     (   ( st )->u.eval.close_paren                     ) && \
5504     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5505 )
5506
5507 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5508 (                                                           \
5509     (   ( st )                                         ) && \
5510     (   ( st )->u.eval.close_paren                     ) && \
5511     (   ( expr )                                       ) && \
5512     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5513 )
5514
5515
5516 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5517     (st)->u.eval.close_paren = ( (expr) + 1 )
5518
5519 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5520     (st)->u.eval.close_paren = 0
5521
5522 /* push a new state then goto it */
5523
5524 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
5525     pushinput = input; \
5526     pusheol = eol; \
5527     pushsr0 = sr0; \
5528     scan = node; \
5529     st->resume_state = state; \
5530     goto push_state;
5531
5532 /* push a new state with success backtracking, then goto it */
5533
5534 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
5535     pushinput = input; \
5536     pusheol = eol;     \
5537     pushsr0 = sr0; \
5538     scan = node; \
5539     st->resume_state = state; \
5540     goto push_yes_state;
5541
5542 #define DEBUG_STATE_pp(pp)                                  \
5543     DEBUG_STATE_r({                                         \
5544         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
5545         Perl_re_printf( aTHX_                               \
5546             "%*s" pp " %s%s%s%s%s\n",                       \
5547             INDENT_CHARS(depth), "",                        \
5548             PL_reg_name[st->resume_state],                  \
5549             ((st==yes_state||st==mark_state) ? "[" : ""),   \
5550             ((st==yes_state) ? "Y" : ""),                   \
5551             ((st==mark_state) ? "M" : ""),                  \
5552             ((st==yes_state||st==mark_state) ? "]" : "")    \
5553         );                                                  \
5554     });
5555
5556 /*
5557
5558 regmatch() - main matching routine
5559
5560 This is basically one big switch statement in a loop. We execute an op,
5561 set 'next' to point the next op, and continue. If we come to a point which
5562 we may need to backtrack to on failure such as (A|B|C), we push a
5563 backtrack state onto the backtrack stack. On failure, we pop the top
5564 state, and re-enter the loop at the state indicated. If there are no more
5565 states to pop, we return failure.
5566
5567 Sometimes we also need to backtrack on success; for example /A+/, where
5568 after successfully matching one A, we need to go back and try to
5569 match another one; similarly for lookahead assertions: if the assertion
5570 completes successfully, we backtrack to the state just before the assertion
5571 and then carry on.  In these cases, the pushed state is marked as
5572 'backtrack on success too'. This marking is in fact done by a chain of
5573 pointers, each pointing to the previous 'yes' state. On success, we pop to
5574 the nearest yes state, discarding any intermediate failure-only states.
5575 Sometimes a yes state is pushed just to force some cleanup code to be
5576 called at the end of a successful match or submatch; e.g. (??{$re}) uses
5577 it to free the inner regex.
5578
5579 Note that failure backtracking rewinds the cursor position, while
5580 success backtracking leaves it alone.
5581
5582 A pattern is complete when the END op is executed, while a subpattern
5583 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
5584 ops trigger the "pop to last yes state if any, otherwise return true"
5585 behaviour.
5586
5587 A common convention in this function is to use A and B to refer to the two
5588 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
5589 the subpattern to be matched possibly multiple times, while B is the entire
5590 rest of the pattern. Variable and state names reflect this convention.
5591
5592 The states in the main switch are the union of ops and failure/success of
5593 substates associated with with that op.  For example, IFMATCH is the op
5594 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
5595 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
5596 successfully matched A and IFMATCH_A_fail is a state saying that we have
5597 just failed to match A. Resume states always come in pairs. The backtrack
5598 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
5599 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
5600 on success or failure.
5601
5602 The struct that holds a backtracking state is actually a big union, with
5603 one variant for each major type of op. The variable st points to the
5604 top-most backtrack struct. To make the code clearer, within each
5605 block of code we #define ST to alias the relevant union.
5606
5607 Here's a concrete example of a (vastly oversimplified) IFMATCH
5608 implementation:
5609
5610     switch (state) {
5611     ....
5612
5613 #define ST st->u.ifmatch
5614
5615     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5616         ST.foo = ...; // some state we wish to save
5617         ...
5618         // push a yes backtrack state with a resume value of
5619         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
5620         // first node of A:
5621         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
5622         // NOTREACHED
5623
5624     case IFMATCH_A: // we have successfully executed A; now continue with B
5625         next = B;
5626         bar = ST.foo; // do something with the preserved value
5627         break;
5628
5629     case IFMATCH_A_fail: // A failed, so the assertion failed
5630         ...;   // do some housekeeping, then ...
5631         sayNO; // propagate the failure
5632
5633 #undef ST
5634
5635     ...
5636     }
5637
5638 For any old-timers reading this who are familiar with the old recursive
5639 approach, the code above is equivalent to:
5640
5641     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5642     {
5643         int foo = ...
5644         ...
5645         if (regmatch(A)) {
5646             next = B;
5647             bar = foo;
5648             break;
5649         }
5650         ...;   // do some housekeeping, then ...
5651         sayNO; // propagate the failure
5652     }
5653
5654 The topmost backtrack state, pointed to by st, is usually free. If you
5655 want to claim it, populate any ST.foo fields in it with values you wish to
5656 save, then do one of
5657
5658         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
5659         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
5660
5661 which sets that backtrack state's resume value to 'resume_state', pushes a
5662 new free entry to the top of the backtrack stack, then goes to 'node'.
5663 On backtracking, the free slot is popped, and the saved state becomes the
5664 new free state. An ST.foo field in this new top state can be temporarily
5665 accessed to retrieve values, but once the main loop is re-entered, it
5666 becomes available for reuse.
5667
5668 Note that the depth of the backtrack stack constantly increases during the
5669 left-to-right execution of the pattern, rather than going up and down with
5670 the pattern nesting. For example the stack is at its maximum at Z at the
5671 end of the pattern, rather than at X in the following:
5672
5673     /(((X)+)+)+....(Y)+....Z/
5674
5675 The only exceptions to this are lookahead/behind assertions and the cut,
5676 (?>A), which pop all the backtrack states associated with A before
5677 continuing.
5678
5679 Backtrack state structs are allocated in slabs of about 4K in size.
5680 PL_regmatch_state and st always point to the currently active state,
5681 and PL_regmatch_slab points to the slab currently containing
5682 PL_regmatch_state.  The first time regmatch() is called, the first slab is
5683 allocated, and is never freed until interpreter destruction. When the slab
5684 is full, a new one is allocated and chained to the end. At exit from
5685 regmatch(), slabs allocated since entry are freed.
5686
5687 In order to work with variable length lookbehinds, an upper limit is placed on
5688 lookbehinds which is set to where the match position is at the end of where the
5689 lookbehind would get to.  Nothing in the lookbehind should match above that,
5690 except we should be able to look beyond if for things like \b, which need the
5691 next character in the string to be able to determine if this is a boundary or
5692 not.  We also can't match the end of string/line unless we are also at the end
5693 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
5694 that match a width, we have to add a condition that they are within the legal
5695 bounds of our window into the string.
5696
5697 */
5698
5699 /* returns -1 on failure, $+[0] on success */
5700 STATIC SSize_t
5701 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5702 {
5703     dVAR;
5704     const bool utf8_target = reginfo->is_utf8_target;
5705     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5706     REGEXP *rex_sv = reginfo->prog;
5707     regexp *rex = ReANY(rex_sv);
5708     RXi_GET_DECL(rex,rexi);
5709     /* the current state. This is a cached copy of PL_regmatch_state */
5710     regmatch_state *st;
5711     /* cache heavy used fields of st in registers */
5712     regnode *scan;
5713     regnode *next;
5714     U32 n = 0;  /* general value; init to avoid compiler warning */
5715     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5716     SSize_t endref = 0; /* offset of end of backref when ln is start */
5717     char *locinput = startpos;
5718     char *loceol = reginfo->strend;
5719     char *pushinput; /* where to continue after a PUSH */
5720     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
5721     U8   *pushsr0;   /* save starting pos of script run */
5722     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5723
5724     bool result = 0;        /* return value of S_regmatch */
5725     U32 depth = 0;            /* depth of backtrack stack */
5726     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5727     const U32 max_nochange_depth =
5728         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5729         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5730     regmatch_state *yes_state = NULL; /* state to pop to on success of
5731                                                             subpattern */
5732     /* mark_state piggy backs on the yes_state logic so that when we unwind
5733        the stack on success we can update the mark_state as we go */
5734     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5735     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5736     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5737     U32 state_num;
5738     bool no_final = 0;      /* prevent failure from backtracking? */
5739     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5740     char *startpoint = locinput;
5741     SV *popmark = NULL;     /* are we looking for a mark? */
5742     SV *sv_commit = NULL;   /* last mark name seen in failure */
5743     SV *sv_yes_mark = NULL; /* last mark name we have seen
5744                                during a successful match */
5745     U32 lastopen = 0;       /* last open we saw */
5746     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5747     SV* const oreplsv = GvSVn(PL_replgv);
5748     /* these three flags are set by various ops to signal information to
5749      * the very next op. They have a useful lifetime of exactly one loop
5750      * iteration, and are not preserved or restored by state pushes/pops
5751      */
5752     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5753     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5754     int logical = 0;        /* the following EVAL is:
5755                                 0: (?{...})
5756                                 1: (?(?{...})X|Y)
5757                                 2: (??{...})
5758                                or the following IFMATCH/UNLESSM is:
5759                                 false: plain (?=foo)
5760                                 true:  used as a condition: (?(?=foo))
5761                             */
5762     PAD* last_pad = NULL;
5763     dMULTICALL;
5764     U8 gimme = G_SCALAR;
5765     CV *caller_cv = NULL;       /* who called us */
5766     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5767     U32 maxopenparen = 0;       /* max '(' index seen so far */
5768     int to_complement;  /* Invert the result? */
5769     _char_class_number classnum;
5770     bool is_utf8_pat = reginfo->is_utf8_pat;
5771     bool match = FALSE;
5772     I32 orig_savestack_ix = PL_savestack_ix;
5773     U8 * script_run_begin = NULL;
5774
5775 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5776 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5777 #  define SOLARIS_BAD_OPTIMIZER
5778     const U32 *pl_charclass_dup = PL_charclass;
5779 #  define PL_charclass pl_charclass_dup
5780 #endif
5781
5782 #ifdef DEBUGGING
5783     GET_RE_DEBUG_FLAGS_DECL;
5784 #endif
5785
5786     /* protect against undef(*^R) */
5787     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5788
5789     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5790     multicall_oldcatch = 0;
5791     PERL_UNUSED_VAR(multicall_cop);
5792
5793     PERL_ARGS_ASSERT_REGMATCH;
5794
5795     st = PL_regmatch_state;
5796
5797     /* Note that nextchr is a byte even in UTF */
5798     SET_nextchr;
5799     scan = prog;
5800
5801     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5802             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5803             Perl_re_printf( aTHX_ "regmatch start\n" );
5804     }));
5805
5806     while (scan != NULL) {
5807         next = scan + NEXT_OFF(scan);
5808         if (next == scan)
5809             next = NULL;
5810         state_num = OP(scan);
5811
5812       reenter_switch:
5813         DEBUG_EXECUTE_r(
5814             if (state_num <= REGNODE_MAX) {
5815                 SV * const prop = sv_newmortal();
5816                 regnode *rnext = regnext(scan);
5817
5818                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5819                 regprop(rex, prop, scan, reginfo, NULL);
5820                 Perl_re_printf( aTHX_
5821                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5822                     INDENT_CHARS(depth), "",
5823                     (IV)(scan - rexi->program),
5824                     SvPVX_const(prop),
5825                     (PL_regkind[OP(scan)] == END || !rnext) ?
5826                         0 : (IV)(rnext - rexi->program));
5827             }
5828         );
5829
5830         to_complement = 0;
5831
5832         SET_nextchr;
5833         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5834
5835         switch (state_num) {
5836         case SBOL: /*  /^../ and /\A../  */
5837             if (locinput == reginfo->strbeg)
5838                 break;
5839             sayNO;
5840
5841         case MBOL: /*  /^../m  */
5842             if (locinput == reginfo->strbeg ||
5843                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5844             {
5845                 break;
5846             }
5847             sayNO;
5848
5849         case GPOS: /*  \G  */
5850             if (locinput == reginfo->ganch)
5851                 break;
5852             sayNO;
5853
5854         case KEEPS: /*   \K  */
5855             /* update the startpoint */
5856             st->u.keeper.val = rex->offs[0].start;
5857             rex->offs[0].start = locinput - reginfo->strbeg;
5858             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
5859                             script_run_begin);
5860             NOT_REACHED; /* NOTREACHED */
5861
5862         case KEEPS_next_fail:
5863             /* rollback the start point change */
5864             rex->offs[0].start = st->u.keeper.val;
5865             sayNO_SILENT;
5866             NOT_REACHED; /* NOTREACHED */
5867
5868         case MEOL: /* /..$/m  */
5869             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5870                 sayNO;
5871             break;
5872
5873         case SEOL: /* /..$/  */
5874             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5875                 sayNO;
5876             if (reginfo->strend - locinput > 1)
5877                 sayNO;
5878             break;
5879
5880         case EOS: /*  \z  */
5881             if (!NEXTCHR_IS_EOS)
5882                 sayNO;
5883             break;
5884
5885         case SANY: /*  /./s  */
5886             if (NEXTCHR_IS_EOS || locinput >= loceol)
5887                 sayNO;
5888             goto increment_locinput;
5889
5890         case REG_ANY: /*  /./  */
5891             if (   NEXTCHR_IS_EOS
5892                 || locinput >= loceol
5893                 || nextchr == '\n')
5894             {
5895                 sayNO;
5896             }
5897             goto increment_locinput;
5898
5899
5900 #undef  ST
5901 #define ST st->u.trie
5902         case TRIEC: /* (ab|cd) with known charclass */
5903             /* In this case the charclass data is available inline so
5904                we can fail fast without a lot of extra overhead.
5905              */
5906             if ( !   NEXTCHR_IS_EOS
5907                 &&   locinput < loceol
5908                 && ! ANYOF_BITMAP_TEST(scan, nextchr))
5909             {
5910                 DEBUG_EXECUTE_r(
5911                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5912                               depth, PL_colors[4], PL_colors[5])
5913                 );
5914                 sayNO_SILENT;
5915                 NOT_REACHED; /* NOTREACHED */
5916             }
5917             /* FALLTHROUGH */
5918         case TRIE:  /* (ab|cd)  */
5919             /* the basic plan of execution of the trie is:
5920              * At the beginning, run though all the states, and
5921              * find the longest-matching word. Also remember the position
5922              * of the shortest matching word. For example, this pattern:
5923              *    1  2 3 4    5
5924              *    ab|a|x|abcd|abc
5925              * when matched against the string "abcde", will generate
5926              * accept states for all words except 3, with the longest
5927              * matching word being 4, and the shortest being 2 (with
5928              * the position being after char 1 of the string).
5929              *
5930              * Then for each matching word, in word order (i.e. 1,2,4,5),
5931              * we run the remainder of the pattern; on each try setting
5932              * the current position to the character following the word,
5933              * returning to try the next word on failure.
5934              *
5935              * We avoid having to build a list of words at runtime by
5936              * using a compile-time structure, wordinfo[].prev, which
5937              * gives, for each word, the previous accepting word (if any).
5938              * In the case above it would contain the mappings 1->2, 2->0,
5939              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5940              * the longest word (4 above), a list of all words, by
5941              * following the list of prev pointers; this gives us the
5942              * unordered list 4,5,1,2. Then given the current word we have
5943              * just tried, we can go through the list and find the
5944              * next-biggest word to try (so if we just failed on word 2,
5945              * the next in the list is 4).
5946              *
5947              * Since at runtime we don't record the matching position in
5948              * the string for each word, we have to work that out for
5949              * each word we're about to process. The wordinfo table holds
5950              * the character length of each word; given that we recorded
5951              * at the start: the position of the shortest word and its
5952              * length in chars, we just need to move the pointer the
5953              * difference between the two char lengths. Depending on
5954              * Unicode status and folding, that's cheap or expensive.
5955              *
5956              * This algorithm is optimised for the case where are only a
5957              * small number of accept states, i.e. 0,1, or maybe 2.
5958              * With lots of accepts states, and having to try all of them,
5959              * it becomes quadratic on number of accept states to find all
5960              * the next words.
5961              */
5962
5963             {
5964                 /* what type of TRIE am I? (utf8 makes this contextual) */
5965                 DECL_TRIE_TYPE(scan);
5966
5967                 /* what trie are we using right now */
5968                 reg_trie_data * const trie
5969                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5970                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5971                 U32 state = trie->startstate;
5972
5973                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5974                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5975                     if (utf8_target
5976                         && ! NEXTCHR_IS_EOS
5977                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5978                         && scan->flags == EXACTL)
5979                     {
5980                         /* We only output for EXACTL, as we let the folder
5981                          * output this message for EXACTFLU8 to avoid
5982                          * duplication */
5983                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5984                                                                reginfo->strend);
5985                     }
5986                 }
5987                 if (   trie->bitmap
5988                     && (     NEXTCHR_IS_EOS
5989                         ||   locinput >= loceol
5990                         || ! TRIE_BITMAP_TEST(trie, nextchr)))
5991                 {
5992                     if (trie->states[ state ].wordnum) {
5993                          DEBUG_EXECUTE_r(
5994                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5995                                           depth, PL_colors[4], PL_colors[5])
5996                         );
5997                         if (!trie->jump)
5998                             break;
5999                     } else {
6000                         DEBUG_EXECUTE_r(
6001                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6002                                           depth, PL_colors[4], PL_colors[5])
6003                         );
6004                         sayNO_SILENT;
6005                    }
6006                 }
6007
6008             {
6009                 U8 *uc = ( U8* )locinput;
6010
6011                 STRLEN len = 0;
6012                 STRLEN foldlen = 0;
6013                 U8 *uscan = (U8*)NULL;
6014                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6015                 U32 charcount = 0; /* how many input chars we have matched */
6016                 U32 accepted = 0; /* have we seen any accepting states? */
6017
6018                 ST.jump = trie->jump;
6019                 ST.me = scan;
6020                 ST.firstpos = NULL;
6021                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6022                 ST.nextword = 0;
6023
6024                 /* fully traverse the TRIE; note the position of the
6025                    shortest accept state and the wordnum of the longest
6026                    accept state */
6027
6028                 while ( state && uc <= (U8*)(loceol) ) {
6029                     U32 base = trie->states[ state ].trans.base;
6030                     UV uvc = 0;
6031                     U16 charid = 0;
6032                     U16 wordnum;
6033                     wordnum = trie->states[ state ].wordnum;
6034
6035                     if (wordnum) { /* it's an accept state */
6036                         if (!accepted) {
6037                             accepted = 1;
6038                             /* record first match position */
6039                             if (ST.longfold) {
6040                                 ST.firstpos = (U8*)locinput;
6041                                 ST.firstchars = 0;
6042                             }
6043                             else {
6044                                 ST.firstpos = uc;
6045                                 ST.firstchars = charcount;
6046                             }
6047                         }
6048                         if (!ST.nextword || wordnum < ST.nextword)
6049                             ST.nextword = wordnum;
6050                         ST.topword = wordnum;
6051                     }
6052
6053                     DEBUG_TRIE_EXECUTE_r({
6054                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6055                                 /* HERE */
6056                                 PerlIO_printf( Perl_debug_log,
6057                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6058                                     INDENT_CHARS(depth), "", PL_colors[4],
6059                                     (UV)state, (accepted ? 'Y' : 'N'));
6060                     });
6061
6062                     /* read a char and goto next state */
6063                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6064                         I32 offset;
6065                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6066                                              (U8 *) loceol, uscan,
6067                                              len, uvc, charid, foldlen,
6068                                              foldbuf, uniflags);
6069                         charcount++;
6070                         if (foldlen>0)
6071                             ST.longfold = TRUE;
6072                         if (charid &&
6073                              ( ((offset =
6074                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6075
6076                              && ((U32)offset < trie->lasttrans)
6077                              && trie->trans[offset].check == state)
6078                         {
6079                             state = trie->trans[offset].next;
6080                         }
6081                         else {
6082                             state = 0;
6083                         }
6084                         uc += len;
6085
6086                     }
6087                     else {
6088                         state = 0;
6089                     }
6090                     DEBUG_TRIE_EXECUTE_r(
6091                         Perl_re_printf( aTHX_
6092                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6093                             charid, uvc, (UV)state, PL_colors[5] );
6094                     );
6095                 }
6096                 if (!accepted)
6097                    sayNO;
6098
6099                 /* calculate total number of accept states */
6100                 {
6101                     U16 w = ST.topword;
6102                     accepted = 0;
6103                     while (w) {
6104                         w = trie->wordinfo[w].prev;
6105                         accepted++;
6106                     }
6107                     ST.accepted = accepted;
6108                 }
6109
6110                 DEBUG_EXECUTE_r(
6111                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6112                         depth,
6113                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6114                 );
6115                 goto trie_first_try; /* jump into the fail handler */
6116             }}
6117             NOT_REACHED; /* NOTREACHED */
6118
6119         case TRIE_next_fail: /* we failed - try next alternative */
6120         {
6121             U8 *uc;
6122             if ( ST.jump ) {
6123                 /* undo any captures done in the tail part of a branch,
6124                  * e.g.
6125                  *    /(?:X(.)(.)|Y(.)).../
6126                  * where the trie just matches X then calls out to do the
6127                  * rest of the branch */
6128                 REGCP_UNWIND(ST.cp);
6129                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6130             }
6131             if (!--ST.accepted) {
6132                 DEBUG_EXECUTE_r({
6133                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6134                         depth,
6135                         PL_colors[4],
6136                         PL_colors[5] );
6137                 });
6138                 sayNO_SILENT;
6139             }
6140             {
6141                 /* Find next-highest word to process.  Note that this code
6142                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6143                 U16 min = 0;
6144                 U16 word;
6145                 U16 const nextword = ST.nextword;
6146                 reg_trie_wordinfo * const wordinfo
6147                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6148                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6149                     if (word > nextword && (!min || word < min))
6150                         min = word;
6151                 }
6152                 ST.nextword = min;
6153             }
6154
6155           trie_first_try:
6156             if (do_cutgroup) {
6157                 do_cutgroup = 0;
6158                 no_final = 0;
6159             }
6160
6161             if ( ST.jump ) {
6162                 ST.lastparen = rex->lastparen;
6163                 ST.lastcloseparen = rex->lastcloseparen;
6164                 REGCP_SET(ST.cp);
6165             }
6166
6167             /* find start char of end of current word */
6168             {
6169                 U32 chars; /* how many chars to skip */
6170                 reg_trie_data * const trie
6171                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6172
6173                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6174                             >=  ST.firstchars);
6175                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6176                             - ST.firstchars;
6177                 uc = ST.firstpos;
6178
6179                 if (ST.longfold) {
6180                     /* the hard option - fold each char in turn and find
6181                      * its folded length (which may be different */
6182                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6183                     STRLEN foldlen;
6184                     STRLEN len;
6185                     UV uvc;
6186                     U8 *uscan;
6187
6188                     while (chars) {
6189                         if (utf8_target) {
6190                             /* XXX This assumes the length is well-formed, as
6191                              * does the UTF8SKIP below */
6192                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6193                                                     uniflags);
6194                             uc += len;
6195                         }
6196                         else {
6197                             uvc = *uc;
6198                             uc++;
6199                         }
6200                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6201                         uscan = foldbuf;
6202                         while (foldlen) {
6203                             if (!--chars)
6204                                 break;
6205                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6206                                                  uniflags);
6207                             uscan += len;
6208                             foldlen -= len;
6209                         }
6210                     }
6211                 }
6212                 else {
6213                     if (utf8_target)
6214                         while (chars--)
6215                             uc += UTF8SKIP(uc);
6216                     else
6217                         uc += chars;
6218                 }
6219             }
6220
6221             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6222                             ? ST.jump[ST.nextword]
6223                             : NEXT_OFF(ST.me));
6224
6225             DEBUG_EXECUTE_r({
6226                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6227                     depth,
6228                     PL_colors[4],
6229                     ST.nextword,
6230                     PL_colors[5]
6231                     );
6232             });
6233
6234             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6235                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6236                                 script_run_begin);
6237                 NOT_REACHED; /* NOTREACHED */
6238             }
6239             /* only one choice left - just continue */
6240             DEBUG_EXECUTE_r({
6241                 AV *const trie_words
6242                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6243                 SV ** const tmp = trie_words
6244                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6245                 SV *sv= tmp ? sv_newmortal() : NULL;
6246
6247                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6248                     depth, PL_colors[4],
6249                     ST.nextword,
6250                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6251                             PL_colors[0], PL_colors[1],
6252                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6253                         )
6254                     : "not compiled under -Dr",
6255                     PL_colors[5] );
6256             });
6257
6258             locinput = (char*)uc;
6259             continue; /* execute rest of RE */
6260             /* NOTREACHED */
6261         }
6262 #undef  ST
6263
6264         case EXACTL:             /*  /abc/l       */
6265             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6266
6267             /* Complete checking would involve going through every character
6268              * matched by the string to see if any is above latin1.  But the
6269              * comparision otherwise might very well be a fast assembly
6270              * language routine, and I (khw) don't think slowing things down
6271              * just to check for this warning is worth it.  So this just checks
6272              * the first character */
6273             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6274                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6275             }
6276             goto do_exact;
6277         case EXACT_ONLY8:
6278             if (! utf8_target) {
6279                 sayNO;
6280             }
6281             /* FALLTHROUGH */
6282         case EXACT: {            /*  /abc/        */
6283             char *s;
6284           do_exact:
6285             s = STRING(scan);
6286             ln = STR_LEN(scan);
6287             if (utf8_target != is_utf8_pat) {
6288                 /* The target and the pattern have differing utf8ness. */
6289                 char *l = locinput;
6290                 const char * const e = s + ln;
6291
6292                 if (utf8_target) {
6293                     /* The target is utf8, the pattern is not utf8.
6294                      * Above-Latin1 code points can't match the pattern;
6295                      * invariants match exactly, and the other Latin1 ones need
6296                      * to be downgraded to a single byte in order to do the
6297                      * comparison.  (If we could be confident that the target
6298                      * is not malformed, this could be refactored to have fewer
6299                      * tests by just assuming that if the first bytes match, it
6300                      * is an invariant, but there are tests in the test suite
6301                      * dealing with (??{...}) which violate this) */
6302                     while (s < e) {
6303                         if (   l >= loceol
6304                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6305                         {
6306                             sayNO;
6307                         }
6308                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6309                             if (*l != *s) {
6310                                 sayNO;
6311                             }
6312                             l++;
6313                         }
6314                         else {
6315                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6316                             {
6317                                 sayNO;
6318                             }
6319                             l += 2;
6320                         }
6321                         s++;
6322                     }
6323                 }
6324                 else {
6325                     /* The target is not utf8, the pattern is utf8. */
6326                     while (s < e) {
6327                         if (   l >= loceol
6328                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6329                         {
6330                             sayNO;
6331                         }
6332                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6333                             if (*s != *l) {
6334                                 sayNO;
6335                             }
6336                             s++;
6337                         }
6338                         else {
6339                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6340                             {
6341                                 sayNO;
6342                             }
6343                             s += 2;
6344                         }
6345                         l++;
6346                     }
6347                 }
6348                 locinput = l;
6349             }
6350             else {
6351                 /* The target and the pattern have the same utf8ness. */
6352                 /* Inline the first character, for speed. */
6353                 if (   loceol - locinput < ln
6354                     || UCHARAT(s) != nextchr
6355                     || (ln > 1 && memNE(s, locinput, ln)))
6356                 {
6357                     sayNO;
6358                 }
6359                 locinput += ln;
6360             }
6361             break;
6362             }
6363
6364         case EXACTFL:            /*  /abc/il      */
6365           {
6366             re_fold_t folder;
6367             const U8 * fold_array;
6368             const char * s;
6369             U32 fold_utf8_flags;
6370
6371             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6372             folder = foldEQ_locale;
6373             fold_array = PL_fold_locale;
6374             fold_utf8_flags = FOLDEQ_LOCALE;
6375             goto do_exactf;
6376
6377         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6378                                       is effectively /u; hence to match, target
6379                                       must be UTF-8. */
6380             if (! utf8_target) {
6381                 sayNO;
6382             }
6383             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6384                                              | FOLDEQ_S2_FOLDS_SANE;
6385             folder = foldEQ_latin1_s2_folded;
6386             fold_array = PL_fold_latin1;
6387             goto do_exactf;
6388
6389         case EXACTFU_ONLY8:      /* /abc/iu with something in /abc/ > 255 */
6390             if (! utf8_target) {
6391                 sayNO;
6392             }
6393             assert(is_utf8_pat);
6394             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6395             goto do_exactf;
6396
6397         case EXACTFUP:          /*  /foo/iu, and something is problematic in
6398                                     'foo' so can't take shortcuts. */
6399             assert(! is_utf8_pat);
6400             folder = foldEQ_latin1;
6401             fold_array = PL_fold_latin1;
6402             fold_utf8_flags = 0;
6403             goto do_exactf;
6404
6405         case EXACTFU:            /*  /abc/iu      */
6406             folder = foldEQ_latin1_s2_folded;
6407             fold_array = PL_fold_latin1;
6408             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6409             goto do_exactf;
6410
6411         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6412                                    patterns */
6413             assert(! is_utf8_pat);
6414             /* FALLTHROUGH */
6415         case EXACTFAA:            /*  /abc/iaa     */
6416             folder = foldEQ_latin1_s2_folded;
6417             fold_array = PL_fold_latin1;
6418             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6419             if (is_utf8_pat || ! utf8_target) {
6420
6421                 /* The possible presence of a MICRO SIGN in the pattern forbids
6422                  * us to view a non-UTF-8 pattern as folded when there is a
6423                  * UTF-8 target */
6424                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
6425                                   |FOLDEQ_S2_FOLDS_SANE;
6426             }
6427             goto do_exactf;
6428
6429
6430         case EXACTF:             /*  /abc/i    This node only generated for
6431                                                non-utf8 patterns */
6432             assert(! is_utf8_pat);
6433             folder = foldEQ;
6434             fold_array = PL_fold;
6435             fold_utf8_flags = 0;
6436
6437           do_exactf:
6438             s = STRING(scan);
6439             ln = STR_LEN(scan);
6440
6441             if (   utf8_target
6442                 || is_utf8_pat
6443                 || state_num == EXACTFUP
6444                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6445             {
6446               /* Either target or the pattern are utf8, or has the issue where
6447                * the fold lengths may differ. */
6448                 const char * const l = locinput;
6449                 char *e = loceol;
6450
6451                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
6452                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
6453                 {
6454                     sayNO;
6455                 }
6456                 locinput = e;
6457                 break;
6458             }
6459
6460             /* Neither the target nor the pattern are utf8 */
6461             if (UCHARAT(s) != nextchr
6462                 && !NEXTCHR_IS_EOS
6463                 && UCHARAT(s) != fold_array[nextchr])
6464             {
6465                 sayNO;
6466             }
6467             if (loceol - locinput < ln)
6468                 sayNO;
6469             if (ln > 1 && ! folder(locinput, s, ln))
6470                 sayNO;
6471             locinput += ln;
6472             break;
6473         }
6474
6475         case NBOUNDL: /*  /\B/l  */
6476             to_complement = 1;
6477             /* FALLTHROUGH */
6478
6479         case BOUNDL:  /*  /\b/l  */
6480         {
6481             bool b1, b2;
6482             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6483
6484             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6485                 if (! IN_UTF8_CTYPE_LOCALE) {
6486                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6487                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6488                 }
6489                 goto boundu;
6490             }
6491
6492             if (utf8_target) {
6493                 if (locinput == reginfo->strbeg)
6494                     b1 = isWORDCHAR_LC('\n');
6495                 else {
6496                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6497                                                         (U8*)(reginfo->strbeg)),
6498                                                  (U8*)(reginfo->strend));
6499                 }
6500                 b2 = (NEXTCHR_IS_EOS)
6501                     ? isWORDCHAR_LC('\n')
6502                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6503                                               (U8*) reginfo->strend);
6504             }
6505             else { /* Here the string isn't utf8 */
6506                 b1 = (locinput == reginfo->strbeg)
6507                      ? isWORDCHAR_LC('\n')
6508                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6509                 b2 = (NEXTCHR_IS_EOS)
6510                     ? isWORDCHAR_LC('\n')
6511                     : isWORDCHAR_LC(nextchr);
6512             }
6513             if (to_complement ^ (b1 == b2)) {
6514                 sayNO;
6515             }
6516             break;
6517         }
6518
6519         case NBOUND:  /*  /\B/   */
6520             to_complement = 1;
6521             /* FALLTHROUGH */
6522
6523         case BOUND:   /*  /\b/   */
6524             if (utf8_target) {
6525                 goto bound_utf8;
6526             }
6527             goto bound_ascii_match_only;
6528
6529         case NBOUNDA: /*  /\B/a  */
6530             to_complement = 1;
6531             /* FALLTHROUGH */
6532
6533         case BOUNDA:  /*  /\b/a  */
6534         {
6535             bool b1, b2;
6536
6537           bound_ascii_match_only:
6538             /* Here the string isn't utf8, or is utf8 and only ascii characters
6539              * are to match \w.  In the latter case looking at the byte just
6540              * prior to the current one may be just the final byte of a
6541              * multi-byte character.  This is ok.  There are two cases:
6542              * 1) it is a single byte character, and then the test is doing
6543              *    just what it's supposed to.
6544              * 2) it is a multi-byte character, in which case the final byte is
6545              *    never mistakable for ASCII, and so the test will say it is
6546              *    not a word character, which is the correct answer. */
6547             b1 = (locinput == reginfo->strbeg)
6548                  ? isWORDCHAR_A('\n')
6549                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6550             b2 = (NEXTCHR_IS_EOS)
6551                 ? isWORDCHAR_A('\n')
6552                 : isWORDCHAR_A(nextchr);
6553             if (to_complement ^ (b1 == b2)) {
6554                 sayNO;
6555             }
6556             break;
6557         }
6558
6559         case NBOUNDU: /*  /\B/u  */
6560             to_complement = 1;
6561             /* FALLTHROUGH */
6562
6563         case BOUNDU:  /*  /\b/u  */
6564
6565           boundu:
6566             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6567                 match = FALSE;
6568             }
6569             else if (utf8_target) {
6570               bound_utf8:
6571                 switch((bound_type) FLAGS(scan)) {
6572                     case TRADITIONAL_BOUND:
6573                     {
6574                         bool b1, b2;
6575                         b1 = (locinput == reginfo->strbeg)
6576                              ? 0 /* isWORDCHAR_L1('\n') */
6577                              : isWORDCHAR_utf8_safe(
6578                                                reghop3((U8*)locinput,
6579                                                        -1,
6580                                                        (U8*)(reginfo->strbeg)),
6581                                                     (U8*) reginfo->strend);
6582                         b2 = (NEXTCHR_IS_EOS)
6583                             ? 0 /* isWORDCHAR_L1('\n') */
6584                             : isWORDCHAR_utf8_safe((U8*)locinput,
6585                                                    (U8*) reginfo->strend);
6586                         match = cBOOL(b1 != b2);
6587                         break;
6588                     }
6589                     case GCB_BOUND:
6590                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6591                             match = TRUE; /* GCB always matches at begin and
6592                                              end */
6593                         }
6594                         else {
6595                             /* Find the gcb values of previous and current
6596                              * chars, then see if is a break point */
6597                             match = isGCB(getGCB_VAL_UTF8(
6598                                                 reghop3((U8*)locinput,
6599                                                         -1,
6600                                                         (U8*)(reginfo->strbeg)),
6601                                                 (U8*) reginfo->strend),
6602                                           getGCB_VAL_UTF8((U8*) locinput,
6603                                                         (U8*) reginfo->strend),
6604                                           (U8*) reginfo->strbeg,
6605                                           (U8*) locinput,
6606                                           utf8_target);
6607                         }
6608                         break;
6609
6610                     case LB_BOUND:
6611                         if (locinput == reginfo->strbeg) {
6612                             match = FALSE;
6613                         }
6614                         else if (NEXTCHR_IS_EOS) {
6615                             match = TRUE;
6616                         }
6617                         else {
6618                             match = isLB(getLB_VAL_UTF8(
6619                                                 reghop3((U8*)locinput,
6620                                                         -1,
6621                                                         (U8*)(reginfo->strbeg)),
6622                                                 (U8*) reginfo->strend),
6623                                           getLB_VAL_UTF8((U8*) locinput,
6624                                                         (U8*) reginfo->strend),
6625                                           (U8*) reginfo->strbeg,
6626                                           (U8*) locinput,
6627                                           (U8*) reginfo->strend,
6628                                           utf8_target);
6629                         }
6630                         break;
6631
6632                     case SB_BOUND: /* Always matches at begin and end */
6633                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6634                             match = TRUE;
6635                         }
6636                         else {
6637                             match = isSB(getSB_VAL_UTF8(
6638                                                 reghop3((U8*)locinput,
6639                                                         -1,
6640                                                         (U8*)(reginfo->strbeg)),
6641                                                 (U8*) reginfo->strend),
6642                                           getSB_VAL_UTF8((U8*) locinput,
6643                                                         (U8*) reginfo->strend),
6644                                           (U8*) reginfo->strbeg,
6645                                           (U8*) locinput,
6646                                           (U8*) reginfo->strend,
6647                                           utf8_target);
6648                         }
6649                         break;
6650
6651                     case WB_BOUND:
6652                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6653                             match = TRUE;
6654                         }
6655                         else {
6656                             match = isWB(WB_UNKNOWN,
6657                                          getWB_VAL_UTF8(
6658                                                 reghop3((U8*)locinput,
6659                                                         -1,
6660                                                         (U8*)(reginfo->strbeg)),
6661                                                 (U8*) reginfo->strend),
6662                                           getWB_VAL_UTF8((U8*) locinput,
6663                                                         (U8*) reginfo->strend),
6664                                           (U8*) reginfo->strbeg,
6665                                           (U8*) locinput,
6666                                           (U8*) reginfo->strend,
6667                                           utf8_target);
6668                         }
6669                         break;
6670                 }
6671             }
6672             else {  /* Not utf8 target */
6673                 switch((bound_type) FLAGS(scan)) {
6674                     case TRADITIONAL_BOUND:
6675                     {
6676                         bool b1, b2;
6677                         b1 = (locinput == reginfo->strbeg)
6678                             ? 0 /* isWORDCHAR_L1('\n') */
6679                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6680                         b2 = (NEXTCHR_IS_EOS)
6681                             ? 0 /* isWORDCHAR_L1('\n') */
6682                             : isWORDCHAR_L1(nextchr);
6683                         match = cBOOL(b1 != b2);
6684                         break;
6685                     }
6686
6687                     case GCB_BOUND:
6688                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6689                             match = TRUE; /* GCB always matches at begin and
6690                                              end */
6691                         }
6692                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6693                                    range */
6694                             match =    UCHARAT(locinput - 1) != '\r'
6695                                     || UCHARAT(locinput) != '\n';
6696                         }
6697                         break;
6698
6699                     case LB_BOUND:
6700                         if (locinput == reginfo->strbeg) {
6701                             match = FALSE;
6702                         }
6703                         else if (NEXTCHR_IS_EOS) {
6704                             match = TRUE;
6705                         }
6706                         else {
6707                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6708                                          getLB_VAL_CP(UCHARAT(locinput)),
6709                                          (U8*) reginfo->strbeg,
6710                                          (U8*) locinput,
6711                                          (U8*) reginfo->strend,
6712                                          utf8_target);
6713                         }
6714                         break;
6715
6716                     case SB_BOUND: /* Always matches at begin and end */
6717                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6718                             match = TRUE;
6719                         }
6720                         else {
6721                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6722                                          getSB_VAL_CP(UCHARAT(locinput)),
6723                                          (U8*) reginfo->strbeg,
6724                                          (U8*) locinput,
6725                                          (U8*) reginfo->strend,
6726                                          utf8_target);
6727                         }
6728                         break;
6729
6730                     case WB_BOUND:
6731                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6732                             match = TRUE;
6733                         }
6734                         else {
6735                             match = isWB(WB_UNKNOWN,
6736                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6737                                          getWB_VAL_CP(UCHARAT(locinput)),
6738                                          (U8*) reginfo->strbeg,
6739                                          (U8*) locinput,
6740                                          (U8*) reginfo->strend,
6741                                          utf8_target);
6742                         }
6743                         break;
6744                 }
6745             }
6746
6747             if (to_complement ^ ! match) {
6748                 sayNO;
6749             }
6750             break;
6751
6752         case ANYOFPOSIXL:
6753         case ANYOFL:  /*  /[abc]/l      */
6754             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6755
6756             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6757             {
6758               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6759             }
6760             /* FALLTHROUGH */
6761         case ANYOFD:  /*   /[abc]/d       */
6762         case ANYOF:  /*   /[abc]/       */
6763             if (NEXTCHR_IS_EOS || locinput >= loceol)
6764                 sayNO;
6765             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
6766                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
6767             {
6768                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
6769                     sayNO;
6770                 }
6771                 locinput++;
6772             }
6773             else {
6774                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6775                                                                    utf8_target))
6776                 {
6777                     sayNO;
6778                 }
6779                 goto increment_locinput;
6780             }
6781             break;
6782
6783         case ANYOFM:
6784             if (   NEXTCHR_IS_EOS
6785                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
6786                 || locinput >= loceol)
6787             {
6788                 sayNO;
6789             }
6790             locinput++; /* ANYOFM is always single byte */
6791             break;
6792
6793         case NANYOFM:
6794             if (   NEXTCHR_IS_EOS
6795                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
6796                 || locinput >= loceol)
6797             {
6798                 sayNO;
6799             }
6800             goto increment_locinput;
6801             break;
6802
6803         case ANYOFH:
6804             if (   ! utf8_target
6805                 ||   NEXTCHR_IS_EOS
6806                 ||  (   ANYOF_FLAGS(scan) != 0
6807                      && ANYOF_FLAGS(scan) != (U8) *locinput)
6808                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6809                                                                    utf8_target))
6810             {
6811                 sayNO;
6812             }
6813             goto increment_locinput;
6814             break;
6815
6816         /* The argument (FLAGS) to all the POSIX node types is the class number
6817          * */
6818
6819         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6820             to_complement = 1;
6821             /* FALLTHROUGH */
6822
6823         case POSIXL:    /* \w or [:punct:] etc. under /l */
6824             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6825             if (NEXTCHR_IS_EOS || locinput >= loceol)
6826                 sayNO;
6827
6828             /* Use isFOO_lc() for characters within Latin1.  (Note that
6829              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6830              * wouldn't be invariant) */
6831             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6832                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6833                     sayNO;
6834                 }
6835
6836                 locinput++;
6837                 break;
6838             }
6839
6840             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6841                 /* An above Latin-1 code point, or malformed */
6842                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6843                                                        reginfo->strend);
6844                 goto utf8_posix_above_latin1;
6845             }
6846
6847             /* Here is a UTF-8 variant code point below 256 and the target is
6848              * UTF-8 */
6849             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6850                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6851                                             *(locinput + 1))))))
6852             {
6853                 sayNO;
6854             }
6855
6856             goto increment_locinput;
6857
6858         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6859             to_complement = 1;
6860             /* FALLTHROUGH */
6861
6862         case POSIXD:    /* \w or [:punct:] etc. under /d */
6863             if (utf8_target) {
6864                 goto utf8_posix;
6865             }
6866             goto posixa;
6867
6868         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6869
6870             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6871                 sayNO;
6872             }
6873
6874             /* All UTF-8 variants match */
6875             if (! UTF8_IS_INVARIANT(nextchr)) {
6876                 goto increment_locinput;
6877             }
6878
6879             to_complement = 1;
6880             goto join_nposixa;
6881
6882         case POSIXA:    /* \w or [:punct:] etc. under /a */
6883
6884           posixa:
6885             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6886              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6887              * character is a single byte */
6888
6889             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6890                 sayNO;
6891             }
6892
6893           join_nposixa:
6894
6895             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6896                                                                 FLAGS(scan)))))
6897             {
6898                 sayNO;
6899             }
6900
6901             /* Here we are either not in utf8, or we matched a utf8-invariant,
6902              * so the next char is the next byte */
6903             locinput++;
6904             break;
6905
6906         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6907             to_complement = 1;
6908             /* FALLTHROUGH */
6909
6910         case POSIXU:    /* \w or [:punct:] etc. under /u */
6911           utf8_posix:
6912             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6913                 sayNO;
6914             }
6915
6916             /* Use _generic_isCC() for characters within Latin1.  (Note that
6917              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6918              * wouldn't be invariant) */
6919             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6920                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6921                                                            FLAGS(scan)))))
6922                 {
6923                     sayNO;
6924                 }
6925                 locinput++;
6926             }
6927             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6928                 if (! (to_complement
6929                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6930                                                                *(locinput + 1)),
6931                                              FLAGS(scan)))))
6932                 {
6933                     sayNO;
6934                 }
6935                 locinput += 2;
6936             }
6937             else {  /* Handle above Latin-1 code points */
6938               utf8_posix_above_latin1:
6939                 classnum = (_char_class_number) FLAGS(scan);
6940                 switch (classnum) {
6941                     default:
6942                         if (! (to_complement
6943                            ^ cBOOL(_invlist_contains_cp(
6944                                       PL_XPosix_ptrs[classnum],
6945                                       utf8_to_uvchr_buf((U8 *) locinput,
6946                                                         (U8 *) reginfo->strend,
6947                                                         NULL)))))
6948                         {
6949                             sayNO;
6950                         }
6951                         break;
6952                     case _CC_ENUM_SPACE:
6953                         if (! (to_complement
6954                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6955                         {
6956                             sayNO;
6957                         }
6958                         break;
6959                     case _CC_ENUM_BLANK:
6960                         if (! (to_complement
6961                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6962                         {
6963                             sayNO;
6964                         }
6965                         break;
6966                     case _CC_ENUM_XDIGIT:
6967                         if (! (to_complement
6968                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6969                         {
6970                             sayNO;
6971                         }
6972                         break;
6973                     case _CC_ENUM_VERTSPACE:
6974                         if (! (to_complement
6975                                         ^ cBOOL(is_VERTWS_high(locinput))))
6976                         {
6977                             sayNO;
6978                         }
6979                         break;
6980                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6981                     case _CC_ENUM_ASCII:
6982                         if (! to_complement) {
6983                             sayNO;
6984                         }
6985                         break;
6986                 }
6987                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
6988             }
6989             break;
6990
6991         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6992                        a Unicode extended Grapheme Cluster */
6993             if (NEXTCHR_IS_EOS || locinput >= loceol)
6994                 sayNO;
6995             if  (! utf8_target) {
6996
6997                 /* Match either CR LF  or '.', as all the other possibilities
6998                  * require utf8 */
6999                 locinput++;         /* Match the . or CR */
7000                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
7001                                        match the LF */
7002                     && locinput <  loceol
7003                     && UCHARAT(locinput) == '\n')
7004                 {
7005                     locinput++;
7006                 }
7007             }
7008             else {
7009
7010                 /* Get the gcb type for the current character */
7011                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7012                                                        (U8*) reginfo->strend);
7013
7014                 /* Then scan through the input until we get to the first
7015                  * character whose type is supposed to be a gcb with the
7016                  * current character.  (There is always a break at the
7017                  * end-of-input) */
7018                 locinput += UTF8SKIP(locinput);
7019                 while (locinput < loceol) {
7020                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7021                                                          (U8*) reginfo->strend);
7022                     if (isGCB(prev_gcb, cur_gcb,
7023                               (U8*) reginfo->strbeg, (U8*) locinput,
7024                               utf8_target))
7025                     {
7026                         break;
7027                     }
7028
7029                     prev_gcb = cur_gcb;
7030                     locinput += UTF8SKIP(locinput);
7031                 }
7032
7033
7034             }
7035             break;
7036
7037         case NREFFL:  /*  /\g{name}/il  */
7038         {   /* The capture buffer cases.  The ones beginning with N for the
7039                named buffers just convert to the equivalent numbered and
7040                pretend they were called as the corresponding numbered buffer
7041                op.  */
7042             /* don't initialize these in the declaration, it makes C++
7043                unhappy */
7044             const char *s;
7045             char type;
7046             re_fold_t folder;
7047             const U8 *fold_array;
7048             UV utf8_fold_flags;
7049
7050             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7051             folder = foldEQ_locale;
7052             fold_array = PL_fold_locale;
7053             type = REFFL;
7054             utf8_fold_flags = FOLDEQ_LOCALE;
7055             goto do_nref;
7056
7057         case NREFFA:  /*  /\g{name}/iaa  */
7058             folder = foldEQ_latin1;
7059             fold_array = PL_fold_latin1;
7060             type = REFFA;
7061             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7062             goto do_nref;
7063
7064         case NREFFU:  /*  /\g{name}/iu  */
7065             folder = foldEQ_latin1;
7066             fold_array = PL_fold_latin1;
7067             type = REFFU;
7068             utf8_fold_flags = 0;
7069             goto do_nref;
7070
7071         case NREFF:  /*  /\g{name}/i  */
7072             folder = foldEQ;
7073             fold_array = PL_fold;
7074             type = REFF;
7075             utf8_fold_flags = 0;
7076             goto do_nref;
7077
7078         case NREF:  /*  /\g{name}/   */
7079             type = REF;
7080             folder = NULL;
7081             fold_array = NULL;
7082             utf8_fold_flags = 0;
7083           do_nref:
7084
7085             /* For the named back references, find the corresponding buffer
7086              * number */
7087             n = reg_check_named_buff_matched(rex,scan);
7088
7089             if ( ! n ) {
7090                 sayNO;
7091             }
7092             goto do_nref_ref_common;
7093
7094         case REFFL:  /*  /\1/il  */
7095             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7096             folder = foldEQ_locale;
7097             fold_array = PL_fold_locale;
7098             utf8_fold_flags = FOLDEQ_LOCALE;
7099             goto do_ref;
7100
7101         case REFFA:  /*  /\1/iaa  */
7102             folder = foldEQ_latin1;
7103             fold_array = PL_fold_latin1;
7104             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7105             goto do_ref;
7106
7107         case REFFU:  /*  /\1/iu  */
7108             folder = foldEQ_latin1;
7109             fold_array = PL_fold_latin1;
7110             utf8_fold_flags = 0;
7111             goto do_ref;
7112
7113         case REFF:  /*  /\1/i  */
7114             folder = foldEQ;
7115             fold_array = PL_fold;
7116             utf8_fold_flags = 0;
7117             goto do_ref;
7118
7119         case REF:  /*  /\1/    */
7120             folder = NULL;
7121             fold_array = NULL;
7122             utf8_fold_flags = 0;
7123
7124           do_ref:
7125             type = OP(scan);
7126             n = ARG(scan);  /* which paren pair */
7127
7128           do_nref_ref_common:
7129             ln = rex->offs[n].start;
7130             endref = rex->offs[n].end;
7131             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7132             if (rex->lastparen < n || ln == -1 || endref == -1)
7133                 sayNO;                  /* Do not match unless seen CLOSEn. */
7134             if (ln == endref)
7135                 break;
7136
7137             s = reginfo->strbeg + ln;
7138             if (type != REF     /* REF can do byte comparison */
7139                 && (utf8_target || type == REFFU || type == REFFL))
7140             {
7141                 char * limit = loceol;
7142
7143                 /* This call case insensitively compares the entire buffer
7144                     * at s, with the current input starting at locinput, but
7145                     * not going off the end given by loceol, and
7146                     * returns in <limit> upon success, how much of the
7147                     * current input was matched */
7148                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7149                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7150                 {
7151                     sayNO;
7152                 }
7153                 locinput = limit;
7154                 break;
7155             }
7156
7157             /* Not utf8:  Inline the first character, for speed. */
7158             if ( ! NEXTCHR_IS_EOS
7159                 && locinput < loceol
7160                 && UCHARAT(s) != nextchr
7161                 && (   type == REF
7162                     || UCHARAT(s) != fold_array[nextchr]))
7163             {
7164                 sayNO;
7165             }
7166             ln = endref - ln;
7167             if (locinput + ln > loceol)
7168                 sayNO;
7169             if (ln > 1 && (type == REF
7170                            ? memNE(s, locinput, ln)
7171                            : ! folder(locinput, s, ln)))
7172                 sayNO;
7173             locinput += ln;
7174             break;
7175         }
7176
7177         case NOTHING: /* null op; e.g. the 'nothing' following
7178                        * the '*' in m{(a+|b)*}' */
7179             break;
7180         case TAIL: /* placeholder while compiling (A|B|C) */
7181             break;
7182
7183 #undef  ST
7184 #define ST st->u.eval
7185 #define CUR_EVAL cur_eval->u.eval
7186
7187         {
7188             SV *ret;
7189             REGEXP *re_sv;
7190             regexp *re;
7191             regexp_internal *rei;
7192             regnode *startpoint;
7193             U32 arg;
7194
7195         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7196             arg= (U32)ARG(scan);
7197             if (cur_eval && cur_eval->locinput == locinput) {
7198                 if ( ++nochange_depth > max_nochange_depth )
7199                     Perl_croak(aTHX_
7200                         "Pattern subroutine nesting without pos change"
7201                         " exceeded limit in regex");
7202             } else {
7203                 nochange_depth = 0;
7204             }
7205             re_sv = rex_sv;
7206             re = rex;
7207             rei = rexi;
7208             startpoint = scan + ARG2L(scan);
7209             EVAL_CLOSE_PAREN_SET( st, arg );
7210             /* Detect infinite recursion
7211              *
7212              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7213              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7214              * So we track the position in the string we are at each time
7215              * we recurse and if we try to enter the same routine twice from
7216              * the same position we throw an error.
7217              */
7218             if ( rex->recurse_locinput[arg] == locinput ) {
7219                 /* FIXME: we should show the regop that is failing as part
7220                  * of the error message. */
7221                 Perl_croak(aTHX_ "Infinite recursion in regex");
7222             } else {
7223                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7224                 rex->recurse_locinput[arg]= locinput;
7225
7226                 DEBUG_r({
7227                     GET_RE_DEBUG_FLAGS_DECL;
7228                     DEBUG_STACK_r({
7229                         Perl_re_exec_indentf( aTHX_
7230                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7231                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7232                         );
7233                     });
7234                 });
7235             }
7236
7237             /* Save all the positions seen so far. */
7238             ST.cp = regcppush(rex, 0, maxopenparen);
7239             REGCP_SET(ST.lastcp);
7240
7241             /* and then jump to the code we share with EVAL */
7242             goto eval_recurse_doit;
7243             /* NOTREACHED */
7244
7245         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7246             if (cur_eval && cur_eval->locinput==locinput) {
7247                 if ( ++nochange_depth > max_nochange_depth )
7248                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7249             } else {
7250                 nochange_depth = 0;
7251             }
7252             {
7253                 /* execute the code in the {...} */
7254
7255                 dSP;
7256                 IV before;
7257                 OP * const oop = PL_op;
7258                 COP * const ocurcop = PL_curcop;
7259                 OP *nop;
7260                 CV *newcv;
7261
7262                 /* save *all* paren positions */
7263                 regcppush(rex, 0, maxopenparen);
7264                 REGCP_SET(ST.lastcp);
7265
7266                 if (!caller_cv)
7267                     caller_cv = find_runcv(NULL);
7268
7269                 n = ARG(scan);
7270
7271                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7272                     newcv = (ReANY(
7273                                     (REGEXP*)(rexi->data->data[n])
7274                             ))->qr_anoncv;
7275                     nop = (OP*)rexi->data->data[n+1];
7276                 }
7277                 else if (rexi->data->what[n] == 'l') { /* literal code */
7278                     newcv = caller_cv;
7279                     nop = (OP*)rexi->data->data[n];
7280                     assert(CvDEPTH(newcv));
7281                 }
7282                 else {
7283                     /* literal with own CV */
7284                     assert(rexi->data->what[n] == 'L');
7285                     newcv = rex->qr_anoncv;
7286                     nop = (OP*)rexi->data->data[n];
7287                 }
7288
7289                 /* Some notes about MULTICALL and the context and save stacks.
7290                  *
7291                  * In something like
7292                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7293                  * since codeblocks don't introduce a new scope (so that
7294                  * local() etc accumulate), at the end of a successful
7295                  * match there will be a SAVEt_CLEARSV on the savestack
7296                  * for each of $x, $y, $z. If the three code blocks above
7297                  * happen to have come from different CVs (e.g. via
7298                  * embedded qr//s), then we must ensure that during any
7299                  * savestack unwinding, PL_comppad always points to the
7300                  * right pad at each moment. We achieve this by
7301                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7302                  * there is a change of pad.
7303                  * In theory whenever we call a code block, we should
7304                  * push a CXt_SUB context, then pop it on return from
7305                  * that code block. This causes a bit of an issue in that
7306                  * normally popping a context also clears the savestack
7307                  * back to cx->blk_oldsaveix, but here we specifically
7308                  * don't want to clear the save stack on exit from the
7309                  * code block.
7310                  * Also for efficiency we don't want to keep pushing and
7311                  * popping the single SUB context as we backtrack etc.
7312                  * So instead, we push a single context the first time
7313                  * we need, it, then hang onto it until the end of this
7314                  * function. Whenever we encounter a new code block, we
7315                  * update the CV etc if that's changed. During the times
7316                  * in this function where we're not executing a code
7317                  * block, having the SUB context still there is a bit
7318                  * naughty - but we hope that no-one notices.
7319                  * When the SUB context is initially pushed, we fake up
7320                  * cx->blk_oldsaveix to be as if we'd pushed this context
7321                  * on first entry to S_regmatch rather than at some random
7322                  * point during the regexe execution. That way if we
7323                  * croak, popping the context stack will ensure that
7324                  * *everything* SAVEd by this function is undone and then
7325                  * the context popped, rather than e.g., popping the
7326                  * context (and restoring the original PL_comppad) then
7327                  * popping more of the savestack and restoring a bad
7328                  * PL_comppad.
7329                  */
7330
7331                 /* If this is the first EVAL, push a MULTICALL. On
7332                  * subsequent calls, if we're executing a different CV, or
7333                  * if PL_comppad has got messed up from backtracking
7334                  * through SAVECOMPPADs, then refresh the context.
7335                  */
7336                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7337                 {
7338                     U8 flags = (CXp_SUB_RE |
7339                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7340                     SAVECOMPPAD();
7341                     if (last_pushed_cv) {
7342                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7343                     }
7344                     else {
7345                         PUSH_MULTICALL_FLAGS(newcv, flags);
7346                     }
7347                     /* see notes above */
7348                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7349
7350                     last_pushed_cv = newcv;
7351                 }
7352                 else {
7353                     /* these assignments are just to silence compiler
7354                      * warnings */
7355                     multicall_cop = NULL;
7356                 }
7357                 last_pad = PL_comppad;
7358
7359                 /* the initial nextstate you would normally execute
7360                  * at the start of an eval (which would cause error
7361                  * messages to come from the eval), may be optimised
7362                  * away from the execution path in the regex code blocks;
7363                  * so manually set PL_curcop to it initially */
7364                 {
7365                     OP *o = cUNOPx(nop)->op_first;
7366                     assert(o->op_type == OP_NULL);
7367                     if (o->op_targ == OP_SCOPE) {
7368                         o = cUNOPo->op_first;
7369                     }
7370                     else {
7371                         assert(o->op_targ == OP_LEAVE);
7372                         o = cUNOPo->op_first;
7373                         assert(o->op_type == OP_ENTER);
7374                         o = OpSIBLING(o);
7375                     }
7376
7377                     if (o->op_type != OP_STUB) {
7378                         assert(    o->op_type == OP_NEXTSTATE
7379                                 || o->op_type == OP_DBSTATE
7380                                 || (o->op_type == OP_NULL
7381                                     &&  (  o->op_targ == OP_NEXTSTATE
7382                                         || o->op_targ == OP_DBSTATE
7383                                         )
7384                                     )
7385                         );
7386                         PL_curcop = (COP*)o;
7387                     }
7388                 }
7389                 nop = nop->op_next;
7390
7391                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7392                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7393
7394                 rex->offs[0].end = locinput - reginfo->strbeg;
7395                 if (reginfo->info_aux_eval->pos_magic)
7396                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7397                                   reginfo->sv, reginfo->strbeg,
7398                                   locinput - reginfo->strbeg);
7399
7400                 if (sv_yes_mark) {
7401                     SV *sv_mrk = get_sv("REGMARK", 1);
7402                     sv_setsv(sv_mrk, sv_yes_mark);
7403                 }
7404
7405                 /* we don't use MULTICALL here as we want to call the
7406                  * first op of the block of interest, rather than the
7407                  * first op of the sub. Also, we don't want to free
7408                  * the savestack frame */
7409                 before = (IV)(SP-PL_stack_base);
7410                 PL_op = nop;
7411                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7412                 SPAGAIN;
7413                 if ((IV)(SP-PL_stack_base) == before)
7414                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7415                 else {
7416                     ret = POPs;
7417                     PUTBACK;
7418                 }
7419
7420                 /* before restoring everything, evaluate the returned
7421                  * value, so that 'uninit' warnings don't use the wrong
7422                  * PL_op or pad. Also need to process any magic vars
7423                  * (e.g. $1) *before* parentheses are restored */
7424
7425                 PL_op = NULL;
7426
7427                 re_sv = NULL;
7428                 if (logical == 0) {       /*   (?{})/   */
7429                     SV *replsv = save_scalar(PL_replgv);
7430                     sv_setsv(replsv, ret); /* $^R */
7431                     SvSETMAGIC(replsv);
7432                 }
7433                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7434                     sw = cBOOL(SvTRUE_NN(ret));
7435                     logical = 0;
7436                 }
7437                 else {                   /*  /(??{})  */
7438                     /*  if its overloaded, let the regex compiler handle
7439                      *  it; otherwise extract regex, or stringify  */
7440                     if (SvGMAGICAL(ret))
7441                         ret = sv_mortalcopy(ret);
7442                     if (!SvAMAGIC(ret)) {
7443                         SV *sv = ret;
7444                         if (SvROK(sv))
7445                             sv = SvRV(sv);
7446                         if (SvTYPE(sv) == SVt_REGEXP)
7447                             re_sv = (REGEXP*) sv;
7448                         else if (SvSMAGICAL(ret)) {
7449                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7450                             if (mg)
7451                                 re_sv = (REGEXP *) mg->mg_obj;
7452                         }
7453
7454                         /* force any undef warnings here */
7455                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7456                             ret = sv_mortalcopy(ret);
7457                             (void) SvPV_force_nolen(ret);
7458                         }
7459                     }
7460
7461                 }
7462
7463                 /* *** Note that at this point we don't restore
7464                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7465                  * be used again soon. This is safe as long as nothing
7466                  * in the regexp code uses the pad ! */
7467                 PL_op = oop;
7468                 PL_curcop = ocurcop;
7469                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7470                 PL_curpm_under = PL_curpm;
7471                 PL_curpm = PL_reg_curpm;
7472
7473                 if (logical != 2) {
7474                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
7475                                     script_run_begin);
7476                     /* NOTREACHED */
7477                 }
7478             }
7479
7480                 /* only /(??{})/  from now on */
7481                 logical = 0;
7482                 {
7483                     /* extract RE object from returned value; compiling if
7484                      * necessary */
7485
7486                     if (re_sv) {
7487                         re_sv = reg_temp_copy(NULL, re_sv);
7488                     }
7489                     else {
7490                         U32 pm_flags = 0;
7491
7492                         if (SvUTF8(ret) && IN_BYTES) {
7493                             /* In use 'bytes': make a copy of the octet
7494                              * sequence, but without the flag on */
7495                             STRLEN len;
7496                             const char *const p = SvPV(ret, len);
7497                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7498                         }
7499                         if (rex->intflags & PREGf_USE_RE_EVAL)
7500                             pm_flags |= PMf_USE_RE_EVAL;
7501
7502                         /* if we got here, it should be an engine which
7503                          * supports compiling code blocks and stuff */
7504                         assert(rex->engine && rex->engine->op_comp);
7505                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7506                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7507                                     rex->engine, NULL, NULL,
7508                                     /* copy /msixn etc to inner pattern */
7509                                     ARG2L(scan),
7510                                     pm_flags);
7511
7512                         if (!(SvFLAGS(ret)
7513                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7514                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7515                             /* This isn't a first class regexp. Instead, it's
7516                                caching a regexp onto an existing, Perl visible
7517                                scalar.  */
7518                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7519                         }
7520                     }
7521                     SAVEFREESV(re_sv);
7522                     re = ReANY(re_sv);
7523                 }
7524                 RXp_MATCH_COPIED_off(re);
7525                 re->subbeg = rex->subbeg;
7526                 re->sublen = rex->sublen;
7527                 re->suboffset = rex->suboffset;
7528                 re->subcoffset = rex->subcoffset;
7529                 re->lastparen = 0;
7530                 re->lastcloseparen = 0;
7531                 rei = RXi_GET(re);
7532                 DEBUG_EXECUTE_r(
7533                     debug_start_match(re_sv, utf8_target, locinput,
7534                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7535                 );
7536                 startpoint = rei->program + 1;
7537                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7538                                              * close_paren only for GOSUB */
7539                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7540                 /* Save all the seen positions so far. */
7541                 ST.cp = regcppush(rex, 0, maxopenparen);
7542                 REGCP_SET(ST.lastcp);
7543                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7544                 maxopenparen = 0;
7545                 /* run the pattern returned from (??{...}) */
7546
7547               eval_recurse_doit: /* Share code with GOSUB below this line
7548                             * At this point we expect the stack context to be
7549                             * set up correctly */
7550
7551                 /* invalidate the S-L poscache. We're now executing a
7552                  * different set of WHILEM ops (and their associated
7553                  * indexes) against the same string, so the bits in the
7554                  * cache are meaningless. Setting maxiter to zero forces
7555                  * the cache to be invalidated and zeroed before reuse.
7556                  * XXX This is too dramatic a measure. Ideally we should
7557                  * save the old cache and restore when running the outer
7558                  * pattern again */
7559                 reginfo->poscache_maxiter = 0;
7560
7561                 /* the new regexp might have a different is_utf8_pat than we do */
7562                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7563
7564                 ST.prev_rex = rex_sv;
7565                 ST.prev_curlyx = cur_curlyx;
7566                 rex_sv = re_sv;
7567                 SET_reg_curpm(rex_sv);
7568                 rex = re;
7569                 rexi = rei;
7570                 cur_curlyx = NULL;
7571                 ST.B = next;
7572                 ST.prev_eval = cur_eval;
7573                 cur_eval = st;
7574                 /* now continue from first node in postoned RE */
7575                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
7576                                     loceol, script_run_begin);
7577                 NOT_REACHED; /* NOTREACHED */
7578         }
7579
7580         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7581             /* note: this is called twice; first after popping B, then A */
7582             DEBUG_STACK_r({
7583                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7584                     depth, cur_eval, ST.prev_eval);
7585             });
7586
7587 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7588             if ( cur_eval && CUR_EVAL.close_paren ) {\
7589                 DEBUG_STACK_r({ \
7590                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7591                         depth,    \
7592                         CUR_EVAL.close_paren - 1,\
7593                         cur_eval, \
7594                         VAL);     \
7595                 });               \
7596                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7597             }
7598
7599             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7600
7601             rex_sv = ST.prev_rex;
7602             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7603             SET_reg_curpm(rex_sv);
7604             rex = ReANY(rex_sv);
7605             rexi = RXi_GET(rex);
7606             {
7607                 /* preserve $^R across LEAVE's. See Bug 121070. */
7608                 SV *save_sv= GvSV(PL_replgv);
7609                 SV *replsv;
7610                 SvREFCNT_inc(save_sv);
7611                 regcpblow(ST.cp); /* LEAVE in disguise */
7612                 /* don't move this initialization up */
7613                 replsv = GvSV(PL_replgv);
7614                 sv_setsv(replsv, save_sv);
7615                 SvSETMAGIC(replsv);
7616                 SvREFCNT_dec(save_sv);
7617             }
7618             cur_eval = ST.prev_eval;
7619             cur_curlyx = ST.prev_curlyx;
7620
7621             /* Invalidate cache. See "invalidate" comment above. */
7622             reginfo->poscache_maxiter = 0;
7623             if ( nochange_depth )
7624                 nochange_depth--;
7625
7626             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7627             sayYES;
7628
7629
7630         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7631             REGCP_UNWIND(ST.lastcp);
7632             sayNO;
7633
7634         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7635             /* note: this is called twice; first after popping B, then A */
7636             DEBUG_STACK_r({
7637                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7638                     depth, cur_eval, ST.prev_eval);
7639             });
7640
7641             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7642
7643             rex_sv = ST.prev_rex;
7644             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7645             SET_reg_curpm(rex_sv);
7646             rex = ReANY(rex_sv);
7647             rexi = RXi_GET(rex);
7648
7649             REGCP_UNWIND(ST.lastcp);
7650             regcppop(rex, &maxopenparen);
7651             cur_eval = ST.prev_eval;
7652             cur_curlyx = ST.prev_curlyx;
7653
7654             /* Invalidate cache. See "invalidate" comment above. */
7655             reginfo->poscache_maxiter = 0;
7656             if ( nochange_depth )
7657                 nochange_depth--;
7658
7659             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7660             sayNO_SILENT;
7661 #undef ST
7662
7663         case OPEN: /*  (  */
7664             n = ARG(scan);  /* which paren pair */
7665             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7666             if (n > maxopenparen)
7667                 maxopenparen = n;
7668             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7669                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7670                 depth,
7671                 PTR2UV(rex),
7672                 PTR2UV(rex->offs),
7673                 (UV)n,
7674                 (IV)rex->offs[n].start_tmp,
7675                 (UV)maxopenparen
7676             ));
7677             lastopen = n;
7678             break;
7679
7680         case SROPEN: /*  (*SCRIPT_RUN:  */
7681             script_run_begin = (U8 *) locinput;
7682             break;
7683
7684
7685         case CLOSE:  /*  )  */
7686             n = ARG(scan);  /* which paren pair */
7687             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7688                              locinput - reginfo->strbeg);
7689             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7690                 goto fake_end;
7691
7692             break;
7693
7694         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7695
7696             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7697             {
7698                 sayNO;
7699             }
7700
7701             break;
7702
7703
7704         case ACCEPT:  /*  (*ACCEPT)  */
7705             if (scan->flags)
7706                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7707             if (ARG2L(scan)){
7708                 regnode *cursor;
7709                 for (cursor=scan;
7710                      cursor && OP(cursor)!=END;
7711                      cursor=regnext(cursor))
7712                 {
7713                     if ( OP(cursor)==CLOSE ){
7714                         n = ARG(cursor);
7715                         if ( n <= lastopen ) {
7716                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7717                                              locinput - reginfo->strbeg);
7718                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7719                                 break;
7720                         }
7721                     }
7722                 }
7723             }
7724             goto fake_end;
7725             /* NOTREACHED */
7726
7727         case GROUPP:  /*  (?(1))  */
7728             n = ARG(scan);  /* which paren pair */
7729             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7730             break;
7731
7732         case NGROUPP:  /*  (?(<name>))  */
7733             /* reg_check_named_buff_matched returns 0 for no match */
7734             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7735             break;
7736
7737         case INSUBP:   /*  (?(R))  */
7738             n = ARG(scan);
7739             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7740              * of SCAN is already set up as matches a eval.close_paren */
7741             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7742             break;
7743
7744         case DEFINEP:  /*  (?(DEFINE))  */
7745             sw = 0;
7746             break;
7747
7748         case IFTHEN:   /*  (?(cond)A|B)  */
7749             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7750             if (sw)
7751                 next = NEXTOPER(NEXTOPER(scan));
7752             else {
7753                 next = scan + ARG(scan);
7754                 if (OP(next) == IFTHEN) /* Fake one. */
7755                     next = NEXTOPER(NEXTOPER(next));
7756             }
7757             break;
7758
7759         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7760             logical = scan->flags;
7761             break;
7762
7763 /*******************************************************************
7764
7765 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7766 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7767 STAR/PLUS/CURLY/CURLYN are used instead.)
7768
7769 A*B is compiled as <CURLYX><A><WHILEM><B>
7770
7771 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7772 state, which contains the current count, initialised to -1. It also sets
7773 cur_curlyx to point to this state, with any previous value saved in the
7774 state block.
7775
7776 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7777 since the pattern may possibly match zero times (i.e. it's a while {} loop
7778 rather than a do {} while loop).
7779
7780 Each entry to WHILEM represents a successful match of A. The count in the
7781 CURLYX block is incremented, another WHILEM state is pushed, and execution
7782 passes to A or B depending on greediness and the current count.
7783
7784 For example, if matching against the string a1a2a3b (where the aN are
7785 substrings that match /A/), then the match progresses as follows: (the
7786 pushed states are interspersed with the bits of strings matched so far):
7787
7788     <CURLYX cnt=-1>
7789     <CURLYX cnt=0><WHILEM>
7790     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7791     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7792     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7793     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7794
7795 (Contrast this with something like CURLYM, which maintains only a single
7796 backtrack state:
7797
7798     <CURLYM cnt=0> a1
7799     a1 <CURLYM cnt=1> a2
7800     a1 a2 <CURLYM cnt=2> a3
7801     a1 a2 a3 <CURLYM cnt=3> b
7802 )
7803
7804 Each WHILEM state block marks a point to backtrack to upon partial failure
7805 of A or B, and also contains some minor state data related to that
7806 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7807 overall state, such as the count, and pointers to the A and B ops.
7808
7809 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7810 must always point to the *current* CURLYX block, the rules are:
7811
7812 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7813 and set cur_curlyx to point the new block.
7814
7815 When popping the CURLYX block after a successful or unsuccessful match,
7816 restore the previous cur_curlyx.
7817
7818 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7819 to the outer one saved in the CURLYX block.
7820
7821 When popping the WHILEM block after a successful or unsuccessful B match,
7822 restore the previous cur_curlyx.
7823
7824 Here's an example for the pattern (AI* BI)*BO
7825 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7826
7827 cur_
7828 curlyx backtrack stack
7829 ------ ---------------
7830 NULL
7831 CO     <CO prev=NULL> <WO>
7832 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7833 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7834 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7835
7836 At this point the pattern succeeds, and we work back down the stack to
7837 clean up, restoring as we go:
7838
7839 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7840 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7841 CO     <CO prev=NULL> <WO>
7842 NULL
7843
7844 *******************************************************************/
7845
7846 #define ST st->u.curlyx
7847
7848         case CURLYX:    /* start of /A*B/  (for complex A) */
7849         {
7850             /* No need to save/restore up to this paren */
7851             I32 parenfloor = scan->flags;
7852
7853             assert(next); /* keep Coverity happy */
7854             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7855                 next += ARG(next);
7856
7857             /* XXXX Probably it is better to teach regpush to support
7858                parenfloor > maxopenparen ... */
7859             if (parenfloor > (I32)rex->lastparen)
7860                 parenfloor = rex->lastparen; /* Pessimization... */
7861
7862             ST.prev_curlyx= cur_curlyx;
7863             cur_curlyx = st;
7864             ST.cp = PL_savestack_ix;
7865
7866             /* these fields contain the state of the current curly.
7867              * they are accessed by subsequent WHILEMs */
7868             ST.parenfloor = parenfloor;
7869             ST.me = scan;
7870             ST.B = next;
7871             ST.minmod = minmod;
7872             minmod = 0;
7873             ST.count = -1;      /* this will be updated by WHILEM */
7874             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7875
7876             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
7877                                 script_run_begin);
7878             NOT_REACHED; /* NOTREACHED */
7879         }
7880
7881         case CURLYX_end: /* just finished matching all of A*B */
7882             cur_curlyx = ST.prev_curlyx;
7883             sayYES;
7884             NOT_REACHED; /* NOTREACHED */
7885
7886         case CURLYX_end_fail: /* just failed to match all of A*B */
7887             regcpblow(ST.cp);
7888             cur_curlyx = ST.prev_curlyx;
7889             sayNO;
7890             NOT_REACHED; /* NOTREACHED */
7891
7892
7893 #undef ST
7894 #define ST st->u.whilem
7895
7896         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7897         {
7898             /* see the discussion above about CURLYX/WHILEM */
7899             I32 n;
7900             int min, max;
7901             regnode *A;
7902
7903             assert(cur_curlyx); /* keep Coverity happy */
7904
7905             min = ARG1(cur_curlyx->u.curlyx.me);
7906             max = ARG2(cur_curlyx->u.curlyx.me);
7907             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7908             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7909             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7910             ST.cache_offset = 0;
7911             ST.cache_mask = 0;
7912
7913
7914             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7915                   depth, (long)n, min, max)
7916             );
7917
7918             /* First just match a string of min A's. */
7919
7920             if (n < min) {
7921                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7922                 cur_curlyx->u.curlyx.lastloc = locinput;
7923                 REGCP_SET(ST.lastcp);
7924
7925                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
7926                                 script_run_begin);
7927                 NOT_REACHED; /* NOTREACHED */
7928             }
7929
7930             /* If degenerate A matches "", assume A done. */
7931
7932             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7933                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7934                    depth)
7935                 );
7936                 goto do_whilem_B_max;
7937             }
7938
7939             /* super-linear cache processing.
7940              *
7941              * The idea here is that for certain types of CURLYX/WHILEM -
7942              * principally those whose upper bound is infinity (and
7943              * excluding regexes that have things like \1 and other very
7944              * non-regular expresssiony things), then if a pattern like
7945              * /....A*.../ fails and we backtrack to the WHILEM, then we
7946              * make a note that this particular WHILEM op was at string
7947              * position 47 (say) when the rest of pattern failed. Then, if
7948              * we ever find ourselves back at that WHILEM, and at string
7949              * position 47 again, we can just fail immediately rather than
7950              * running the rest of the pattern again.
7951              *
7952              * This is very handy when patterns start to go
7953              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7954              * with a combinatorial explosion of backtracking.
7955              *
7956              * The cache is implemented as a bit array, with one bit per
7957              * string byte position per WHILEM op (up to 16) - so its
7958              * between 0.25 and 2x the string size.
7959              *
7960              * To avoid allocating a poscache buffer every time, we do an
7961              * initially countdown; only after we have  executed a WHILEM
7962              * op (string-length x #WHILEMs) times do we allocate the
7963              * cache.
7964              *
7965              * The top 4 bits of scan->flags byte say how many different
7966              * relevant CURLLYX/WHILEM op pairs there are, while the
7967              * bottom 4-bits is the identifying index number of this
7968              * WHILEM.
7969              */
7970
7971             if (scan->flags) {
7972
7973                 if (!reginfo->poscache_maxiter) {
7974                     /* start the countdown: Postpone detection until we
7975                      * know the match is not *that* much linear. */
7976                     reginfo->poscache_maxiter
7977                         =    (reginfo->strend - reginfo->strbeg + 1)
7978                            * (scan->flags>>4);
7979                     /* possible overflow for long strings and many CURLYX's */
7980                     if (reginfo->poscache_maxiter < 0)
7981                         reginfo->poscache_maxiter = I32_MAX;
7982                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7983                 }
7984
7985                 if (reginfo->poscache_iter-- == 0) {
7986                     /* initialise cache */
7987                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7988                     regmatch_info_aux *const aux = reginfo->info_aux;
7989                     if (aux->poscache) {
7990                         if ((SSize_t)reginfo->poscache_size < size) {
7991                             Renew(aux->poscache, size, char);
7992                             reginfo->poscache_size = size;
7993                         }
7994                         Zero(aux->poscache, size, char);
7995                     }
7996                     else {
7997                         reginfo->poscache_size = size;
7998                         Newxz(aux->poscache, size, char);
7999                     }
8000                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8001       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8002                               PL_colors[4], PL_colors[5])
8003                     );
8004                 }
8005
8006                 if (reginfo->poscache_iter < 0) {
8007                     /* have we already failed at this position? */
8008                     SSize_t offset, mask;
8009
8010                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8011                     offset  = (scan->flags & 0xf) - 1
8012                                 +   (locinput - reginfo->strbeg)
8013                                   * (scan->flags>>4);
8014                     mask    = 1 << (offset % 8);
8015                     offset /= 8;
8016                     if (reginfo->info_aux->poscache[offset] & mask) {
8017                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8018                             depth)
8019                         );
8020                         cur_curlyx->u.curlyx.count--;
8021                         sayNO; /* cache records failure */
8022                     }
8023                     ST.cache_offset = offset;
8024                     ST.cache_mask   = mask;
8025                 }
8026             }
8027
8028             /* Prefer B over A for minimal matching. */
8029
8030             if (cur_curlyx->u.curlyx.minmod) {
8031                 ST.save_curlyx = cur_curlyx;
8032                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8033                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8034                                     locinput, loceol, script_run_begin);
8035                 NOT_REACHED; /* NOTREACHED */
8036             }
8037
8038             /* Prefer A over B for maximal matching. */
8039
8040             if (n < max) { /* More greed allowed? */
8041                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8042                             maxopenparen);
8043                 cur_curlyx->u.curlyx.lastloc = locinput;
8044                 REGCP_SET(ST.lastcp);
8045                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8046                                 script_run_begin);
8047                 NOT_REACHED; /* NOTREACHED */
8048             }
8049             goto do_whilem_B_max;
8050         }
8051         NOT_REACHED; /* NOTREACHED */
8052
8053         case WHILEM_B_min: /* just matched B in a minimal match */
8054         case WHILEM_B_max: /* just matched B in a maximal match */
8055             cur_curlyx = ST.save_curlyx;
8056             sayYES;
8057             NOT_REACHED; /* NOTREACHED */
8058
8059         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8060             cur_curlyx = ST.save_curlyx;
8061             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8062             cur_curlyx->u.curlyx.count--;
8063             CACHEsayNO;
8064             NOT_REACHED; /* NOTREACHED */
8065
8066         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8067             /* FALLTHROUGH */
8068         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8069             REGCP_UNWIND(ST.lastcp);
8070             regcppop(rex, &maxopenparen);
8071             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8072             cur_curlyx->u.curlyx.count--;
8073             CACHEsayNO;
8074             NOT_REACHED; /* NOTREACHED */
8075
8076         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8077             REGCP_UNWIND(ST.lastcp);
8078             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8079             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8080                 depth)
8081             );
8082           do_whilem_B_max:
8083             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8084                 && ckWARN(WARN_REGEXP)
8085                 && !reginfo->warned)
8086             {
8087                 reginfo->warned = TRUE;
8088                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8089                      "Complex regular subexpression recursion limit (%d) "
8090                      "exceeded",
8091                      REG_INFTY - 1);
8092             }
8093
8094             /* now try B */
8095             ST.save_curlyx = cur_curlyx;
8096             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8097             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8098                                 locinput, loceol, script_run_begin);
8099             NOT_REACHED; /* NOTREACHED */
8100
8101         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8102             cur_curlyx = ST.save_curlyx;
8103
8104             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8105                 /* Maximum greed exceeded */
8106                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8107                     && ckWARN(WARN_REGEXP)
8108                     && !reginfo->warned)
8109                 {
8110                     reginfo->warned     = TRUE;
8111                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8112                         "Complex regular subexpression recursion "
8113                         "limit (%d) exceeded",
8114                         REG_INFTY - 1);
8115                 }
8116                 cur_curlyx->u.curlyx.count--;
8117                 CACHEsayNO;
8118             }
8119
8120             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8121             );
8122             /* Try grabbing another A and see if it helps. */
8123             cur_curlyx->u.curlyx.lastloc = locinput;
8124             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8125                             maxopenparen);
8126             REGCP_SET(ST.lastcp);
8127             PUSH_STATE_GOTO(WHILEM_A_min,
8128                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8129                 locinput, loceol, script_run_begin);
8130             NOT_REACHED; /* NOTREACHED */
8131
8132 #undef  ST
8133 #define ST st->u.branch
8134
8135         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8136             next = scan + ARG(scan);
8137             if (next == scan)
8138                 next = NULL;
8139             scan = NEXTOPER(scan);
8140             /* FALLTHROUGH */
8141
8142         case BRANCH:        /*  /(...|A|...)/ */
8143             scan = NEXTOPER(scan); /* scan now points to inner node */
8144             ST.lastparen = rex->lastparen;
8145             ST.lastcloseparen = rex->lastcloseparen;
8146             ST.next_branch = next;
8147             REGCP_SET(ST.cp);
8148
8149             /* Now go into the branch */
8150             if (has_cutgroup) {
8151                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8152                                     script_run_begin);
8153             } else {
8154                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8155                                 script_run_begin);
8156             }
8157             NOT_REACHED; /* NOTREACHED */
8158
8159         case CUTGROUP:  /*  /(*THEN)/  */
8160             sv_yes_mark = st->u.mark.mark_name = scan->flags
8161                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8162                 : NULL;
8163             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8164                             script_run_begin);
8165             NOT_REACHED; /* NOTREACHED */
8166
8167         case CUTGROUP_next_fail:
8168             do_cutgroup = 1;
8169             no_final = 1;
8170             if (st->u.mark.mark_name)
8171                 sv_commit = st->u.mark.mark_name;
8172             sayNO;
8173             NOT_REACHED; /* NOTREACHED */
8174
8175         case BRANCH_next:
8176             sayYES;
8177             NOT_REACHED; /* NOTREACHED */
8178
8179         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8180             if (do_cutgroup) {
8181                 do_cutgroup = 0;
8182                 no_final = 0;
8183             }
8184             REGCP_UNWIND(ST.cp);
8185             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8186             scan = ST.next_branch;
8187             /* no more branches? */
8188             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8189                 DEBUG_EXECUTE_r({
8190                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8191                         depth,
8192                         PL_colors[4],
8193                         PL_colors[5] );
8194                 });
8195                 sayNO_SILENT;
8196             }
8197             continue; /* execute next BRANCH[J] op */
8198             /* NOTREACHED */
8199
8200         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8201             minmod = 1;
8202             break;
8203
8204 #undef  ST
8205 #define ST st->u.curlym
8206
8207         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8208
8209             /* This is an optimisation of CURLYX that enables us to push
8210              * only a single backtracking state, no matter how many matches
8211              * there are in {m,n}. It relies on the pattern being constant
8212              * length, with no parens to influence future backrefs
8213              */
8214
8215             ST.me = scan;
8216             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8217
8218             ST.lastparen      = rex->lastparen;
8219             ST.lastcloseparen = rex->lastcloseparen;
8220
8221             /* if paren positive, emulate an OPEN/CLOSE around A */
8222             if (ST.me->flags) {
8223                 U32 paren = ST.me->flags;
8224                 if (paren > maxopenparen)
8225                     maxopenparen = paren;
8226                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8227             }
8228             ST.A = scan;
8229             ST.B = next;
8230             ST.alen = 0;
8231             ST.count = 0;
8232             ST.minmod = minmod;
8233             minmod = 0;
8234             ST.c1 = CHRTEST_UNINIT;
8235             REGCP_SET(ST.cp);
8236
8237             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8238                 goto curlym_do_B;
8239
8240           curlym_do_A: /* execute the A in /A{m,n}B/  */
8241             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8242                                 script_run_begin);
8243             NOT_REACHED; /* NOTREACHED */
8244
8245         case CURLYM_A: /* we've just matched an A */
8246             ST.count++;
8247             /* after first match, determine A's length: u.curlym.alen */
8248             if (ST.count == 1) {
8249                 if (reginfo->is_utf8_target) {
8250                     char *s = st->locinput;
8251                     while (s < locinput) {
8252                         ST.alen++;
8253                         s += UTF8SKIP(s);
8254                     }
8255                 }
8256                 else {
8257                     ST.alen = locinput - st->locinput;
8258                 }
8259                 if (ST.alen == 0)
8260                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8261             }
8262             DEBUG_EXECUTE_r(
8263                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8264                           depth, (IV) ST.count, (IV)ST.alen)
8265             );
8266
8267             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8268                 goto fake_end;
8269
8270             {
8271                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8272                 if ( max == REG_INFTY || ST.count < max )
8273                     goto curlym_do_A; /* try to match another A */
8274             }
8275             goto curlym_do_B; /* try to match B */
8276
8277         case CURLYM_A_fail: /* just failed to match an A */
8278             REGCP_UNWIND(ST.cp);
8279
8280
8281             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8282                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8283                 sayNO;
8284
8285           curlym_do_B: /* execute the B in /A{m,n}B/  */
8286             if (ST.c1 == CHRTEST_UNINIT) {
8287                 /* calculate c1 and c2 for possible match of 1st char
8288                  * following curly */
8289                 ST.c1 = ST.c2 = CHRTEST_VOID;
8290                 assert(ST.B);
8291                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8292                     regnode *text_node = ST.B;
8293                     if (! HAS_TEXT(text_node))
8294                         FIND_NEXT_IMPT(text_node);
8295                     if (PL_regkind[OP(text_node)] == EXACT) {
8296                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8297                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8298                            reginfo))
8299                         {
8300                             sayNO;
8301                         }
8302                     }
8303                 }
8304             }
8305
8306             DEBUG_EXECUTE_r(
8307                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8308                     depth, (IV)ST.count)
8309                 );
8310             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8311                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8312
8313                            /* (We can use memEQ and memNE in this file without
8314                             * having to worry about one being shorter than the
8315                             * other, since the first byte of each gives the
8316                             * length of the character) */
8317                     if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
8318                                                               reginfo->strend))
8319                         && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
8320                                                              reginfo->strend)))
8321                     {
8322                         /* simulate B failing */
8323                         DEBUG_OPTIMISE_r(
8324                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8325                                 depth,
8326                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8327                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8328                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8329                         );
8330                         state_num = CURLYM_B_fail;
8331                         goto reenter_switch;
8332                     }
8333                 }
8334                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8335                     /* simulate B failing */
8336                     DEBUG_OPTIMISE_r(
8337                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8338                             depth,
8339                             (int) nextchr, ST.c1, ST.c2)
8340                     );
8341                     state_num = CURLYM_B_fail;
8342                     goto reenter_switch;
8343                 }
8344             }
8345
8346             if (ST.me->flags) {
8347                 /* emulate CLOSE: mark current A as captured */
8348                 U32 paren = (U32)ST.me->flags;
8349                 if (ST.count) {
8350                     CLOSE_CAPTURE(paren,
8351                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
8352                         locinput - reginfo->strbeg);
8353                 }
8354                 else
8355                     rex->offs[paren].end = -1;
8356
8357                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8358                 {
8359                     if (ST.count)
8360                         goto fake_end;
8361                     else
8362                         sayNO;
8363                 }
8364             }
8365
8366             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
8367                             script_run_begin);
8368             NOT_REACHED; /* NOTREACHED */
8369
8370         case CURLYM_B_fail: /* just failed to match a B */
8371             REGCP_UNWIND(ST.cp);
8372             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8373             if (ST.minmod) {
8374                 I32 max = ARG2(ST.me);
8375                 if (max != REG_INFTY && ST.count == max)
8376                     sayNO;
8377                 goto curlym_do_A; /* try to match a further A */
8378             }
8379             /* backtrack one A */
8380             if (ST.count == ARG1(ST.me) /* min */)
8381                 sayNO;
8382             ST.count--;
8383             SET_locinput(HOPc(locinput, -ST.alen));
8384             goto curlym_do_B; /* try to match B */
8385
8386 #undef ST
8387 #define ST st->u.curly
8388
8389 #define CURLY_SETPAREN(paren, success) \
8390     if (paren) { \
8391         if (success) { \
8392             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8393                                  locinput - reginfo->strbeg); \
8394         } \
8395         else { \
8396             rex->offs[paren].end = -1; \
8397             rex->lastparen      = ST.lastparen; \
8398             rex->lastcloseparen = ST.lastcloseparen; \
8399         } \
8400     }
8401
8402         case STAR:              /*  /A*B/ where A is width 1 char */
8403             ST.paren = 0;
8404             ST.min = 0;
8405             ST.max = REG_INFTY;
8406             scan = NEXTOPER(scan);
8407             goto repeat;
8408
8409         case PLUS:              /*  /A+B/ where A is width 1 char */
8410             ST.paren = 0;
8411             ST.min = 1;
8412             ST.max = REG_INFTY;
8413             scan = NEXTOPER(scan);
8414             goto repeat;
8415
8416         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8417             ST.paren = scan->flags;     /* Which paren to set */
8418             ST.lastparen      = rex->lastparen;
8419             ST.lastcloseparen = rex->lastcloseparen;
8420             if (ST.paren > maxopenparen)
8421                 maxopenparen = ST.paren;
8422             ST.min = ARG1(scan);  /* min to match */
8423             ST.max = ARG2(scan);  /* max to match */
8424             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8425
8426             /* handle the single-char capture called as a GOSUB etc */
8427             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8428             {
8429                 char *li = locinput;
8430                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
8431                     sayNO;
8432                 SET_locinput(li);
8433                 goto fake_end;
8434             }
8435
8436             goto repeat;
8437
8438         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8439             ST.paren = 0;
8440             ST.min = ARG1(scan);  /* min to match */
8441             ST.max = ARG2(scan);  /* max to match */
8442             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8443           repeat:
8444             /*
8445             * Lookahead to avoid useless match attempts
8446             * when we know what character comes next.
8447             *
8448             * Used to only do .*x and .*?x, but now it allows
8449             * for )'s, ('s and (?{ ... })'s to be in the way
8450             * of the quantifier and the EXACT-like node.  -- japhy
8451             */
8452
8453             assert(ST.min <= ST.max);
8454             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8455                 ST.c1 = ST.c2 = CHRTEST_VOID;
8456             }
8457             else {
8458                 regnode *text_node = next;
8459
8460                 if (! HAS_TEXT(text_node))
8461                     FIND_NEXT_IMPT(text_node);
8462
8463                 if (! HAS_TEXT(text_node))
8464                     ST.c1 = ST.c2 = CHRTEST_VOID;
8465                 else {
8466                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8467                         ST.c1 = ST.c2 = CHRTEST_VOID;
8468                     }
8469                     else {
8470                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8471                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8472                            reginfo))
8473                         {
8474                             sayNO;
8475                         }
8476                     }
8477                 }
8478             }
8479
8480             ST.A = scan;
8481             ST.B = next;
8482             if (minmod) {
8483                 char *li = locinput;
8484                 minmod = 0;
8485                 if (ST.min &&
8486                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
8487                             < ST.min)
8488                     sayNO;
8489                 SET_locinput(li);
8490                 ST.count = ST.min;
8491                 REGCP_SET(ST.cp);
8492                 if (ST.c1 == CHRTEST_VOID)
8493                     goto curly_try_B_min;
8494
8495                 ST.oldloc = locinput;
8496
8497                 /* set ST.maxpos to the furthest point along the
8498                  * string that could possibly match */
8499                 if  (ST.max == REG_INFTY) {
8500                     ST.maxpos = loceol - 1;
8501                     if (utf8_target)
8502                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8503                             ST.maxpos--;
8504                 }
8505                 else if (utf8_target) {
8506                     int m = ST.max - ST.min;
8507                     for (ST.maxpos = locinput;
8508                          m >0 && ST.maxpos <  loceol; m--)
8509                         ST.maxpos += UTF8SKIP(ST.maxpos);
8510                 }
8511                 else {
8512                     ST.maxpos = locinput + ST.max - ST.min;
8513                     if (ST.maxpos >=  loceol)
8514                         ST.maxpos =  loceol - 1;
8515                 }
8516                 goto curly_try_B_min_known;
8517
8518             }
8519             else {
8520                 /* avoid taking address of locinput, so it can remain
8521                  * a register var */
8522                 char *li = locinput;
8523                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
8524                 if (ST.count < ST.min)
8525                     sayNO;
8526                 SET_locinput(li);
8527                 if ((ST.count > ST.min)
8528                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8529                 {
8530                     /* A{m,n} must come at the end of the string, there's
8531                      * no point in backing off ... */
8532                     ST.min = ST.count;
8533                     /* ...except that $ and \Z can match before *and* after
8534                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8535                        We may back off by one in this case. */
8536                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8537                         ST.min--;
8538                 }
8539                 REGCP_SET(ST.cp);
8540                 goto curly_try_B_max;
8541             }
8542             NOT_REACHED; /* NOTREACHED */
8543
8544         case CURLY_B_min_fail:
8545             /* failed to find B in a non-greedy match.
8546              * Handles both cases where c1,c2 valid or not */
8547
8548             REGCP_UNWIND(ST.cp);
8549             if (ST.paren) {
8550                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8551             }
8552
8553             if (ST.c1 == CHRTEST_VOID) {
8554                 /* failed -- move forward one */
8555                 char *li = locinput;
8556                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
8557                     sayNO;
8558                 }
8559                 locinput = li;
8560                 ST.count++;
8561                 if (!(   ST.count <= ST.max
8562                         /* count overflow ? */
8563                      || (ST.max == REG_INFTY && ST.count > 0))
8564                 )
8565                     sayNO;
8566             }
8567             else {
8568                 int n;
8569                 /* Couldn't or didn't -- move forward. */
8570                 ST.oldloc = locinput;
8571                 if (utf8_target)
8572                     locinput += UTF8SKIP(locinput);
8573                 else
8574                     locinput++;
8575                 ST.count++;
8576
8577               curly_try_B_min_known:
8578                 /* find the next place where 'B' could work, then call B */
8579                 if (utf8_target) {
8580                     n = (ST.oldloc == locinput) ? 0 : 1;
8581                     if (ST.c1 == ST.c2) {
8582                         /* set n to utf8_distance(oldloc, locinput) */
8583                         while (    locinput <= ST.maxpos
8584                                &&  locinput < loceol
8585                                &&  memNE(locinput, ST.c1_utf8,
8586                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8587                         {
8588                             locinput += UTF8_SAFE_SKIP(locinput,
8589                                                        reginfo->strend);
8590                             n++;
8591                         }
8592                     }
8593                     else {
8594                         /* set n to utf8_distance(oldloc, locinput) */
8595                         while (   locinput <= ST.maxpos
8596                                && locinput < loceol
8597                                && memNE(locinput, ST.c1_utf8,
8598                                      UTF8_SAFE_SKIP(locinput, reginfo->strend))
8599                                && memNE(locinput, ST.c2_utf8,
8600                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8601                         {
8602                             locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
8603                             n++;
8604                         }
8605                     }
8606                 }
8607                 else {  /* Not utf8_target */
8608                     if (ST.c1 == ST.c2) {
8609                         locinput = (char *) memchr(locinput,
8610                                                    ST.c1,
8611                                                    ST.maxpos + 1 - locinput);
8612                         if (! locinput) {
8613                             locinput = ST.maxpos + 1;
8614                         }
8615                     }
8616                     else {
8617                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8618
8619                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8620                             while (   locinput <= ST.maxpos
8621                                    && UCHARAT(locinput) != ST.c1
8622                                    && UCHARAT(locinput) != ST.c2)
8623                             {
8624                                 locinput++;
8625                             }
8626                         }
8627                         else {
8628                             /* If c1 and c2 only differ by a single bit, we can
8629                              * avoid a conditional each time through the loop,
8630                              * at the expense of a little preliminary setup and
8631                              * an extra mask each iteration.  By masking out
8632                              * that bit, we match exactly two characters, c1
8633                              * and c2, and so we don't have to test for both.
8634                              * On both ASCII and EBCDIC platforms, most of the
8635                              * ASCII-range and Latin1-range folded equivalents
8636                              * differ only in a single bit, so this is actually
8637                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8638                              * 0x61). */
8639                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8640                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8641                             while (   locinput <= ST.maxpos
8642                                    && (UCHARAT(locinput) & c1_c2_mask)
8643                                                                 != c1_masked)
8644                             {
8645                                 locinput++;
8646                             }
8647                         }
8648                     }
8649                     n = locinput - ST.oldloc;
8650                 }
8651                 if (locinput > ST.maxpos)
8652                     sayNO;
8653                 if (n) {
8654                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8655                      * at b; check that everything between oldloc and
8656                      * locinput matches */
8657                     char *li = ST.oldloc;
8658                     ST.count += n;
8659                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
8660                         sayNO;
8661                     assert(n == REG_INFTY || locinput == li);
8662                 }
8663             }
8664
8665           curly_try_B_min:
8666             CURLY_SETPAREN(ST.paren, ST.count);
8667             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
8668                             script_run_begin);
8669             NOT_REACHED; /* NOTREACHED */
8670
8671
8672           curly_try_B_max:
8673             /* a successful greedy match: now try to match B */
8674             {
8675                 bool could_match = locinput <  loceol;
8676
8677                 /* If it could work, try it. */
8678                 if (ST.c1 != CHRTEST_VOID && could_match) {
8679                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8680                     {
8681                         could_match =  memEQ(locinput, ST.c1_utf8,
8682                                              UTF8_SAFE_SKIP(locinput,
8683                                                             reginfo->strend))
8684                                     || memEQ(locinput, ST.c2_utf8,
8685                                              UTF8_SAFE_SKIP(locinput,
8686                                                             reginfo->strend));
8687                     }
8688                     else {
8689                         could_match =   UCHARAT(locinput) == ST.c1
8690                                      || UCHARAT(locinput) == ST.c2;
8691                     }
8692                 }
8693                 if (ST.c1 == CHRTEST_VOID || could_match) {
8694                     CURLY_SETPAREN(ST.paren, ST.count);
8695                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
8696                                     script_run_begin);
8697                     NOT_REACHED; /* NOTREACHED */
8698                 }
8699             }
8700             /* FALLTHROUGH */
8701
8702         case CURLY_B_max_fail:
8703             /* failed to find B in a greedy match */
8704
8705             REGCP_UNWIND(ST.cp);
8706             if (ST.paren) {
8707                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8708             }
8709             /*  back up. */
8710             if (--ST.count < ST.min)
8711                 sayNO;
8712             locinput = HOPc(locinput, -1);
8713             goto curly_try_B_max;
8714
8715 #undef ST
8716
8717         case END: /*  last op of main pattern  */
8718           fake_end:
8719             if (cur_eval) {
8720                 /* we've just finished A in /(??{A})B/; now continue with B */
8721                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8722                 st->u.eval.prev_rex = rex_sv;           /* inner */
8723
8724                 /* Save *all* the positions. */
8725                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8726                 rex_sv = CUR_EVAL.prev_rex;
8727                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8728                 SET_reg_curpm(rex_sv);
8729                 rex = ReANY(rex_sv);
8730                 rexi = RXi_GET(rex);
8731
8732                 st->u.eval.prev_curlyx = cur_curlyx;
8733                 cur_curlyx = CUR_EVAL.prev_curlyx;
8734
8735                 REGCP_SET(st->u.eval.lastcp);
8736
8737                 /* Restore parens of the outer rex without popping the
8738                  * savestack */
8739                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8740
8741                 st->u.eval.prev_eval = cur_eval;
8742                 cur_eval = CUR_EVAL.prev_eval;
8743                 DEBUG_EXECUTE_r(
8744                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8745                                       depth, cur_eval););
8746                 if ( nochange_depth )
8747                     nochange_depth--;
8748
8749                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8750
8751                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
8752                                     st->u.eval.prev_eval->u.eval.B,
8753                                     locinput, loceol, script_run_begin);
8754             }
8755
8756             if (locinput < reginfo->till) {
8757                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8758                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8759                                       PL_colors[4],
8760                                       (long)(locinput - startpos),
8761                                       (long)(reginfo->till - startpos),
8762                                       PL_colors[5]));
8763
8764                 sayNO_SILENT;           /* Cannot match: too short. */
8765             }
8766             sayYES;                     /* Success! */
8767
8768         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8769             DEBUG_EXECUTE_r(
8770             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8771                 depth, PL_colors[4], PL_colors[5]));
8772             sayYES;                     /* Success! */
8773
8774 #undef  ST
8775 #define ST st->u.ifmatch
8776
8777         case SUSPEND:   /* (?>A) */
8778             ST.wanted = 1;
8779             ST.start = locinput;
8780             ST.end = loceol;
8781             ST.count = 1;
8782             goto do_ifmatch;
8783
8784         case UNLESSM:   /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
8785             ST.wanted = 0;
8786             goto ifmatch_trivial_fail_test;
8787
8788         case IFMATCH:   /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
8789             ST.wanted = 1;
8790           ifmatch_trivial_fail_test:
8791             ST.count = scan->next_off + 1; /* next_off repurposed to be
8792                                               lookbehind count, requires
8793                                               non-zero flags */
8794             if (! scan->flags) {    /* 'flags' zero means lookahed */
8795
8796                 /* Lookahead starts here and ends at the normal place */
8797                 ST.start = locinput;
8798                 ST.end = loceol;
8799             }
8800             else {
8801                 PERL_UINT_FAST8_T back_count = scan->flags;
8802                 char * s;
8803
8804                 /* Lookbehind can look beyond the current position */
8805                 ST.end = loceol;
8806
8807                 /* ... and starts at the first place in the input that is in
8808                  * the range of the possible start positions */
8809                 for (; ST.count > 0; ST.count--, back_count--) {
8810                     s = HOPBACKc(locinput, back_count);
8811                     if (s) {
8812                         ST.start = s;
8813                         goto do_ifmatch;
8814                     }
8815                 }
8816
8817                 /* If the lookbehind doesn't start in the actual string, is a
8818                  * trivial match failure */
8819                 if (logical) {
8820                     logical = 0;
8821                     sw = 1 - cBOOL(ST.wanted);
8822                 }
8823                 else if (ST.wanted)
8824                     sayNO;
8825
8826                 /* Here, we didn't want it to match, so is actually success */
8827                 next = scan + ARG(scan);
8828                 if (next == scan)
8829                     next = NULL;
8830                 break;
8831             }
8832
8833           do_ifmatch:
8834             ST.me = scan;
8835             ST.logical = logical;
8836             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8837
8838             /* execute body of (?...A) */
8839             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
8840                                 ST.end, script_run_begin);
8841             NOT_REACHED; /* NOTREACHED */
8842
8843         {
8844             bool matched;
8845
8846         case IFMATCH_A_fail: /* body of (?...A) failed */
8847             if (! ST.logical && ST.count > 1) {
8848
8849                 /* It isn't a real failure until we've tried all starting
8850                  * positions.  Move to the next starting position and retry */
8851                 ST.count--;
8852                 ST.start = HOPc(ST.start, 1);
8853                 scan = ST.me;
8854                 logical = ST.logical;
8855                 goto do_ifmatch;
8856             }
8857
8858             /* Here, all starting positions have been tried. */
8859             matched = FALSE;
8860             goto ifmatch_done;
8861
8862         case IFMATCH_A: /* body of (?...A) succeeded */
8863             matched = TRUE;
8864           ifmatch_done:
8865             sw = matched == ST.wanted;
8866             if (! ST.logical && !sw) {
8867                 sayNO;
8868             }
8869
8870             if (OP(ST.me) != SUSPEND) {
8871                 /* restore old position except for (?>...) */
8872                 locinput = st->locinput;
8873                 loceol = st->loceol;
8874                 script_run_begin = st->sr0;
8875             }
8876             scan = ST.me + ARG(ST.me);
8877             if (scan == ST.me)
8878                 scan = NULL;
8879             continue; /* execute B */
8880         }
8881
8882 #undef ST
8883
8884         case LONGJMP: /*  alternative with many branches compiles to
8885                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8886             next = scan + ARG(scan);
8887             if (next == scan)
8888                 next = NULL;
8889             break;
8890
8891         case COMMIT:  /*  (*COMMIT)  */
8892             reginfo->cutpoint = loceol;
8893             /* FALLTHROUGH */
8894
8895         case PRUNE:   /*  (*PRUNE)   */
8896             if (scan->flags)
8897                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8898             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
8899                             script_run_begin);
8900             NOT_REACHED; /* NOTREACHED */
8901
8902         case COMMIT_next_fail:
8903             no_final = 1;
8904             /* FALLTHROUGH */
8905             sayNO;
8906             NOT_REACHED; /* NOTREACHED */
8907
8908         case OPFAIL:   /* (*FAIL)  */
8909             if (scan->flags)
8910                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8911             if (logical) {
8912                 /* deal with (?(?!)X|Y) properly,
8913                  * make sure we trigger the no branch
8914                  * of the trailing IFTHEN structure*/
8915                 sw= 0;
8916                 break;
8917             } else {
8918                 sayNO;
8919             }
8920             NOT_REACHED; /* NOTREACHED */
8921
8922 #define ST st->u.mark
8923         case MARKPOINT: /*  (*MARK:foo)  */
8924             ST.prev_mark = mark_state;
8925             ST.mark_name = sv_commit = sv_yes_mark
8926                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8927             mark_state = st;
8928             ST.mark_loc = locinput;
8929             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
8930                                 script_run_begin);
8931             NOT_REACHED; /* NOTREACHED */
8932
8933         case MARKPOINT_next:
8934             mark_state = ST.prev_mark;
8935             sayYES;
8936             NOT_REACHED; /* NOTREACHED */
8937
8938         case MARKPOINT_next_fail:
8939             if (popmark && sv_eq(ST.mark_name,popmark))
8940             {
8941                 if (ST.mark_loc > startpoint)
8942                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8943                 popmark = NULL; /* we found our mark */
8944                 sv_commit = ST.mark_name;
8945
8946                 DEBUG_EXECUTE_r({
8947                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8948                             depth,
8949                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8950                 });
8951             }
8952             mark_state = ST.prev_mark;
8953             sv_yes_mark = mark_state ?
8954                 mark_state->u.mark.mark_name : NULL;
8955             sayNO;
8956             NOT_REACHED; /* NOTREACHED */
8957
8958         case SKIP:  /*  (*SKIP)  */
8959             if (!scan->flags) {
8960                 /* (*SKIP) : if we fail we cut here*/
8961                 ST.mark_name = NULL;
8962                 ST.mark_loc = locinput;
8963                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
8964                                 script_run_begin);
8965             } else {
8966                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8967                    otherwise do nothing.  Meaning we need to scan
8968                  */
8969                 regmatch_state *cur = mark_state;
8970                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8971
8972                 while (cur) {
8973                     if ( sv_eq( cur->u.mark.mark_name,
8974                                 find ) )
8975                     {
8976                         ST.mark_name = find;
8977                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
8978                                          script_run_begin);
8979                     }
8980                     cur = cur->u.mark.prev_mark;
8981                 }
8982             }
8983             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8984             break;
8985
8986         case SKIP_next_fail:
8987             if (ST.mark_name) {
8988                 /* (*CUT:NAME) - Set up to search for the name as we
8989                    collapse the stack*/
8990                 popmark = ST.mark_name;
8991             } else {
8992                 /* (*CUT) - No name, we cut here.*/
8993                 if (ST.mark_loc > startpoint)
8994                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8995                 /* but we set sv_commit to latest mark_name if there
8996                    is one so they can test to see how things lead to this
8997                    cut */
8998                 if (mark_state)
8999                     sv_commit=mark_state->u.mark.mark_name;
9000             }
9001             no_final = 1;
9002             sayNO;
9003             NOT_REACHED; /* NOTREACHED */
9004 #undef ST
9005
9006         case LNBREAK: /* \R */
9007             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9008                 locinput += n;
9009             } else
9010                 sayNO;
9011             break;
9012
9013         default:
9014             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9015                           PTR2UV(scan), OP(scan));
9016             Perl_croak(aTHX_ "regexp memory corruption");
9017
9018         /* this is a point to jump to in order to increment
9019          * locinput by one character */
9020           increment_locinput:
9021             assert(!NEXTCHR_IS_EOS);
9022             if (utf8_target) {
9023                 locinput += PL_utf8skip[nextchr];
9024                 /* locinput is allowed to go 1 char off the end (signifying
9025                  * EOS), but not 2+ */
9026                 if (locinput >  loceol)
9027                     sayNO;
9028             }
9029             else
9030                 locinput++;
9031             break;
9032
9033         } /* end switch */
9034
9035         /* switch break jumps here */
9036         scan = next; /* prepare to execute the next op and ... */
9037         continue;    /* ... jump back to the top, reusing st */
9038         /* NOTREACHED */
9039
9040       push_yes_state:
9041         /* push a state that backtracks on success */
9042         st->u.yes.prev_yes_state = yes_state;
9043         yes_state = st;
9044         /* FALLTHROUGH */
9045       push_state:
9046         /* push a new regex state, then continue at scan  */
9047         {
9048             regmatch_state *newst;
9049
9050             DEBUG_STACK_r({
9051                 regmatch_state *cur = st;
9052                 regmatch_state *curyes = yes_state;
9053                 U32 i;
9054                 regmatch_slab *slab = PL_regmatch_slab;
9055                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9056                     if (cur < SLAB_FIRST(slab)) {
9057                         slab = slab->prev;
9058                         cur = SLAB_LAST(slab);
9059                     }
9060                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9061                         depth,
9062                         i ? "    " : "push",
9063                         depth - i, PL_reg_name[cur->resume_state],
9064                         (curyes == cur) ? "yes" : ""
9065                     );
9066                     if (curyes == cur)
9067                         curyes = cur->u.yes.prev_yes_state;
9068                 }
9069             } else
9070                 DEBUG_STATE_pp("push")
9071             );
9072             depth++;
9073             st->locinput = locinput;
9074             st->loceol = loceol;
9075             st->sr0 = script_run_begin;
9076             newst = st+1;
9077             if (newst >  SLAB_LAST(PL_regmatch_slab))
9078                 newst = S_push_slab(aTHX);
9079             PL_regmatch_state = newst;
9080
9081             locinput = pushinput;
9082             loceol = pusheol;
9083             script_run_begin = pushsr0;
9084             st = newst;
9085             continue;
9086             /* NOTREACHED */
9087         }
9088     }
9089 #ifdef SOLARIS_BAD_OPTIMIZER
9090 #  undef PL_charclass
9091 #endif
9092
9093     /*
9094     * We get here only if there's trouble -- normally "case END" is
9095     * the terminating point.
9096     */
9097     Perl_croak(aTHX_ "corrupted regexp pointers");
9098     NOT_REACHED; /* NOTREACHED */
9099
9100   yes:
9101     if (yes_state) {
9102         /* we have successfully completed a subexpression, but we must now
9103          * pop to the state marked by yes_state and continue from there */
9104         assert(st != yes_state);
9105 #ifdef DEBUGGING
9106         while (st != yes_state) {
9107             st--;
9108             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9109                 PL_regmatch_slab = PL_regmatch_slab->prev;
9110                 st = SLAB_LAST(PL_regmatch_slab);
9111             }
9112             DEBUG_STATE_r({
9113                 if (no_final) {
9114                     DEBUG_STATE_pp("pop (no final)");
9115                 } else {
9116                     DEBUG_STATE_pp("pop (yes)");
9117                 }
9118             });
9119             depth--;
9120         }
9121 #else
9122         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9123             || yes_state > SLAB_LAST(PL_regmatch_slab))
9124         {
9125             /* not in this slab, pop slab */
9126             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9127             PL_regmatch_slab = PL_regmatch_slab->prev;
9128             st = SLAB_LAST(PL_regmatch_slab);
9129         }
9130         depth -= (st - yes_state);
9131 #endif
9132         st = yes_state;
9133         yes_state = st->u.yes.prev_yes_state;
9134         PL_regmatch_state = st;
9135
9136         if (no_final) {
9137             locinput= st->locinput;
9138             loceol= st->loceol;
9139             script_run_begin = st->sr0;
9140         }
9141         state_num = st->resume_state + no_final;
9142         goto reenter_switch;
9143     }
9144
9145     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9146                           PL_colors[4], PL_colors[5]));
9147
9148     if (reginfo->info_aux_eval) {
9149         /* each successfully executed (?{...}) block does the equivalent of
9150          *   local $^R = do {...}
9151          * When popping the save stack, all these locals would be undone;
9152          * bypass this by setting the outermost saved $^R to the latest
9153          * value */
9154         /* I dont know if this is needed or works properly now.
9155          * see code related to PL_replgv elsewhere in this file.
9156          * Yves
9157          */
9158         if (oreplsv != GvSV(PL_replgv)) {
9159             sv_setsv(oreplsv, GvSV(PL_replgv));
9160             SvSETMAGIC(oreplsv);
9161         }
9162     }
9163     result = 1;
9164     goto final_exit;
9165
9166   no:
9167     DEBUG_EXECUTE_r(
9168         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9169             depth,
9170             PL_colors[4], PL_colors[5])
9171         );
9172
9173   no_silent:
9174     if (no_final) {
9175         if (yes_state) {
9176             goto yes;
9177         } else {
9178             goto final_exit;
9179         }
9180     }
9181     if (depth) {
9182         /* there's a previous state to backtrack to */
9183         st--;
9184         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9185             PL_regmatch_slab = PL_regmatch_slab->prev;
9186             st = SLAB_LAST(PL_regmatch_slab);
9187         }
9188         PL_regmatch_state = st;
9189         locinput= st->locinput;
9190         loceol= st->loceol;
9191         script_run_begin = st->sr0;
9192
9193         DEBUG_STATE_pp("pop");
9194         depth--;
9195         if (yes_state == st)
9196             yes_state = st->u.yes.prev_yes_state;
9197
9198         state_num = st->resume_state + 1; /* failure = success + 1 */
9199         PERL_ASYNC_CHECK();
9200         goto reenter_switch;
9201     }
9202     result = 0;
9203
9204   final_exit:
9205     if (rex->intflags & PREGf_VERBARG_SEEN) {
9206         SV *sv_err = get_sv("REGERROR", 1);
9207         SV *sv_mrk = get_sv("REGMARK", 1);
9208         if (result) {
9209             sv_commit = &PL_sv_no;
9210             if (!sv_yes_mark)
9211                 sv_yes_mark = &PL_sv_yes;
9212         } else {
9213             if (!sv_commit)
9214                 sv_commit = &PL_sv_yes;
9215             sv_yes_mark = &PL_sv_no;
9216         }
9217         assert(sv_err);
9218         assert(sv_mrk);
9219         sv_setsv(sv_err, sv_commit);
9220         sv_setsv(sv_mrk, sv_yes_mark);
9221     }
9222
9223
9224     if (last_pushed_cv) {
9225         dSP;
9226         /* see "Some notes about MULTICALL" above */
9227         POP_MULTICALL;
9228         PERL_UNUSED_VAR(SP);
9229     }
9230     else
9231         LEAVE_SCOPE(orig_savestack_ix);
9232
9233     assert(!result ||  locinput - reginfo->strbeg >= 0);
9234     return result ?  locinput - reginfo->strbeg : -1;
9235 }
9236
9237 /*
9238  - regrepeat - repeatedly match something simple, report how many
9239  *
9240  * What 'simple' means is a node which can be the operand of a quantifier like
9241  * '+', or {1,3}
9242  *
9243  * startposp - pointer to a pointer to the start position.  This is updated
9244  *             to point to the byte following the highest successful
9245  *             match.
9246  * p         - the regnode to be repeatedly matched against.
9247  * loceol    - pointer to the end position beyond which we aren't supposed to
9248  *             look.
9249  * reginfo   - struct holding match state, such as utf8_target
9250  * max       - maximum number of things to match.
9251  * depth     - (for debugging) backtracking depth.
9252  */
9253 STATIC I32
9254 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9255             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9256 {
9257     dVAR;
9258     char *scan;     /* Pointer to current position in target string */
9259     I32 c;
9260     char *this_eol = loceol;   /* potentially adjusted version. */
9261     I32 hardcount = 0;  /* How many matches so far */
9262     bool utf8_target = reginfo->is_utf8_target;
9263     unsigned int to_complement = 0;  /* Invert the result? */
9264     UV utf8_flags = 0;
9265     _char_class_number classnum;
9266
9267     PERL_ARGS_ASSERT_REGREPEAT;
9268
9269     /* This routine is structured so that we switch on the input OP.  Each OP
9270      * case: statement contains a loop to repeatedly apply the OP, advancing
9271      * the input until it fails, or reaches the end of the input, or until it
9272      * reaches the upper limit of matches. */
9273
9274     scan = *startposp;
9275     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9276                                max */
9277         max = I32_MAX;
9278     else if (! utf8_target && this_eol - scan > max)
9279         this_eol = scan + max;
9280
9281     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9282      * to the maximum of how far we should go in it (leaving it set to the real
9283      * end, if the maximum permissible would take us beyond that).  This allows
9284      * us to make the loop exit condition that we haven't gone past <this_eol> to
9285      * also mean that we haven't exceeded the max permissible count, saving a
9286      * test each time through the loops.  But it assumes that the OP matches a
9287      * single byte, which is true for most of the OPs below when applied to a
9288      * non-UTF-8 target.  Those relatively few OPs that don't have this
9289      * characteristic will have to compensate.
9290      *
9291      * There is no adjustment for UTF-8 targets, as the number of bytes per
9292      * character varies.  OPs will have to test both that the count is less
9293      * than the max permissible (using <hardcount> to keep track), and that we
9294      * are still within the bounds of the string (using <this_eol>.  A few OPs
9295      * match a single byte no matter what the encoding.  They can omit the max
9296      * test if, for the UTF-8 case, they do the adjustment that was skipped
9297      * above.
9298      *
9299      * Thus, the code above sets things up for the common case; and exceptional
9300      * cases need extra work; the common case is to make sure <scan> doesn't
9301      * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9302      * count doesn't exceed the maximum permissible */
9303
9304     switch (OP(p)) {
9305     case REG_ANY:
9306         if (utf8_target) {
9307             while (scan < this_eol && hardcount < max && *scan != '\n') {
9308                 scan += UTF8SKIP(scan);
9309                 hardcount++;
9310             }
9311         } else {
9312             scan = (char *) memchr(scan, '\n', this_eol - scan);
9313             if (! scan) {
9314                 scan = this_eol;
9315             }
9316         }
9317         break;
9318     case SANY:
9319         if (utf8_target) {
9320             while (scan < this_eol && hardcount < max) {
9321                 scan += UTF8SKIP(scan);
9322                 hardcount++;
9323             }
9324         }
9325         else
9326             scan = this_eol;
9327         break;
9328     case EXACTL:
9329         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9330         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9331             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9332         }
9333         goto do_exact;
9334
9335     case EXACT_ONLY8:
9336         if (! utf8_target) {
9337             break;
9338         }
9339         /* FALLTHROUGH */
9340     case EXACT:
9341       do_exact:
9342         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9343
9344         c = (U8)*STRING(p);
9345
9346         /* Can use a simple find if the pattern char to match on is invariant
9347          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9348          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9349          * true iff it doesn't matter if the argument is in UTF-8 or not */
9350         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9351             if (utf8_target && this_eol - scan > max) {
9352                 /* We didn't adjust <this_eol> because is UTF-8, but ok to do so,
9353                  * since here, to match at all, 1 char == 1 byte */
9354                 this_eol = scan + max;
9355             }
9356             scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9357         }
9358         else if (reginfo->is_utf8_pat) {
9359             if (utf8_target) {
9360                 STRLEN scan_char_len;
9361
9362                 /* When both target and pattern are UTF-8, we have to do
9363                  * string EQ */
9364                 while (hardcount < max
9365                        && scan < this_eol
9366                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9367                        && memEQ(scan, STRING(p), scan_char_len))
9368                 {
9369                     scan += scan_char_len;
9370                     hardcount++;
9371                 }
9372             }
9373             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9374
9375                 /* Target isn't utf8; convert the character in the UTF-8
9376                  * pattern to non-UTF8, and do a simple find */
9377                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9378                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9379             } /* else pattern char is above Latin1, can't possibly match the
9380                  non-UTF-8 target */
9381         }
9382         else {
9383
9384             /* Here, the string must be utf8; pattern isn't, and <c> is
9385              * different in utf8 than not, so can't compare them directly.
9386              * Outside the loop, find the two utf8 bytes that represent c, and
9387              * then look for those in sequence in the utf8 string */
9388             U8 high = UTF8_TWO_BYTE_HI(c);
9389             U8 low = UTF8_TWO_BYTE_LO(c);
9390
9391             while (hardcount < max
9392                     && scan + 1 < this_eol
9393                     && UCHARAT(scan) == high
9394                     && UCHARAT(scan + 1) == low)
9395             {
9396                 scan += 2;
9397                 hardcount++;
9398             }
9399         }
9400         break;
9401
9402     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9403         assert(! reginfo->is_utf8_pat);
9404         /* FALLTHROUGH */
9405     case EXACTFAA:
9406         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9407         if (reginfo->is_utf8_pat || ! utf8_target) {
9408
9409             /* The possible presence of a MICRO SIGN in the pattern forbids us
9410              * to view a non-UTF-8 pattern as folded when there is a UTF-8
9411              * target.  */
9412             utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
9413         }
9414         goto do_exactf;
9415
9416     case EXACTFL:
9417         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9418         utf8_flags = FOLDEQ_LOCALE;
9419         goto do_exactf;
9420
9421     case EXACTF:   /* This node only generated for non-utf8 patterns */
9422         assert(! reginfo->is_utf8_pat);
9423         goto do_exactf;
9424
9425     case EXACTFLU8:
9426         if (! utf8_target) {
9427             break;
9428         }
9429         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9430                                     | FOLDEQ_S2_FOLDS_SANE;
9431         goto do_exactf;
9432
9433     case EXACTFU_ONLY8:
9434         if (! utf8_target) {
9435             break;
9436         }
9437         assert(reginfo->is_utf8_pat);
9438         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9439         goto do_exactf;
9440
9441     case EXACTFU:
9442         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9443         /* FALLTHROUGH */
9444
9445     case EXACTFUP:
9446
9447       do_exactf: {
9448         int c1, c2;
9449         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9450
9451         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9452
9453         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9454                                         reginfo))
9455         {
9456             if (c1 == CHRTEST_VOID) {
9457                 /* Use full Unicode fold matching */
9458                 char *tmpeol = loceol;
9459                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9460                 while (hardcount < max
9461                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9462                                              STRING(p), NULL, pat_len,
9463                                              reginfo->is_utf8_pat, utf8_flags))
9464                 {
9465                     scan = tmpeol;
9466                     tmpeol = loceol;
9467                     hardcount++;
9468                 }
9469             }
9470             else if (utf8_target) {
9471                 if (c1 == c2) {
9472                     while (scan < this_eol
9473                            && hardcount < max
9474                            && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9475                                                                   loceol)))
9476                     {
9477                         scan += UTF8SKIP(c1_utf8);
9478                         hardcount++;
9479                     }
9480                 }
9481                 else {
9482                     while (scan < this_eol
9483                            && hardcount < max
9484                            && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9485                                                                      loceol))
9486                                || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
9487                                                                      loceol))))
9488                     {
9489                         scan += UTF8_SAFE_SKIP(scan, loceol);
9490                         hardcount++;
9491                     }
9492                 }
9493             }
9494             else if (c1 == c2) {
9495                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1);
9496             }
9497             else {
9498                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9499                  * a conditional each time through the loop if the characters
9500                  * differ only in a single bit, as is the usual situation */
9501                 U8 c1_c2_bits_differing = c1 ^ c2;
9502
9503                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9504                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9505
9506                     scan = (char *) find_span_end_mask((U8 *) scan,
9507                                                        (U8 *) this_eol,
9508                                                        c1 & c1_c2_mask,
9509                                                        c1_c2_mask);
9510                 }
9511                 else {
9512                     while (    scan < this_eol
9513                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9514                     {
9515                         scan++;
9516                     }
9517                 }
9518             }
9519         }
9520         break;
9521     }
9522     case ANYOFPOSIXL:
9523     case ANYOFL:
9524         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9525
9526         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9527             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9528         }
9529         /* FALLTHROUGH */
9530     case ANYOFD:
9531     case ANYOF:
9532         if (utf8_target) {
9533             while (hardcount < max
9534                    && scan < this_eol
9535                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
9536             {
9537                 scan += UTF8SKIP(scan);
9538                 hardcount++;
9539             }
9540         }
9541         else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
9542             while (scan < this_eol
9543                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9544                 scan++;
9545         }
9546         else {
9547             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9548                 scan++;
9549         }
9550         break;
9551
9552     case ANYOFM:
9553         if (utf8_target && this_eol - scan > max) {
9554
9555             /* We didn't adjust <this_eol> at the beginning of this routine
9556              * because is UTF-8, but it is actually ok to do so, since here, to
9557              * match, 1 char == 1 byte. */
9558             this_eol = scan + max;
9559         }
9560
9561         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9562         break;
9563
9564     case NANYOFM:
9565         if (utf8_target) {
9566             while (     hardcount < max
9567                    &&   scan < this_eol
9568                    &&  (*scan & FLAGS(p)) != ARG(p))
9569             {
9570                 scan += UTF8SKIP(scan);
9571                 hardcount++;
9572             }
9573         }
9574         else {
9575             scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9576         }
9577         break;
9578
9579     case ANYOFH:
9580         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9581             if (ANYOF_FLAGS(p)) {   /* If we know the first byte of what
9582                                        matches, we can avoid calling reginclass
9583                                      */
9584                 while (   hardcount < max
9585                        && scan < this_eol
9586                        && (U8) *scan == ANYOF_FLAGS(p)
9587                        && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
9588                                                                   TRUE))
9589                 {
9590                     scan += UTF8SKIP(scan);
9591                     hardcount++;
9592                 }
9593             }
9594             else while (  hardcount < max
9595                         && scan < this_eol
9596                         && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9597             {
9598                 scan += UTF8SKIP(scan);
9599                 hardcount++;
9600             }
9601         }
9602         break;
9603
9604     /* The argument (FLAGS) to all the POSIX node types is the class number */
9605
9606     case NPOSIXL:
9607         to_complement = 1;
9608         /* FALLTHROUGH */
9609
9610     case POSIXL:
9611         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9612         if (! utf8_target) {
9613             while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9614                                                                    *scan)))
9615             {
9616                 scan++;
9617             }
9618         } else {
9619             while (hardcount < max && scan < this_eol
9620                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9621                                                                   (U8 *) scan,
9622                                                                   (U8 *) this_eol)))
9623             {
9624                 scan += UTF8SKIP(scan);
9625                 hardcount++;
9626             }
9627         }
9628         break;
9629
9630     case POSIXD:
9631         if (utf8_target) {
9632             goto utf8_posix;
9633         }
9634         /* FALLTHROUGH */
9635
9636     case POSIXA:
9637         if (utf8_target && this_eol - scan > max) {
9638
9639             /* We didn't adjust <this_eol> at the beginning of this routine
9640              * because is UTF-8, but it is actually ok to do so, since here, to
9641              * match, 1 char == 1 byte. */
9642             this_eol = scan + max;
9643         }
9644         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9645             scan++;
9646         }
9647         break;
9648
9649     case NPOSIXD:
9650         if (utf8_target) {
9651             to_complement = 1;
9652             goto utf8_posix;
9653         }
9654         /* FALLTHROUGH */
9655
9656     case NPOSIXA:
9657         if (! utf8_target) {
9658             while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9659                 scan++;
9660             }
9661         }
9662         else {
9663
9664             /* The complement of something that matches only ASCII matches all
9665              * non-ASCII, plus everything in ASCII that isn't in the class. */
9666             while (hardcount < max && scan < this_eol
9667                    && (   ! isASCII_utf8_safe(scan, loceol)
9668                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9669             {
9670                 scan += UTF8SKIP(scan);
9671                 hardcount++;
9672             }
9673         }
9674         break;
9675
9676     case NPOSIXU:
9677         to_complement = 1;
9678         /* FALLTHROUGH */
9679
9680     case POSIXU:
9681         if (! utf8_target) {
9682             while (scan < this_eol && to_complement
9683                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9684             {
9685                 scan++;
9686             }
9687         }
9688         else {
9689           utf8_posix:
9690             classnum = (_char_class_number) FLAGS(p);
9691             switch (classnum) {
9692                 default:
9693                     while (   hardcount < max && scan < this_eol
9694                            && to_complement ^ cBOOL(_invlist_contains_cp(
9695                                               PL_XPosix_ptrs[classnum],
9696                                               utf8_to_uvchr_buf((U8 *) scan,
9697                                                                 (U8 *) this_eol,
9698                                                                 NULL))))
9699                     {
9700                         scan += UTF8SKIP(scan);
9701                         hardcount++;
9702                     }
9703                     break;
9704
9705                     /* For the classes below, the knowledge of how to handle
9706                      * every code point is compiled in to Perl via a macro.
9707                      * This code is written for making the loops as tight as
9708                      * possible.  It could be refactored to save space instead.
9709                      * */
9710
9711                 case _CC_ENUM_SPACE:
9712                     while (hardcount < max
9713                            && scan < this_eol
9714                            && (to_complement
9715                                ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
9716                     {
9717                         scan += UTF8SKIP(scan);
9718                         hardcount++;
9719                     }
9720                     break;
9721                 case _CC_ENUM_BLANK:
9722                     while (hardcount < max
9723                            && scan < this_eol
9724                            && (to_complement
9725                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
9726                     {
9727                         scan += UTF8SKIP(scan);
9728                         hardcount++;
9729                     }
9730                     break;
9731                 case _CC_ENUM_XDIGIT:
9732                     while (hardcount < max
9733                            && scan < this_eol
9734                            && (to_complement
9735                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
9736                     {
9737                         scan += UTF8SKIP(scan);
9738                         hardcount++;
9739                     }
9740                     break;
9741                 case _CC_ENUM_VERTSPACE:
9742                     while (hardcount < max
9743                            && scan < this_eol
9744                            && (to_complement
9745                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
9746                     {
9747                         scan += UTF8SKIP(scan);
9748                         hardcount++;
9749                     }
9750                     break;
9751                 case _CC_ENUM_CNTRL:
9752                     while (hardcount < max
9753                            && scan < this_eol
9754                            && (to_complement
9755                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
9756                     {
9757                         scan += UTF8SKIP(scan);
9758                         hardcount++;
9759                     }
9760                     break;
9761             }
9762         }
9763         break;
9764
9765     case LNBREAK:
9766         if (utf8_target) {
9767             while (hardcount < max && scan < this_eol &&
9768                     (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
9769                 scan += c;
9770                 hardcount++;
9771             }
9772         } else {
9773             /* LNBREAK can match one or two latin chars, which is ok, but we
9774              * have to use hardcount in this situation, and throw away the
9775              * adjustment to <this_eol> done before the switch statement */
9776             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9777                 scan+=c;
9778                 hardcount++;
9779             }
9780         }
9781         break;
9782
9783     case BOUNDL:
9784     case NBOUNDL:
9785         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9786         /* FALLTHROUGH */
9787     case BOUND:
9788     case BOUNDA:
9789     case BOUNDU:
9790     case EOS:
9791     case GPOS:
9792     case KEEPS:
9793     case NBOUND:
9794     case NBOUNDA:
9795     case NBOUNDU:
9796     case OPFAIL:
9797     case SBOL:
9798     case SEOL:
9799         /* These are all 0 width, so match right here or not at all. */
9800         break;
9801
9802     default:
9803         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9804         NOT_REACHED; /* NOTREACHED */
9805
9806     }
9807
9808     if (hardcount)
9809         c = hardcount;
9810     else
9811         c = scan - *startposp;
9812     *startposp = scan;
9813
9814     DEBUG_r({
9815         GET_RE_DEBUG_FLAGS_DECL;
9816         DEBUG_EXECUTE_r({
9817             SV * const prop = sv_newmortal();
9818             regprop(prog, prop, p, reginfo, NULL);
9819             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9820                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9821         });
9822     });
9823
9824     return(c);
9825 }
9826
9827 /*
9828  - reginclass - determine if a character falls into a character class
9829
9830   n is the ANYOF-type regnode
9831   p is the target string
9832   p_end points to one byte beyond the end of the target string
9833   utf8_target tells whether p is in UTF-8.
9834
9835   Returns true if matched; false otherwise.
9836
9837   Note that this can be a synthetic start class, a combination of various
9838   nodes, so things you think might be mutually exclusive, such as locale,
9839   aren't.  It can match both locale and non-locale
9840
9841  */
9842
9843 STATIC bool
9844 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9845 {
9846     dVAR;
9847     const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n);
9848     bool match = FALSE;
9849     UV c = *p;
9850
9851     PERL_ARGS_ASSERT_REGINCLASS;
9852
9853     /* If c is not already the code point, get it.  Note that
9854      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9855     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9856         STRLEN c_len = 0;
9857         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9858         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9859         if (c_len == (STRLEN)-1) {
9860             _force_out_malformed_utf8_message(p, p_end,
9861                                               utf8n_flags,
9862                                               1 /* 1 means die */ );
9863             NOT_REACHED; /* NOTREACHED */
9864         }
9865         if (     c > 255
9866             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9867             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9868         {
9869             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9870         }
9871     }
9872
9873     /* If this character is potentially in the bitmap, check it */
9874     if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
9875         if (ANYOF_BITMAP_TEST(n, c))
9876             match = TRUE;
9877         else if ((flags
9878                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9879                   && OP(n) == ANYOFD
9880                   && ! utf8_target
9881                   && ! isASCII(c))
9882         {
9883             match = TRUE;
9884         }
9885         else if (flags & ANYOF_LOCALE_FLAGS) {
9886             if (  (flags & ANYOFL_FOLD)
9887                 && c < sizeof(PL_fold_locale)
9888                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9889             {
9890                 match = TRUE;
9891             }
9892             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
9893                      && c <= U8_MAX  /* param to isFOO_lc() */
9894             ) {
9895
9896                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9897                  * if the class includes the Posix character class given by
9898                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9899                  * complemented Posix class given by int(bit/2).  So we loop
9900                  * through the bits, each time changing whether we complement
9901                  * the result or not.  Suppose for the sake of illustration
9902                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9903                  * is set, it means there is a match for this ANYOF node if the
9904                  * character is in the class given by the expression (0 / 2 = 0
9905                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9906                  * and since 'to_complement' is 0, the result will stay TRUE,
9907                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9908                  * bit 1 is 1.  That means there is a match if the character
9909                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9910                  * but will on bit 1.  On the second iteration 'to_complement'
9911                  * will be 1, so the exclusive or will reverse things, so we
9912                  * are testing for \W.  On the third iteration, 'to_complement'
9913                  * will be 0, and we would be testing for \s; the fourth
9914                  * iteration would test for \S, etc.
9915                  *
9916                  * Note that this code assumes that all the classes are closed
9917                  * under folding.  For example, if a character matches \w, then
9918                  * its fold does too; and vice versa.  This should be true for
9919                  * any well-behaved locale for all the currently defined Posix
9920                  * classes, except for :lower: and :upper:, which are handled
9921                  * by the pseudo-class :cased: which matches if either of the
9922                  * other two does.  To get rid of this assumption, an outer
9923                  * loop could be used below to iterate over both the source
9924                  * character, and its fold (if different) */
9925
9926                 int count = 0;
9927                 int to_complement = 0;
9928
9929                 while (count < ANYOF_MAX) {
9930                     if (ANYOF_POSIXL_TEST(n, count)
9931                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9932                     {
9933                         match = TRUE;
9934                         break;
9935                     }
9936                     count++;
9937                     to_complement ^= 1;
9938                 }
9939             }
9940         }
9941     }
9942
9943
9944     /* If the bitmap didn't (or couldn't) match, and something outside the
9945      * bitmap could match, try that. */
9946     if (!match) {
9947         if (c >= NUM_ANYOF_CODE_POINTS
9948             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9949         {
9950             match = TRUE;       /* Everything above the bitmap matches */
9951         }
9952             /* Here doesn't match everything above the bitmap.  If there is
9953              * some information available beyond the bitmap, we may find a
9954              * match in it.  If so, this is most likely because the code point
9955              * is outside the bitmap range.  But rarely, it could be because of
9956              * some other reason.  If so, various flags are set to indicate
9957              * this possibility.  On ANYOFD nodes, there may be matches that
9958              * happen only when the target string is UTF-8; or for other node
9959              * types, because runtime lookup is needed, regardless of the
9960              * UTF-8ness of the target string.  Finally, under /il, there may
9961              * be some matches only possible if the locale is a UTF-8 one. */
9962         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9963                  && (   c >= NUM_ANYOF_CODE_POINTS
9964                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9965                          && (   UNLIKELY(OP(n) != ANYOFD)
9966                              || (utf8_target && ! isASCII_uni(c)
9967 #                               if NUM_ANYOF_CODE_POINTS > 256
9968                                                                  && c < 256
9969 #                               endif
9970                                 )))
9971                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9972                          && IN_UTF8_CTYPE_LOCALE)))
9973         {
9974             SV* only_utf8_locale = NULL;
9975             SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE,
9976                                                    0, &only_utf8_locale, NULL);
9977             if (definition) {
9978                 U8 utf8_buffer[2];
9979                 U8 * utf8_p;
9980                 if (utf8_target) {
9981                     utf8_p = (U8 *) p;
9982                 } else { /* Convert to utf8 */
9983                     utf8_p = utf8_buffer;
9984                     append_utf8_from_native_byte(*p, &utf8_p);
9985                     utf8_p = utf8_buffer;
9986                 }
9987
9988                 /* Turkish locales have these hard-coded rules overriding
9989                  * normal ones */
9990                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
9991                     && isALPHA_FOLD_EQ(*p, 'i'))
9992                 {
9993                     if (*p == 'i') {
9994                         if (_invlist_contains_cp(definition,
9995                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
9996                         {
9997                             match = TRUE;
9998                         }
9999                     }
10000                     else if (*p == 'I') {
10001                         if (_invlist_contains_cp(definition,
10002                                                 LATIN_SMALL_LETTER_DOTLESS_I))
10003                         {
10004                             match = TRUE;
10005                         }
10006                     }
10007                 }
10008                 else if (_invlist_contains_cp(definition, c)) {
10009                     match = TRUE;
10010                 }
10011             }
10012             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10013                 match = _invlist_contains_cp(only_utf8_locale, c);
10014             }
10015         }
10016
10017         /* In a Turkic locale under folding, hard-code the I i case pair
10018          * matches */
10019         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10020             && ! match
10021             &&   (flags & ANYOFL_FOLD)
10022             &&   utf8_target)
10023         {
10024             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10025                 if (ANYOF_BITMAP_TEST(n, 'i')) {
10026                     match = TRUE;
10027                 }
10028             }
10029             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10030                 if (ANYOF_BITMAP_TEST(n, 'I')) {
10031                     match = TRUE;
10032                 }
10033             }
10034         }
10035
10036         if (UNICODE_IS_SUPER(c)
10037             && (flags
10038                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10039             && OP(n) != ANYOFD
10040             && ckWARN_d(WARN_NON_UNICODE))
10041         {
10042             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10043                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10044         }
10045     }
10046
10047 #if ANYOF_INVERT != 1
10048     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10049      * use it, don't */
10050 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10051 #endif
10052
10053     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10054     return (flags & ANYOF_INVERT) ^ match;
10055 }
10056
10057 STATIC U8 *
10058 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10059 {
10060     /* return the position 'off' UTF-8 characters away from 's', forward if
10061      * 'off' >= 0, backwards if negative.  But don't go outside of position
10062      * 'lim', which better be < s  if off < 0 */
10063
10064     PERL_ARGS_ASSERT_REGHOP3;
10065
10066     if (off >= 0) {
10067         while (off-- && s < lim) {
10068             /* XXX could check well-formedness here */
10069             U8 *new_s = s + UTF8SKIP(s);
10070             if (new_s > lim) /* lim may be in the middle of a long character */
10071                 return s;
10072             s = new_s;
10073         }
10074     }
10075     else {
10076         while (off++ && s > lim) {
10077             s--;
10078             if (UTF8_IS_CONTINUED(*s)) {
10079                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10080                     s--;
10081                 if (! UTF8_IS_START(*s)) {
10082                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10083                 }
10084             }
10085             /* XXX could check well-formedness here */
10086         }
10087     }
10088     return s;
10089 }
10090
10091 STATIC U8 *
10092 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10093 {
10094     PERL_ARGS_ASSERT_REGHOP4;
10095
10096     if (off >= 0) {
10097         while (off-- && s < rlim) {
10098             /* XXX could check well-formedness here */
10099             s += UTF8SKIP(s);
10100         }
10101     }
10102     else {
10103         while (off++ && s > llim) {
10104             s--;
10105             if (UTF8_IS_CONTINUED(*s)) {
10106                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10107                     s--;
10108                 if (! UTF8_IS_START(*s)) {
10109                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10110                 }
10111             }
10112             /* XXX could check well-formedness here */
10113         }
10114     }
10115     return s;
10116 }
10117
10118 /* like reghop3, but returns NULL on overrun, rather than returning last
10119  * char pos */
10120
10121 STATIC U8 *
10122 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10123 {
10124     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10125
10126     if (off >= 0) {
10127         while (off-- && s < lim) {
10128             /* XXX could check well-formedness here */
10129             s += UTF8SKIP(s);
10130         }
10131         if (off >= 0)
10132             return NULL;
10133     }
10134     else {
10135         while (off++ && s > lim) {
10136             s--;
10137             if (UTF8_IS_CONTINUED(*s)) {
10138                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10139                     s--;
10140                 if (! UTF8_IS_START(*s)) {
10141                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10142                 }
10143             }
10144             /* XXX could check well-formedness here */
10145         }
10146         if (off <= 0)
10147             return NULL;
10148     }
10149     return s;
10150 }
10151
10152
10153 /* when executing a regex that may have (?{}), extra stuff needs setting
10154    up that will be visible to the called code, even before the current
10155    match has finished. In particular:
10156
10157    * $_ is localised to the SV currently being matched;
10158    * pos($_) is created if necessary, ready to be updated on each call-out
10159      to code;
10160    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10161      isn't set until the current pattern is successfully finished), so that
10162      $1 etc of the match-so-far can be seen;
10163    * save the old values of subbeg etc of the current regex, and  set then
10164      to the current string (again, this is normally only done at the end
10165      of execution)
10166 */
10167
10168 static void
10169 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10170 {
10171     MAGIC *mg;
10172     regexp *const rex = ReANY(reginfo->prog);
10173     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10174
10175     eval_state->rex = rex;
10176
10177     if (reginfo->sv) {
10178         /* Make $_ available to executed code. */
10179         if (reginfo->sv != DEFSV) {
10180             SAVE_DEFSV;
10181             DEFSV_set(reginfo->sv);
10182         }
10183
10184         if (!(mg = mg_find_mglob(reginfo->sv))) {
10185             /* prepare for quick setting of pos */
10186             mg = sv_magicext_mglob(reginfo->sv);
10187             mg->mg_len = -1;
10188         }
10189         eval_state->pos_magic = mg;
10190         eval_state->pos       = mg->mg_len;
10191         eval_state->pos_flags = mg->mg_flags;
10192     }
10193     else
10194         eval_state->pos_magic = NULL;
10195
10196     if (!PL_reg_curpm) {
10197         /* PL_reg_curpm is a fake PMOP that we can attach the current
10198          * regex to and point PL_curpm at, so that $1 et al are visible
10199          * within a /(?{})/. It's just allocated once per interpreter the
10200          * first time its needed */
10201         Newxz(PL_reg_curpm, 1, PMOP);
10202 #ifdef USE_ITHREADS
10203         {
10204             SV* const repointer = &PL_sv_undef;
10205             /* this regexp is also owned by the new PL_reg_curpm, which
10206                will try to free it.  */
10207             av_push(PL_regex_padav, repointer);
10208             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10209             PL_regex_pad = AvARRAY(PL_regex_padav);
10210         }
10211 #endif
10212     }
10213     SET_reg_curpm(reginfo->prog);
10214     eval_state->curpm = PL_curpm;
10215     PL_curpm_under = PL_curpm;
10216     PL_curpm = PL_reg_curpm;
10217     if (RXp_MATCH_COPIED(rex)) {
10218         /*  Here is a serious problem: we cannot rewrite subbeg,
10219             since it may be needed if this match fails.  Thus
10220             $` inside (?{}) could fail... */
10221         eval_state->subbeg     = rex->subbeg;
10222         eval_state->sublen     = rex->sublen;
10223         eval_state->suboffset  = rex->suboffset;
10224         eval_state->subcoffset = rex->subcoffset;
10225 #ifdef PERL_ANY_COW
10226         eval_state->saved_copy = rex->saved_copy;
10227 #endif
10228         RXp_MATCH_COPIED_off(rex);
10229     }
10230     else
10231         eval_state->subbeg = NULL;
10232     rex->subbeg = (char *)reginfo->strbeg;
10233     rex->suboffset = 0;
10234     rex->subcoffset = 0;
10235     rex->sublen = reginfo->strend - reginfo->strbeg;
10236 }
10237
10238
10239 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10240
10241 static void
10242 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10243 {
10244     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10245     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10246     regmatch_slab *s;
10247
10248     Safefree(aux->poscache);
10249
10250     if (eval_state) {
10251
10252         /* undo the effects of S_setup_eval_state() */
10253
10254         if (eval_state->subbeg) {
10255             regexp * const rex = eval_state->rex;
10256             rex->subbeg     = eval_state->subbeg;
10257             rex->sublen     = eval_state->sublen;
10258             rex->suboffset  = eval_state->suboffset;
10259             rex->subcoffset = eval_state->subcoffset;
10260 #ifdef PERL_ANY_COW
10261             rex->saved_copy = eval_state->saved_copy;
10262 #endif
10263             RXp_MATCH_COPIED_on(rex);
10264         }
10265         if (eval_state->pos_magic)
10266         {
10267             eval_state->pos_magic->mg_len = eval_state->pos;
10268             eval_state->pos_magic->mg_flags =
10269                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10270                | (eval_state->pos_flags & MGf_BYTES);
10271         }
10272
10273         PL_curpm = eval_state->curpm;
10274     }
10275
10276     PL_regmatch_state = aux->old_regmatch_state;
10277     PL_regmatch_slab  = aux->old_regmatch_slab;
10278
10279     /* free all slabs above current one - this must be the last action
10280      * of this function, as aux and eval_state are allocated within
10281      * slabs and may be freed here */
10282
10283     s = PL_regmatch_slab->next;
10284     if (s) {
10285         PL_regmatch_slab->next = NULL;
10286         while (s) {
10287             regmatch_slab * const osl = s;
10288             s = s->next;
10289             Safefree(osl);
10290         }
10291     }
10292 }
10293
10294
10295 STATIC void
10296 S_to_utf8_substr(pTHX_ regexp *prog)
10297 {
10298     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10299      * on the converted value */
10300
10301     int i = 1;
10302
10303     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10304
10305     do {
10306         if (prog->substrs->data[i].substr
10307             && !prog->substrs->data[i].utf8_substr) {
10308             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10309             prog->substrs->data[i].utf8_substr = sv;
10310             sv_utf8_upgrade(sv);
10311             if (SvVALID(prog->substrs->data[i].substr)) {
10312                 if (SvTAIL(prog->substrs->data[i].substr)) {
10313                     /* Trim the trailing \n that fbm_compile added last
10314                        time.  */
10315                     SvCUR_set(sv, SvCUR(sv) - 1);
10316                     /* Whilst this makes the SV technically "invalid" (as its
10317                        buffer is no longer followed by "\0") when fbm_compile()
10318                        adds the "\n" back, a "\0" is restored.  */
10319                     fbm_compile(sv, FBMcf_TAIL);
10320                 } else
10321                     fbm_compile(sv, 0);
10322             }
10323             if (prog->substrs->data[i].substr == prog->check_substr)
10324                 prog->check_utf8 = sv;
10325         }
10326     } while (i--);
10327 }
10328
10329 STATIC bool
10330 S_to_byte_substr(pTHX_ regexp *prog)
10331 {
10332     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10333      * on the converted value; returns FALSE if can't be converted. */
10334
10335     int i = 1;
10336
10337     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10338
10339     do {
10340         if (prog->substrs->data[i].utf8_substr
10341             && !prog->substrs->data[i].substr) {
10342             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10343             if (! sv_utf8_downgrade(sv, TRUE)) {
10344                 return FALSE;
10345             }
10346             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10347                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10348                     /* Trim the trailing \n that fbm_compile added last
10349                         time.  */
10350                     SvCUR_set(sv, SvCUR(sv) - 1);
10351                     fbm_compile(sv, FBMcf_TAIL);
10352                 } else
10353                     fbm_compile(sv, 0);
10354             }
10355             prog->substrs->data[i].substr = sv;
10356             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10357                 prog->check_substr = sv;
10358         }
10359     } while (i--);
10360
10361     return TRUE;
10362 }
10363
10364 #ifndef PERL_IN_XSUB_RE
10365
10366 bool
10367 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10368 {
10369     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10370      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10371      * the larger string bounded by 'strbeg' and 'strend'.
10372      *
10373      * 'cp' needs to be assigned (if not a future version of the Unicode
10374      * Standard could make it something that combines with adjacent characters,
10375      * so code using it would then break), and there has to be a GCB break
10376      * before and after the character. */
10377
10378     dVAR;
10379
10380     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10381     const U8 * prev_cp_start;
10382
10383     PERL_ARGS_ASSERT__IS_GRAPHEME;
10384
10385     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10386         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10387     {
10388         /* These are considered graphemes */
10389         return TRUE;
10390     }
10391
10392     /* Otherwise, unassigned code points are forbidden */
10393     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10394                                     _invlist_search(PL_Assigned_invlist, cp))))
10395     {
10396         return FALSE;
10397     }
10398
10399     cp_gcb_val = getGCB_VAL_CP(cp);
10400
10401     /* Find the GCB value of the previous code point in the input */
10402     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10403     if (UNLIKELY(prev_cp_start == s)) {
10404         prev_cp_gcb_val = GCB_EDGE;
10405     }
10406     else {
10407         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10408     }
10409
10410     /* And check that is a grapheme boundary */
10411     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10412                 TRUE /* is UTF-8 encoded */ ))
10413     {
10414         return FALSE;
10415     }
10416
10417     /* Similarly verify there is a break between the current character and the
10418      * following one */
10419     s += UTF8SKIP(s);
10420     if (s >= strend) {
10421         next_cp_gcb_val = GCB_EDGE;
10422     }
10423     else {
10424         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10425     }
10426
10427     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10428 }
10429
10430 /*
10431 =head1 Unicode Support
10432
10433 =for apidoc isSCRIPT_RUN
10434
10435 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10436 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10437 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10438 two degenerate cases given below, this function returns TRUE iff all code
10439 points in it come from any combination of three "scripts" given by the Unicode
10440 "Script Extensions" property: Common, Inherited, and possibly one other.
10441 Additionally all decimal digits must come from the same consecutive sequence of
10442 10.
10443
10444 For example, if all the characters in the sequence are Greek, or Common, or
10445 Inherited, this function will return TRUE, provided any decimal digits in it
10446 are from the same block of digits in Common.  (These are the ASCII digits
10447 "0".."9" and additionally a block for full width forms of these, and several
10448 others used in mathematical notation.)   For scripts (unlike Greek) that have
10449 their own digits defined this will accept either digits from that set or from
10450 one of the Common digit sets, but not a combination of the two.  Some scripts,
10451 such as Arabic, have more than one set of digits.  All digits must come from
10452 the same set for this function to return TRUE.
10453
10454 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10455 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10456 C<SCX_INVALID> if the function returns FALSE.
10457
10458 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10459 will be C<SCX_INVALID>.
10460
10461 If the sequence contains a single code point which is unassigned to a character
10462 in the version of Unicode being used, the function will return TRUE, and the
10463 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10464 in the input sequence will result in the function treating the input as not
10465 being a script run.
10466
10467 The returned script will be C<SCX_Inherited> iff all the code points in it are
10468 from the Inherited script.
10469
10470 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10471 it are from the Inherited or Common scripts.
10472
10473 =cut
10474
10475 */
10476
10477 bool
10478 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10479 {
10480     /* Basically, it looks at each character in the sequence to see if the
10481      * above conditions are met; if not it fails.  It uses an inversion map to
10482      * find the enum corresponding to the script of each character.  But this
10483      * is complicated by the fact that a few code points can be in any of
10484      * several scripts.  The data has been constructed so that there are
10485      * additional enum values (all negative) for these situations.  The
10486      * absolute value of those is an index into another table which contains
10487      * pointers to auxiliary tables for each such situation.  Each aux array
10488      * lists all the scripts for the given situation.  There is another,
10489      * parallel, table that gives the number of entries in each aux table.
10490      * These are all defined in charclass_invlists.h */
10491
10492     /* XXX Here are the additional things UTS 39 says could be done:
10493      *
10494      * Forbid sequences of the same nonspacing mark
10495      *
10496      * Check to see that all the characters are in the sets of exemplar
10497      * characters for at least one language in the Unicode Common Locale Data
10498      * Repository [CLDR]. */
10499
10500     dVAR;
10501
10502     /* Things that match /\d/u */
10503     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10504     UV * decimals_array = invlist_array(decimals_invlist);
10505
10506     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10507      * not currently known) */
10508     UV zero_of_run = 0;
10509
10510     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10511     SCX_enum script_of_char = SCX_INVALID;
10512
10513     /* If the script remains not fully determined from iteration to iteration,
10514      * this is the current intersection of the possiblities.  */
10515     SCX_enum * intersection = NULL;
10516     PERL_UINT_FAST8_T intersection_len = 0;
10517
10518     bool retval = TRUE;
10519     SCX_enum * ret_script = NULL;
10520
10521     assert(send >= s);
10522
10523     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10524
10525     /* All code points in 0..255 are either Common or Latin, so must be a
10526      * script run.  We can return immediately unless we need to know which
10527      * script it is. */
10528     if (! utf8_target && LIKELY(send > s)) {
10529         if (ret_script == NULL) {
10530             return TRUE;
10531         }
10532
10533         /* If any character is Latin, the run is Latin */
10534         while (s < send) {
10535             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10536                 *ret_script = SCX_Latin;
10537                 return TRUE;
10538             }
10539         }
10540
10541         /* Here, all are Common */
10542         *ret_script = SCX_Common;
10543         return TRUE;
10544     }
10545
10546     /* Look at each character in the sequence */
10547     while (s < send) {
10548         /* If the current character being examined is a digit, this is the code
10549          * point of the zero for its sequence of 10 */
10550         UV zero_of_char;
10551
10552         UV cp;
10553
10554         /* The code allows all scripts to use the ASCII digits.  This is
10555          * because they are in the Common script.  Hence any ASCII ones found
10556          * are ok, unless and until a digit from another set has already been
10557          * encountered.  digit ranges in Common are not similarly blessed) */
10558         if (UNLIKELY(isDIGIT(*s))) {
10559             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10560                 retval = FALSE;
10561                 break;
10562             }
10563             if (zero_of_run) {
10564                 if (zero_of_run != '0') {
10565                     retval = FALSE;
10566                     break;
10567                 }
10568             }
10569             else {
10570                 zero_of_run = '0';
10571             }
10572             s++;
10573             continue;
10574         }
10575
10576         /* Here, isn't an ASCII digit.  Find the code point of the character */
10577         if (! UTF8_IS_INVARIANT(*s)) {
10578             Size_t len;
10579             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10580             s += len;
10581         }
10582         else {
10583             cp = *(s++);
10584         }
10585
10586         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10587          * digit in that script.  We can skip the rest of this code for this
10588          * character. */
10589         if (UNLIKELY(   zero_of_run
10590                      && cp >= zero_of_run
10591                      && cp - zero_of_run <= 9))
10592         {
10593             continue;
10594         }
10595
10596         /* Find the character's script.  The correct values are hard-coded here
10597          * for small-enough code points. */
10598         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10599                                unlikely to change */
10600             if (       cp > 255
10601                 || (   isALPHA_L1(cp)
10602                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10603             {
10604                 script_of_char = SCX_Latin;
10605             }
10606             else {
10607                 script_of_char = SCX_Common;
10608             }
10609         }
10610         else {
10611             script_of_char = _Perl_SCX_invmap[
10612                                        _invlist_search(PL_SCX_invlist, cp)];
10613         }
10614
10615         /* We arbitrarily accept a single unassigned character, but not in
10616          * combination with anything else, and not a run of them. */
10617         if (   UNLIKELY(script_of_run == SCX_Unknown)
10618             || UNLIKELY(   script_of_run != SCX_INVALID
10619                         && script_of_char == SCX_Unknown))
10620         {
10621             retval = FALSE;
10622             break;
10623         }
10624
10625         /* For the first character, or the run is inherited, the run's script
10626          * is set to the char's */
10627         if (   UNLIKELY(script_of_run == SCX_INVALID)
10628             || UNLIKELY(script_of_run == SCX_Inherited))
10629         {
10630             script_of_run = script_of_char;
10631         }
10632
10633         /* For the character's script to be Unknown, it must be the first
10634          * character in the sequence (for otherwise a test above would have
10635          * prevented us from reaching here), and we have set the run's script
10636          * to it.  Nothing further to be done for this character */
10637         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10638             continue;
10639         }
10640
10641         /* We accept 'inherited' script characters currently even at the
10642          * beginning.  (We know that no characters in Inherited are digits, or
10643          * we'd have to check for that) */
10644         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10645             continue;
10646         }
10647
10648         /* If the run so far is Common, and the new character isn't, change the
10649          * run's script to that of this character */
10650         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10651             script_of_run = script_of_char;
10652         }
10653
10654         /* Now we can see if the script of the new character is the same as
10655          * that of the run */
10656         if (LIKELY(script_of_char == script_of_run)) {
10657             /* By far the most common case */
10658             goto scripts_match;
10659         }
10660
10661         /* Here, the script of the run isn't Common.  But characters in Common
10662          * match any script */
10663         if (script_of_char == SCX_Common) {
10664             goto scripts_match;
10665         }
10666
10667 #ifndef HAS_SCX_AUX_TABLES
10668
10669         /* Too early a Unicode version to have a code point belonging to more
10670          * than one script, so, if the scripts don't exactly match, fail */
10671         PERL_UNUSED_VAR(intersection_len);
10672         retval = FALSE;
10673         break;
10674
10675 #else
10676
10677         /* Here there is no exact match between the character's script and the
10678          * run's.  And we've handled the special cases of scripts Unknown,
10679          * Inherited, and Common.
10680          *
10681          * Negative script numbers signify that the value may be any of several
10682          * scripts, and we need to look at auxiliary information to make our
10683          * deterimination.  But if both are non-negative, we can fail now */
10684         if (LIKELY(script_of_char >= 0)) {
10685             const SCX_enum * search_in;
10686             PERL_UINT_FAST8_T search_in_len;
10687             PERL_UINT_FAST8_T i;
10688
10689             if (LIKELY(script_of_run >= 0)) {
10690                 retval = FALSE;
10691                 break;
10692             }
10693
10694             /* Use the previously constructed set of possible scripts, if any.
10695              * */
10696             if (intersection) {
10697                 search_in = intersection;
10698                 search_in_len = intersection_len;
10699             }
10700             else {
10701                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10702                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10703             }
10704
10705             for (i = 0; i < search_in_len; i++) {
10706                 if (search_in[i] == script_of_char) {
10707                     script_of_run = script_of_char;
10708                     goto scripts_match;
10709                 }
10710             }
10711
10712             retval = FALSE;
10713             break;
10714         }
10715         else if (LIKELY(script_of_run >= 0)) {
10716             /* script of character could be one of several, but run is a single
10717              * script */
10718             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10719             const PERL_UINT_FAST8_T search_in_len
10720                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10721             PERL_UINT_FAST8_T i;
10722
10723             for (i = 0; i < search_in_len; i++) {
10724                 if (search_in[i] == script_of_run) {
10725                     script_of_char = script_of_run;
10726                     goto scripts_match;
10727                 }
10728             }
10729
10730             retval = FALSE;
10731             break;
10732         }
10733         else {
10734             /* Both run and char could be in one of several scripts.  If the
10735              * intersection is empty, then this character isn't in this script
10736              * run.  Otherwise, we need to calculate the intersection to use
10737              * for future iterations of the loop, unless we are already at the
10738              * final character */
10739             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10740             const PERL_UINT_FAST8_T char_len
10741                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10742             const SCX_enum * search_run;
10743             PERL_UINT_FAST8_T run_len;
10744
10745             SCX_enum * new_overlap = NULL;
10746             PERL_UINT_FAST8_T i, j;
10747
10748             if (intersection) {
10749                 search_run = intersection;
10750                 run_len = intersection_len;
10751             }
10752             else {
10753                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10754                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10755             }
10756
10757             intersection_len = 0;
10758
10759             for (i = 0; i < run_len; i++) {
10760                 for (j = 0; j < char_len; j++) {
10761                     if (search_run[i] == search_char[j]) {
10762
10763                         /* Here, the script at i,j matches.  That means this
10764                          * character is in the run.  But continue on to find
10765                          * the complete intersection, for the next loop
10766                          * iteration, and for the digit check after it.
10767                          *
10768                          * On the first found common script, we malloc space
10769                          * for the intersection list for the worst case of the
10770                          * intersection, which is the minimum of the number of
10771                          * scripts remaining in each set. */
10772                         if (intersection_len == 0) {
10773                             Newx(new_overlap,
10774                                  MIN(run_len - i, char_len - j),
10775                                  SCX_enum);
10776                         }
10777                         new_overlap[intersection_len++] = search_run[i];
10778                     }
10779                 }
10780             }
10781
10782             /* Here we've looked through everything.  If they have no scripts
10783              * in common, not a run */
10784             if (intersection_len == 0) {
10785                 retval = FALSE;
10786                 break;
10787             }
10788
10789             /* If there is only a single script in common, set to that.
10790              * Otherwise, use the intersection going forward */
10791             Safefree(intersection);
10792             intersection = NULL;
10793             if (intersection_len == 1) {
10794                 script_of_run = script_of_char = new_overlap[0];
10795                 Safefree(new_overlap);
10796                 new_overlap = NULL;
10797             }
10798             else {
10799                 intersection = new_overlap;
10800             }
10801         }
10802
10803 #endif
10804
10805   scripts_match:
10806
10807         /* Here, the script of the character is compatible with that of the
10808          * run.  That means that in most cases, it continues the script run.
10809          * Either it and the run match exactly, or one or both can be in any of
10810          * several scripts, and the intersection is not empty.  However, if the
10811          * character is a decimal digit, it could still mean failure if it is
10812          * from the wrong sequence of 10.  So, we need to look at if it's a
10813          * digit.  We've already handled the 10 decimal digits, and the next
10814          * lowest one is this one: */
10815         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10816             continue;   /* Not a digit; this character is part of the run */
10817         }
10818
10819         /* If we have a definitive '0' for the script of this character, we
10820          * know that for this to be a digit, it must be in the range of +0..+9
10821          * of that zero. */
10822         if (   script_of_char >= 0
10823             && (zero_of_char = script_zeros[script_of_char]))
10824         {
10825             if (   cp < zero_of_char
10826                 || cp > zero_of_char + 9)
10827             {
10828                 continue;   /* Not a digit; this character is part of the run
10829                              */
10830             }
10831
10832         }
10833         else {  /* Need to look up if this character is a digit or not */
10834             SSize_t index_of_zero_of_char;
10835             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10836             if (     UNLIKELY(index_of_zero_of_char < 0)
10837                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10838             {
10839                 continue;   /* Not a digit; this character is part of the run.
10840                              */
10841             }
10842
10843             zero_of_char = decimals_array[index_of_zero_of_char];
10844         }
10845
10846         /* Here, the character is a decimal digit, and the zero of its sequence
10847          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
10848          * they better be the same. */
10849         if (zero_of_run) {
10850             if (zero_of_run != zero_of_char) {
10851                 retval = FALSE;
10852                 break;
10853             }
10854         }
10855         else {  /* Otherwise we now have a zero for this run */
10856             zero_of_run = zero_of_char;
10857         }
10858     } /* end of looping through CLOSESR text */
10859
10860     Safefree(intersection);
10861
10862     if (ret_script != NULL) {
10863         if (retval) {
10864             *ret_script = script_of_run;
10865         }
10866         else {
10867             *ret_script = SCX_INVALID;
10868         }
10869     }
10870
10871     return retval;
10872 }
10873
10874 #endif /* ifndef PERL_IN_XSUB_RE */
10875
10876 /*
10877  * ex: set ts=8 sts=4 sw=4 et:
10878  */