regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char non_utf8_target_but_utf8_required[]
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /*
 111  * Forwards.
 112  */
 113
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115
 116 #define HOPc(pos,off) \
 117         (char *)(reginfo->is_utf8_target \
 118             ? reghop3((U8*)pos, off, \
 119                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 120             : (U8*)(pos + off))
 121
 122 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 123 #define HOPBACK3(pos, off, lim) \
 124         (reginfo->is_utf8_target                          \
 125             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 126             : (pos - off >= lim)                                 \
 127                 ? (U8*)pos - off                                 \
 128                 : NULL)
 129
 130 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 131
 132 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 133 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 134
 135 /* lim must be +ve. Returns NULL on overshoot */
 136 #define HOPMAYBE3(pos,off,lim) \
 137         (reginfo->is_utf8_target                        \
 138             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 139             : ((U8*)pos + off <= lim)                   \
 140                 ? (U8*)pos + off                        \
 141                 : NULL)
 142
 143 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 144  * off must be >=0; args should be vars rather than expressions */
 145 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 146     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 147     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 148 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 149
 150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 152     : (U8*)(pos + off))
 153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 154
 155 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 156 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 157
 158 /* for use after a quantifier and before an EXACT-like node -- japhy */
 159 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 160  *
 161  * NOTE that *nothing* that affects backtracking should be in here, specifically
 162  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 163  * node that is in between two EXACT like nodes when ascertaining what the required
 164  * "follow" character is. This should probably be moved to regex compile time
 165  * although it may be done at run time beause of the REF possibility - more
 166  * investigation required. -- demerphq
 167 */
 168 #define JUMPABLE(rn) (                                                             \
 169     OP(rn) == OPEN ||                                                              \
 170     (OP(rn) == CLOSE &&                                                            \
 171      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 172     OP(rn) == EVAL ||                                                              \
 173     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 174     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 175     OP(rn) == KEEPS ||                                                             \
 176     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 177 )
 178 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 179
 180 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 181
 182 /*
 183   Search for mandatory following text node; for lookahead, the text must
 184   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 185 */
 186 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 187     while (JUMPABLE(rn)) { \
 188         const OPCODE type = OP(rn); \
 189         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 190             rn = NEXTOPER(NEXTOPER(rn)); \
 191         else if (type == PLUS) \
 192             rn = NEXTOPER(rn); \
 193         else if (type == IFMATCH) \
 194             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 195         else rn += NEXT_OFF(rn); \
 196     } \
 197 } STMT_END
 198
 199 #define SLAB_FIRST(s) (&(s)->states[0])
 200 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 201
 202 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 203 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 204 static regmatch_state * S_push_slab(pTHX);
 205
 206 #define REGCP_PAREN_ELEMS 3
 207 #define REGCP_OTHER_ELEMS 3
 208 #define REGCP_FRAME_ELEMS 1
 209 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 210  * are needed for the regexp context stack bookkeeping. */
 211
 212 STATIC CHECKPOINT
 213 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 214 {
 215     const int retval = PL_savestack_ix;
 216     const int paren_elems_to_push =
 217                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 218     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 219     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 220     I32 p;
 221     GET_RE_DEBUG_FLAGS_DECL;
 222
 223     PERL_ARGS_ASSERT_REGCPPUSH;
 224
 225     if (paren_elems_to_push < 0)
 226         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 227                    (int)paren_elems_to_push, (int)maxopenparen,
 228                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 229
 230     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 231         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 232                    " out of range (%lu-%ld)",
 233                    total_elems,
 234                    (unsigned long)maxopenparen,
 235                    (long)parenfloor);
 236
 237     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 238
 239     DEBUG_BUFFERS_r(
 240         if ((int)maxopenparen > (int)parenfloor)
 241             Perl_re_exec_indentf( aTHX_
 242                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 243                 depth,
 244                 PTR2UV(rex),
 245                 PTR2UV(rex->offs)
 246             );
 247     );
 248     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 249 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 250         SSPUSHIV(rex->offs[p].end);
 251         SSPUSHIV(rex->offs[p].start);
 252         SSPUSHINT(rex->offs[p].start_tmp);
 253         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 254             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 255             depth,
 256             (UV)p,
 257             (IV)rex->offs[p].start,
 258             (IV)rex->offs[p].start_tmp,
 259             (IV)rex->offs[p].end
 260         ));
 261     }
 262 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 263     SSPUSHINT(maxopenparen);
 264     SSPUSHINT(rex->lastparen);
 265     SSPUSHINT(rex->lastcloseparen);
 266     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 267
 268     return retval;
 269 }
 270
 271 /* These are needed since we do not localize EVAL nodes: */
 272 #define REGCP_SET(cp)                                           \
 273     DEBUG_STATE_r(                                              \
 274         Perl_re_exec_indentf( aTHX_                             \
 275             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 276             depth, (IV)PL_savestack_ix                          \
 277         )                                                       \
 278     );                                                          \
 279     cp = PL_savestack_ix
 280
 281 #define REGCP_UNWIND(cp)                                        \
 282     DEBUG_STATE_r(                                              \
 283         if (cp != PL_savestack_ix)                              \
 284             Perl_re_exec_indentf( aTHX_                         \
 285                 "Clearing an EVAL scope, savestack=%"           \
 286                 IVdf "..%" IVdf "\n",                           \
 287                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 288             )                                                   \
 289     );                                                          \
 290     regcpblow(cp)
 291
 292 /* set the start and end positions of capture ix */
 293 #define CLOSE_CAPTURE(ix, s, e)                                            \
 294     rex->offs[ix].start = s;                                               \
 295     rex->offs[ix].end = e;                                                 \
 296     if (ix > rex->lastparen)                                               \
 297         rex->lastparen = ix;                                               \
 298     rex->lastcloseparen = ix;                                              \
 299     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 300         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 301         depth,                                                             \
 302         PTR2UV(rex),                                                       \
 303         PTR2UV(rex->offs),                                                 \
 304         (UV)ix,                                                            \
 305         (IV)rex->offs[ix].start,                                           \
 306         (IV)rex->offs[ix].end,                                             \
 307         (UV)rex->lastparen                                                 \
 308     ))
 309
 310 #define UNWIND_PAREN(lp, lcp)               \
 311     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 312         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 313         depth,                              \
 314         PTR2UV(rex),                        \
 315         PTR2UV(rex->offs),                  \
 316         (UV)(lp),                           \
 317         (UV)(rex->lastparen),               \
 318         (UV)(lcp)                           \
 319     ));                                     \
 320     for (n = rex->lastparen; n > lp; n--)   \
 321         rex->offs[n].end = -1;              \
 322     rex->lastparen = n;                     \
 323     rex->lastcloseparen = lcp;
 324
 325
 326 STATIC void
 327 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 328 {
 329     UV i;
 330     U32 paren;
 331     GET_RE_DEBUG_FLAGS_DECL;
 332
 333     PERL_ARGS_ASSERT_REGCPPOP;
 334
 335     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 336     i = SSPOPUV;
 337     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 338     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 339     rex->lastcloseparen = SSPOPINT;
 340     rex->lastparen = SSPOPINT;
 341     *maxopenparen_p = SSPOPINT;
 342
 343     i -= REGCP_OTHER_ELEMS;
 344     /* Now restore the parentheses context. */
 345     DEBUG_BUFFERS_r(
 346         if (i || rex->lastparen + 1 <= rex->nparens)
 347             Perl_re_exec_indentf( aTHX_
 348                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 349                 depth,
 350                 PTR2UV(rex),
 351                 PTR2UV(rex->offs)
 352             );
 353     );
 354     paren = *maxopenparen_p;
 355     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 356         SSize_t tmps;
 357         rex->offs[paren].start_tmp = SSPOPINT;
 358         rex->offs[paren].start = SSPOPIV;
 359         tmps = SSPOPIV;
 360         if (paren <= rex->lastparen)
 361             rex->offs[paren].end = tmps;
 362         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 363             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 364             depth,
 365             (UV)paren,
 366             (IV)rex->offs[paren].start,
 367             (IV)rex->offs[paren].start_tmp,
 368             (IV)rex->offs[paren].end,
 369             (paren > rex->lastparen ? "(skipped)" : ""));
 370         );
 371         paren--;
 372     }
 373 #if 1
 374     /* It would seem that the similar code in regtry()
 375      * already takes care of this, and in fact it is in
 376      * a better location to since this code can #if 0-ed out
 377      * but the code in regtry() is needed or otherwise tests
 378      * requiring null fields (pat.t#187 and split.t#{13,14}
 379      * (as of patchlevel 7877)  will fail.  Then again,
 380      * this code seems to be necessary or otherwise
 381      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 382      * --jhi updated by dapm */
 383     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 384         if (i > *maxopenparen_p)
 385             rex->offs[i].start = -1;
 386         rex->offs[i].end = -1;
 387         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 388             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 389             depth,
 390             (UV)i,
 391             (i > *maxopenparen_p) ? "-1" : "  "
 392         ));
 393     }
 394 #endif
 395 }
 396
 397 /* restore the parens and associated vars at savestack position ix,
 398  * but without popping the stack */
 399
 400 STATIC void
 401 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 402 {
 403     I32 tmpix = PL_savestack_ix;
 404     PERL_ARGS_ASSERT_REGCP_RESTORE;
 405
 406     PL_savestack_ix = ix;
 407     regcppop(rex, maxopenparen_p);
 408     PL_savestack_ix = tmpix;
 409 }
 410
 411 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 412
 413 #ifndef PERL_IN_XSUB_RE
 414
 415 bool
 416 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 417 {
 418     /* Returns a boolean as to whether or not 'character' is a member of the
 419      * Posix character class given by 'classnum' that should be equivalent to a
 420      * value in the typedef '_char_class_number'.
 421      *
 422      * Ideally this could be replaced by a just an array of function pointers
 423      * to the C library functions that implement the macros this calls.
 424      * However, to compile, the precise function signatures are required, and
 425      * these may vary from platform to to platform.  To avoid having to figure
 426      * out what those all are on each platform, I (khw) am using this method,
 427      * which adds an extra layer of function call overhead (unless the C
 428      * optimizer strips it away).  But we don't particularly care about
 429      * performance with locales anyway. */
 430
 431     switch ((_char_class_number) classnum) {
 432         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 433         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 434         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 435         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 436         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 437                                         || isUPPER_LC(character);
 438         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 439         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 440         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 441         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 442         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 443         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 444         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 445         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 446         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 447         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 448         default:    /* VERTSPACE should never occur in locales */
 449             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 450     }
 451
 452     NOT_REACHED; /* NOTREACHED */
 453     return FALSE;
 454 }
 455
 456 #endif
 457
 458 PERL_STATIC_INLINE I32
 459 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
 460 {
 461     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
 462      * folded.  Works on all folds representable without UTF-8, except for
 463      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
 464      * check that the strings each have at least 'len' characters.
 465      *
 466      * There is almost an identical API function where s2 need not be folded:
 467      * Perl_foldEQ_latin1() */
 468
 469     const U8 *a = (const U8 *)s1;
 470     const U8 *b = (const U8 *)s2;
 471
 472     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
 473
 474     assert(len >= 0);
 475
 476     while (len--) {
 477         assert(! isUPPER_L1(*b));
 478         if (toLOWER_L1(*a) != *b) {
 479             return 0;
 480         }
 481         a++, b++;
 482     }
 483     return 1;
 484 }
 485
 486 STATIC bool
 487 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 488 {
 489     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 490      * 'character' is a member of the Posix character class given by 'classnum'
 491      * that should be equivalent to a value in the typedef
 492      * '_char_class_number'.
 493      *
 494      * This just calls isFOO_lc on the code point for the character if it is in
 495      * the range 0-255.  Outside that range, all characters use Unicode
 496      * rules, ignoring any locale.  So use the Unicode function if this class
 497      * requires an inversion list, and use the Unicode macro otherwise. */
 498
 499     dVAR;
 500
 501     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 502
 503     if (UTF8_IS_INVARIANT(*character)) {
 504         return isFOO_lc(classnum, *character);
 505     }
 506     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 507         return isFOO_lc(classnum,
 508                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 509     }
 510
 511     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 512
 513     switch ((_char_class_number) classnum) {
 514         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 515         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 516         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 517         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 518         default:
 519             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 520                                         utf8_to_uvchr_buf(character, e, NULL));
 521     }
 522
 523     return FALSE; /* Things like CNTRL are always below 256 */
 524 }
 525
 526 STATIC U8 *
 527 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 528 {
 529     /* Returns the position of the first byte in the sequence between 's' and
 530      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 531      * */
 532
 533     PERL_ARGS_ASSERT_FIND_SPAN_END;
 534
 535     assert(send >= s);
 536
 537     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 538                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 539                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 540     {
 541         PERL_UINTMAX_T span_word;
 542
 543         /* Process per-byte until reach word boundary.  XXX This loop could be
 544          * eliminated if we knew that this platform had fast unaligned reads */
 545         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 546             if (*s != span_byte) {
 547                 return s;
 548             }
 549             s++;
 550         }
 551
 552         /* Create a word filled with the bytes we are spanning */
 553         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 554
 555         /* Process per-word as long as we have at least a full word left */
 556         do {
 557
 558             /* Keep going if the whole word is composed of 'span_byte's */
 559             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 560                 s += PERL_WORDSIZE;
 561                 continue;
 562             }
 563
 564             /* Here, at least one byte in the word isn't 'span_byte'. */
 565
 566 #ifdef EBCDIC
 567
 568             break;
 569
 570 #else
 571
 572             /* This xor leaves 1 bits only in those non-matching bytes */
 573             span_word ^= * (PERL_UINTMAX_T *) s;
 574
 575             /* Make sure the upper bit of each non-matching byte is set.  This
 576              * makes each such byte look like an ASCII platform variant byte */
 577             span_word |= span_word << 1;
 578             span_word |= span_word << 2;
 579             span_word |= span_word << 4;
 580
 581             /* That reduces the problem to what this function solves */
 582             return s + _variant_byte_number(span_word);
 583
 584 #endif
 585
 586         } while (s + PERL_WORDSIZE <= send);
 587     }
 588
 589     /* Process the straggler bytes beyond the final word boundary */
 590     while (s < send) {
 591         if (*s != span_byte) {
 592             return s;
 593         }
 594         s++;
 595     }
 596
 597     return s;
 598 }
 599
 600 STATIC U8 *
 601 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 602 {
 603     /* Returns the position of the first byte in the sequence between 's'
 604      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 605      * returns 'send' if none found.  It uses word-level operations instead of
 606      * byte to speed up the process */
 607
 608     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 609
 610     assert(send >= s);
 611     assert((byte & mask) == byte);
 612
 613 #ifndef EBCDIC
 614
 615     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 616                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 617                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 618     {
 619         PERL_UINTMAX_T word, mask_word;
 620
 621         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 622             if (((*s) & mask) == byte) {
 623                 return s;
 624             }
 625             s++;
 626         }
 627
 628         word      = PERL_COUNT_MULTIPLIER * byte;
 629         mask_word = PERL_COUNT_MULTIPLIER * mask;
 630
 631         do {
 632             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 633
 634             /* If 'masked' contains bytes with the bit pattern of 'byte' within
 635              * it, xoring with 'word' will leave each of the 8 bits in such
 636              * bytes be 0, and no byte containing any other bit pattern will be
 637              * 0. */
 638             masked ^= word;
 639
 640             /* This causes the most significant bit to be set to 1 for any
 641              * bytes in the word that aren't completely 0 */
 642             masked |= masked << 1;
 643             masked |= masked << 2;
 644             masked |= masked << 4;
 645
 646             /* The msbits are the same as what marks a byte as variant, so we
 647              * can use this mask.  If all msbits are 1, the word doesn't
 648              * contain 'byte' */
 649             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 650                 s += PERL_WORDSIZE;
 651                 continue;
 652             }
 653
 654             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 655              * and any that are, are 0.  Complement and re-AND to swap that */
 656             masked = ~ masked;
 657             masked &= PERL_VARIANTS_WORD_MASK;
 658
 659             /* This reduces the problem to that solved by this function */
 660             s += _variant_byte_number(masked);
 661             return s;
 662
 663         } while (s + PERL_WORDSIZE <= send);
 664     }
 665
 666 #endif
 667
 668     while (s < send) {
 669         if (((*s) & mask) == byte) {
 670             return s;
 671         }
 672         s++;
 673     }
 674
 675     return s;
 676 }
 677
 678 STATIC U8 *
 679 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 680 {
 681     /* Returns the position of the first byte in the sequence between 's' and
 682      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 683      * 'span_byte' should have been ANDed with 'mask' in the call of this
 684      * function.  Returns 'send' if none found.  Works like find_span_end(),
 685      * except for the AND */
 686
 687     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 688
 689     assert(send >= s);
 690     assert((span_byte & mask) == span_byte);
 691
 692     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 693                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 694                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 695     {
 696         PERL_UINTMAX_T span_word, mask_word;
 697
 698         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 699             if (((*s) & mask) != span_byte) {
 700                 return s;
 701             }
 702             s++;
 703         }
 704
 705         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 706         mask_word = PERL_COUNT_MULTIPLIER * mask;
 707
 708         do {
 709             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 710
 711             if (masked == span_word) {
 712                 s += PERL_WORDSIZE;
 713                 continue;
 714             }
 715
 716 #ifdef EBCDIC
 717
 718             break;
 719
 720 #else
 721
 722             masked ^= span_word;
 723             masked |= masked << 1;
 724             masked |= masked << 2;
 725             masked |= masked << 4;
 726             return s + _variant_byte_number(masked);
 727
 728 #endif
 729
 730         } while (s + PERL_WORDSIZE <= send);
 731     }
 732
 733     while (s < send) {
 734         if (((*s) & mask) != span_byte) {
 735             return s;
 736         }
 737         s++;
 738     }
 739
 740     return s;
 741 }
 742
 743 /*
 744  * pregexec and friends
 745  */
 746
 747 #ifndef PERL_IN_XSUB_RE
 748 /*
 749  - pregexec - match a regexp against a string
 750  */
 751 I32
 752 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 753          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 754 /* stringarg: the point in the string at which to begin matching */
 755 /* strend:    pointer to null at end of string */
 756 /* strbeg:    real beginning of string */
 757 /* minend:    end of match must be >= minend bytes after stringarg. */
 758 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 759  *            itself is accessed via the pointers above */
 760 /* nosave:    For optimizations. */
 761 {
 762     PERL_ARGS_ASSERT_PREGEXEC;
 763
 764     return
 765         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 766                       nosave ? 0 : REXEC_COPY_STR);
 767 }
 768 #endif
 769
 770
 771
 772 /* re_intuit_start():
 773  *
 774  * Based on some optimiser hints, try to find the earliest position in the
 775  * string where the regex could match.
 776  *
 777  *   rx:     the regex to match against
 778  *   sv:     the SV being matched: only used for utf8 flag; the string
 779  *           itself is accessed via the pointers below. Note that on
 780  *           something like an overloaded SV, SvPOK(sv) may be false
 781  *           and the string pointers may point to something unrelated to
 782  *           the SV itself.
 783  *   strbeg: real beginning of string
 784  *   strpos: the point in the string at which to begin matching
 785  *   strend: pointer to the byte following the last char of the string
 786  *   flags   currently unused; set to 0
 787  *   data:   currently unused; set to NULL
 788  *
 789  * The basic idea of re_intuit_start() is to use some known information
 790  * about the pattern, namely:
 791  *
 792  *   a) the longest known anchored substring (i.e. one that's at a
 793  *      constant offset from the beginning of the pattern; but not
 794  *      necessarily at a fixed offset from the beginning of the
 795  *      string);
 796  *   b) the longest floating substring (i.e. one that's not at a constant
 797  *      offset from the beginning of the pattern);
 798  *   c) Whether the pattern is anchored to the string; either
 799  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 800  *      or anchored to pos(): /\G/;
 801  *   d) A start class: a real or synthetic character class which
 802  *      represents which characters are legal at the start of the pattern;
 803  *
 804  * to either quickly reject the match, or to find the earliest position
 805  * within the string at which the pattern might match, thus avoiding
 806  * running the full NFA engine at those earlier locations, only to
 807  * eventually fail and retry further along.
 808  *
 809  * Returns NULL if the pattern can't match, or returns the address within
 810  * the string which is the earliest place the match could occur.
 811  *
 812  * The longest of the anchored and floating substrings is called 'check'
 813  * and is checked first. The other is called 'other' and is checked
 814  * second. The 'other' substring may not be present.  For example,
 815  *
 816  *    /(abc|xyz)ABC\d{0,3}DEFG/
 817  *
 818  * will have
 819  *
 820  *   check substr (float)    = "DEFG", offset 6..9 chars
 821  *   other substr (anchored) = "ABC",  offset 3..3 chars
 822  *   stclass = [ax]
 823  *
 824  * Be aware that during the course of this function, sometimes 'anchored'
 825  * refers to a substring being anchored relative to the start of the
 826  * pattern, and sometimes to the pattern itself being anchored relative to
 827  * the string. For example:
 828  *
 829  *   /\dabc/:   "abc" is anchored to the pattern;
 830  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 831  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 832  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 833  *                    but the pattern is anchored to the string.
 834  */
 835
 836 char *
 837 Perl_re_intuit_start(pTHX_
 838                     REGEXP * const rx,
 839                     SV *sv,
 840                     const char * const strbeg,
 841                     char *strpos,
 842                     char *strend,
 843                     const U32 flags,
 844                     re_scream_pos_data *data)
 845 {
 846     struct regexp *const prog = ReANY(rx);
 847     SSize_t start_shift = prog->check_offset_min;
 848     /* Should be nonnegative! */
 849     SSize_t end_shift   = 0;
 850     /* current lowest pos in string where the regex can start matching */
 851     char *rx_origin = strpos;
 852     SV *check;
 853     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 854     U8   other_ix = 1 - prog->substrs->check_ix;
 855     bool ml_anch = 0;
 856     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 857     char *check_at = NULL;              /* check substr found at this pos */
 858     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 859     RXi_GET_DECL(prog,progi);
 860     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 861     regmatch_info *const reginfo = &reginfo_buf;
 862     GET_RE_DEBUG_FLAGS_DECL;
 863
 864     PERL_ARGS_ASSERT_RE_INTUIT_START;
 865     PERL_UNUSED_ARG(flags);
 866     PERL_UNUSED_ARG(data);
 867
 868     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 869                 "Intuit: trying to determine minimum start position...\n"));
 870
 871     /* for now, assume that all substr offsets are positive. If at some point
 872      * in the future someone wants to do clever things with lookbehind and
 873      * -ve offsets, they'll need to fix up any code in this function
 874      * which uses these offsets. See the thread beginning
 875      * <20140113145929.GF27210@iabyn.com>
 876      */
 877     assert(prog->substrs->data[0].min_offset >= 0);
 878     assert(prog->substrs->data[0].max_offset >= 0);
 879     assert(prog->substrs->data[1].min_offset >= 0);
 880     assert(prog->substrs->data[1].max_offset >= 0);
 881     assert(prog->substrs->data[2].min_offset >= 0);
 882     assert(prog->substrs->data[2].max_offset >= 0);
 883
 884     /* for now, assume that if both present, that the floating substring
 885      * doesn't start before the anchored substring.
 886      * If you break this assumption (e.g. doing better optimisations
 887      * with lookahead/behind), then you'll need to audit the code in this
 888      * function carefully first
 889      */
 890     assert(
 891             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 892               && (prog->float_utf8    || prog->float_substr))
 893            || (prog->float_min_offset >= prog->anchored_offset));
 894
 895     /* byte rather than char calculation for efficiency. It fails
 896      * to quickly reject some cases that can't match, but will reject
 897      * them later after doing full char arithmetic */
 898     if (prog->minlen > strend - strpos) {
 899         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 900                               "  String too short...\n"));
 901         goto fail;
 902     }
 903
 904     RXp_MATCH_UTF8_set(prog, utf8_target);
 905     reginfo->is_utf8_target = cBOOL(utf8_target);
 906     reginfo->info_aux = NULL;
 907     reginfo->strbeg = strbeg;
 908     reginfo->strend = strend;
 909     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 910     reginfo->intuit = 1;
 911     /* not actually used within intuit, but zero for safety anyway */
 912     reginfo->poscache_maxiter = 0;
 913
 914     if (utf8_target) {
 915         if ((!prog->anchored_utf8 && prog->anchored_substr)
 916                 || (!prog->float_utf8 && prog->float_substr))
 917             to_utf8_substr(prog);
 918         check = prog->check_utf8;
 919     } else {
 920         if (!prog->check_substr && prog->check_utf8) {
 921             if (! to_byte_substr(prog)) {
 922                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 923             }
 924         }
 925         check = prog->check_substr;
 926     }
 927
 928     /* dump the various substring data */
 929     DEBUG_OPTIMISE_MORE_r({
 930         int i;
 931         for (i=0; i<=2; i++) {
 932             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 933                                   : prog->substrs->data[i].substr);
 934             if (!sv)
 935                 continue;
 936
 937             Perl_re_printf( aTHX_
 938                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 939                 " useful=%" IVdf " utf8=%d [%s]\n",
 940                 i,
 941                 (IV)prog->substrs->data[i].min_offset,
 942                 (IV)prog->substrs->data[i].max_offset,
 943                 (IV)prog->substrs->data[i].end_shift,
 944                 BmUSEFUL(sv),
 945                 utf8_target ? 1 : 0,
 946                 SvPEEK(sv));
 947         }
 948     });
 949
 950     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 951
 952         /* ml_anch: check after \n?
 953          *
 954          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 955          * with /.*.../, these flags will have been added by the
 956          * compiler:
 957          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 958          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 959          */
 960         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 961                    && !(prog->intflags & PREGf_IMPLICIT);
 962
 963         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 964             /* we are only allowed to match at BOS or \G */
 965
 966             /* trivially reject if there's a BOS anchor and we're not at BOS.
 967              *
 968              * Note that we don't try to do a similar quick reject for
 969              * \G, since generally the caller will have calculated strpos
 970              * based on pos() and gofs, so the string is already correctly
 971              * anchored by definition; and handling the exceptions would
 972              * be too fiddly (e.g. REXEC_IGNOREPOS).
 973              */
 974             if (   strpos != strbeg
 975                 && (prog->intflags & PREGf_ANCH_SBOL))
 976             {
 977                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 978                                 "  Not at start...\n"));
 979                 goto fail;
 980             }
 981
 982             /* in the presence of an anchor, the anchored (relative to the
 983              * start of the regex) substr must also be anchored relative
 984              * to strpos. So quickly reject if substr isn't found there.
 985              * This works for \G too, because the caller will already have
 986              * subtracted gofs from pos, and gofs is the offset from the
 987              * \G to the start of the regex. For example, in /.abc\Gdef/,
 988              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 989              * caller will have set strpos=pos()-4; we look for the substr
 990              * at position pos()-4+1, which lines up with the "a" */
 991
 992             if (prog->check_offset_min == prog->check_offset_max) {
 993                 /* Substring at constant offset from beg-of-str... */
 994                 SSize_t slen = SvCUR(check);
 995                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 996
 997                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 998                     "  Looking for check substr at fixed offset %" IVdf "...\n",
 999                     (IV)prog->check_offset_min));
1000
1001                 if (SvTAIL(check)) {
1002                     /* In this case, the regex is anchored at the end too.
1003                      * Unless it's a multiline match, the lengths must match
1004                      * exactly, give or take a \n.  NB: slen >= 1 since
1005                      * the last char of check is \n */
1006                     if (!multiline
1007                         && (   strend - s > slen
1008                             || strend - s < slen - 1
1009                             || (strend - s == slen && strend[-1] != '\n')))
1010                     {
1011                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1012                                             "  String too long...\n"));
1013                         goto fail_finish;
1014                     }
1015                     /* Now should match s[0..slen-2] */
1016                     slen--;
1017                 }
1018                 if (slen && (strend - s < slen
1019                     || *SvPVX_const(check) != *s
1020                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1021                 {
1022                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1023                                     "  String not equal...\n"));
1024                     goto fail_finish;
1025                 }
1026
1027                 check_at = s;
1028                 goto success_at_start;
1029             }
1030         }
1031     }
1032
1033     end_shift = prog->check_end_shift;
1034
1035 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1036     if (end_shift < 0)
1037         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1038                    (IV)end_shift, RX_PRECOMP(rx));
1039 #endif
1040
1041   restart:
1042
1043     /* This is the (re)entry point of the main loop in this function.
1044      * The goal of this loop is to:
1045      * 1) find the "check" substring in the region rx_origin..strend
1046      *    (adjusted by start_shift / end_shift). If not found, reject
1047      *    immediately.
1048      * 2) If it exists, look for the "other" substr too if defined; for
1049      *    example, if the check substr maps to the anchored substr, then
1050      *    check the floating substr, and vice-versa. If not found, go
1051      *    back to (1) with rx_origin suitably incremented.
1052      * 3) If we find an rx_origin position that doesn't contradict
1053      *    either of the substrings, then check the possible additional
1054      *    constraints on rx_origin of /^.../m or a known start class.
1055      *    If these fail, then depending on which constraints fail, jump
1056      *    back to here, or to various other re-entry points further along
1057      *    that skip some of the first steps.
1058      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1059      *    substring. If the start position was determined to be at the
1060      *    beginning of the string  - so, not rejected, but not optimised,
1061      *    since we have to run regmatch from position 0 - decrement the
1062      *    BmUSEFUL() count. Otherwise increment it.
1063      */
1064
1065
1066     /* first, look for the 'check' substring */
1067
1068     {
1069         U8* start_point;
1070         U8* end_point;
1071
1072         DEBUG_OPTIMISE_MORE_r({
1073             Perl_re_printf( aTHX_
1074                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1075                 " Start shift: %" IVdf " End shift %" IVdf
1076                 " Real end Shift: %" IVdf "\n",
1077                 (IV)(rx_origin - strbeg),
1078                 (IV)prog->check_offset_min,
1079                 (IV)start_shift,
1080                 (IV)end_shift,
1081                 (IV)prog->check_end_shift);
1082         });
1083
1084         end_point = HOPBACK3(strend, end_shift, rx_origin);
1085         if (!end_point)
1086             goto fail_finish;
1087         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1088         if (!start_point)
1089             goto fail_finish;
1090
1091
1092         /* If the regex is absolutely anchored to either the start of the
1093          * string (SBOL) or to pos() (ANCH_GPOS), then
1094          * check_offset_max represents an upper bound on the string where
1095          * the substr could start. For the ANCH_GPOS case, we assume that
1096          * the caller of intuit will have already set strpos to
1097          * pos()-gofs, so in this case strpos + offset_max will still be
1098          * an upper bound on the substr.
1099          */
1100         if (!ml_anch
1101             && prog->intflags & PREGf_ANCH
1102             && prog->check_offset_max != SSize_t_MAX)
1103         {
1104             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1105             const char * const anchor =
1106                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1107             SSize_t targ_len = (char*)end_point - anchor;
1108
1109             if (check_len > targ_len) {
1110                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1111                               "Target string too short to match required substring...\n"));
1112                 goto fail_finish;
1113             }
1114
1115             /* do a bytes rather than chars comparison. It's conservative;
1116              * so it skips doing the HOP if the result can't possibly end
1117              * up earlier than the old value of end_point.
1118              */
1119             assert(anchor + check_len <= (char *)end_point);
1120             if (prog->check_offset_max + check_len < targ_len) {
1121                 end_point = HOP3lim((U8*)anchor,
1122                                 prog->check_offset_max,
1123                                 end_point - check_len
1124                             )
1125                             + check_len;
1126                 if (end_point < start_point)
1127                     goto fail_finish;
1128             }
1129         }
1130
1131         check_at = fbm_instr( start_point, end_point,
1132                       check, multiline ? FBMrf_MULTILINE : 0);
1133
1134         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1135             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1136             (IV)((char*)start_point - strbeg),
1137             (IV)((char*)end_point   - strbeg),
1138             (IV)(check_at ? check_at - strbeg : -1)
1139         ));
1140
1141         /* Update the count-of-usability, remove useless subpatterns,
1142             unshift s.  */
1143
1144         DEBUG_EXECUTE_r({
1145             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1146                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1147             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1148                               (check_at ? "Found" : "Did not find"),
1149                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1150                     ? "anchored" : "floating"),
1151                 quoted,
1152                 RE_SV_TAIL(check),
1153                 (check_at ? " at offset " : "...\n") );
1154         });
1155
1156         if (!check_at)
1157             goto fail_finish;
1158         /* set rx_origin to the minimum position where the regex could start
1159          * matching, given the constraint of the just-matched check substring.
1160          * But don't set it lower than previously.
1161          */
1162
1163         if (check_at - rx_origin > prog->check_offset_max)
1164             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1165         /* Finish the diagnostic message */
1166         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1167             "%ld (rx_origin now %" IVdf ")...\n",
1168             (long)(check_at - strbeg),
1169             (IV)(rx_origin - strbeg)
1170         ));
1171     }
1172
1173
1174     /* now look for the 'other' substring if defined */
1175
1176     if (prog->substrs->data[other_ix].utf8_substr
1177         || prog->substrs->data[other_ix].substr)
1178     {
1179         /* Take into account the "other" substring. */
1180         char *last, *last1;
1181         char *s;
1182         SV* must;
1183         struct reg_substr_datum *other;
1184
1185       do_other_substr:
1186         other = &prog->substrs->data[other_ix];
1187         if (!utf8_target && !other->substr) {
1188             if (!to_byte_substr(prog)) {
1189                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1190             }
1191         }
1192
1193         /* if "other" is anchored:
1194          * we've previously found a floating substr starting at check_at.
1195          * This means that the regex origin must lie somewhere
1196          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1197          * and max:                 HOP3(check_at, -check_offset_min)
1198          * (except that min will be >= strpos)
1199          * So the fixed  substr must lie somewhere between
1200          *  HOP3(min, anchored_offset)
1201          *  HOP3(max, anchored_offset) + SvCUR(substr)
1202          */
1203
1204         /* if "other" is floating
1205          * Calculate last1, the absolute latest point where the
1206          * floating substr could start in the string, ignoring any
1207          * constraints from the earlier fixed match. It is calculated
1208          * as follows:
1209          *
1210          * strend - prog->minlen (in chars) is the absolute latest
1211          * position within the string where the origin of the regex
1212          * could appear. The latest start point for the floating
1213          * substr is float_min_offset(*) on from the start of the
1214          * regex.  last1 simply combines thee two offsets.
1215          *
1216          * (*) You might think the latest start point should be
1217          * float_max_offset from the regex origin, and technically
1218          * you'd be correct. However, consider
1219          *    /a\d{2,4}bcd\w/
1220          * Here, float min, max are 3,5 and minlen is 7.
1221          * This can match either
1222          *    /a\d\dbcd\w/
1223          *    /a\d\d\dbcd\w/
1224          *    /a\d\d\d\dbcd\w/
1225          * In the first case, the regex matches minlen chars; in the
1226          * second, minlen+1, in the third, minlen+2.
1227          * In the first case, the floating offset is 3 (which equals
1228          * float_min), in the second, 4, and in the third, 5 (which
1229          * equals float_max). In all cases, the floating string bcd
1230          * can never start more than 4 chars from the end of the
1231          * string, which equals minlen - float_min. As the substring
1232          * starts to match more than float_min from the start of the
1233          * regex, it makes the regex match more than minlen chars,
1234          * and the two cancel each other out. So we can always use
1235          * float_min - minlen, rather than float_max - minlen for the
1236          * latest position in the string.
1237          *
1238          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1239          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1240          */
1241
1242         assert(prog->minlen >= other->min_offset);
1243         last1 = HOP3c(strend,
1244                         other->min_offset - prog->minlen, strbeg);
1245
1246         if (other_ix) {/* i.e. if (other-is-float) */
1247             /* last is the latest point where the floating substr could
1248              * start, *given* any constraints from the earlier fixed
1249              * match. This constraint is that the floating string starts
1250              * <= float_max_offset chars from the regex origin (rx_origin).
1251              * If this value is less than last1, use it instead.
1252              */
1253             assert(rx_origin <= last1);
1254             last =
1255                 /* this condition handles the offset==infinity case, and
1256                  * is a short-cut otherwise. Although it's comparing a
1257                  * byte offset to a char length, it does so in a safe way,
1258                  * since 1 char always occupies 1 or more bytes,
1259                  * so if a string range is  (last1 - rx_origin) bytes,
1260                  * it will be less than or equal to  (last1 - rx_origin)
1261                  * chars; meaning it errs towards doing the accurate HOP3
1262                  * rather than just using last1 as a short-cut */
1263                 (last1 - rx_origin) < other->max_offset
1264                     ? last1
1265                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1266         }
1267         else {
1268             assert(strpos + start_shift <= check_at);
1269             last = HOP4c(check_at, other->min_offset - start_shift,
1270                         strbeg, strend);
1271         }
1272
1273         s = HOP3c(rx_origin, other->min_offset, strend);
1274         if (s < other_last)     /* These positions already checked */
1275             s = other_last;
1276
1277         must = utf8_target ? other->utf8_substr : other->substr;
1278         assert(SvPOK(must));
1279         {
1280             char *from = s;
1281             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1282
1283             if (to > strend)
1284                 to = strend;
1285             if (from > to) {
1286                 s = NULL;
1287                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1288                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1289                     (IV)(from - strbeg),
1290                     (IV)(to   - strbeg)
1291                 ));
1292             }
1293             else {
1294                 s = fbm_instr(
1295                     (unsigned char*)from,
1296                     (unsigned char*)to,
1297                     must,
1298                     multiline ? FBMrf_MULTILINE : 0
1299                 );
1300                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1301                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1302                     (IV)(from - strbeg),
1303                     (IV)(to   - strbeg),
1304                     (IV)(s ? s - strbeg : -1)
1305                 ));
1306             }
1307         }
1308
1309         DEBUG_EXECUTE_r({
1310             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1311                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1312             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1313                 s ? "Found" : "Contradicts",
1314                 other_ix ? "floating" : "anchored",
1315                 quoted, RE_SV_TAIL(must));
1316         });
1317
1318
1319         if (!s) {
1320             /* last1 is latest possible substr location. If we didn't
1321              * find it before there, we never will */
1322             if (last >= last1) {
1323                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1324                                         "; giving up...\n"));
1325                 goto fail_finish;
1326             }
1327
1328             /* try to find the check substr again at a later
1329              * position. Maybe next time we'll find the "other" substr
1330              * in range too */
1331             other_last = HOP3c(last, 1, strend) /* highest failure */;
1332             rx_origin =
1333                 other_ix /* i.e. if other-is-float */
1334                     ? HOP3c(rx_origin, 1, strend)
1335                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1336             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1337                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1338                 (other_ix ? "floating" : "anchored"),
1339                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1340                 (IV)(rx_origin - strbeg)
1341             ));
1342             goto restart;
1343         }
1344         else {
1345             if (other_ix) { /* if (other-is-float) */
1346                 /* other_last is set to s, not s+1, since its possible for
1347                  * a floating substr to fail first time, then succeed
1348                  * second time at the same floating position; e.g.:
1349                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1350                  * The first time round, anchored and float match at
1351                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1352                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1353                  */
1354                 other_last = s;
1355             }
1356             else {
1357                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1358                 other_last = HOP3c(s, 1, strend);
1359             }
1360             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1361                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1362                   (long)(s - strbeg),
1363                 (IV)(rx_origin - strbeg)
1364               ));
1365
1366         }
1367     }
1368     else {
1369         DEBUG_OPTIMISE_MORE_r(
1370             Perl_re_printf( aTHX_
1371                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1372                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1373                 " strend:%" IVdf "\n",
1374                 (IV)prog->check_offset_min,
1375                 (IV)prog->check_offset_max,
1376                 (IV)(check_at-strbeg),
1377                 (IV)(rx_origin-strbeg),
1378                 (IV)(rx_origin-check_at),
1379                 (IV)(strend-strbeg)
1380             )
1381         );
1382     }
1383
1384   postprocess_substr_matches:
1385
1386     /* handle the extra constraint of /^.../m if present */
1387
1388     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1389         char *s;
1390
1391         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1392                         "  looking for /^/m anchor"));
1393
1394         /* we have failed the constraint of a \n before rx_origin.
1395          * Find the next \n, if any, even if it's beyond the current
1396          * anchored and/or floating substrings. Whether we should be
1397          * scanning ahead for the next \n or the next substr is debatable.
1398          * On the one hand you'd expect rare substrings to appear less
1399          * often than \n's. On the other hand, searching for \n means
1400          * we're effectively flipping between check_substr and "\n" on each
1401          * iteration as the current "rarest" string candidate, which
1402          * means for example that we'll quickly reject the whole string if
1403          * hasn't got a \n, rather than trying every substr position
1404          * first
1405          */
1406
1407         s = HOP3c(strend, - prog->minlen, strpos);
1408         if (s <= rx_origin ||
1409             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1410         {
1411             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1412                             "  Did not find /%s^%s/m...\n",
1413                             PL_colors[0], PL_colors[1]));
1414             goto fail_finish;
1415         }
1416
1417         /* earliest possible origin is 1 char after the \n.
1418          * (since *rx_origin == '\n', it's safe to ++ here rather than
1419          * HOP(rx_origin, 1)) */
1420         rx_origin++;
1421
1422         if (prog->substrs->check_ix == 0  /* check is anchored */
1423             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1424         {
1425             /* Position contradicts check-string; either because
1426              * check was anchored (and thus has no wiggle room),
1427              * or check was float and rx_origin is above the float range */
1428             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1429                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1430                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1431             goto restart;
1432         }
1433
1434         /* if we get here, the check substr must have been float,
1435          * is in range, and we may or may not have had an anchored
1436          * "other" substr which still contradicts */
1437         assert(prog->substrs->check_ix); /* check is float */
1438
1439         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1440             /* whoops, the anchored "other" substr exists, so we still
1441              * contradict. On the other hand, the float "check" substr
1442              * didn't contradict, so just retry the anchored "other"
1443              * substr */
1444             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1445                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1446                 PL_colors[0], PL_colors[1],
1447                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1448                 (IV)(rx_origin - strbeg)
1449             ));
1450             goto do_other_substr;
1451         }
1452
1453         /* success: we don't contradict the found floating substring
1454          * (and there's no anchored substr). */
1455         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1456             "  Found /%s^%s/m with rx_origin %ld...\n",
1457             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1458     }
1459     else {
1460         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1461             "  (multiline anchor test skipped)\n"));
1462     }
1463
1464   success_at_start:
1465
1466
1467     /* if we have a starting character class, then test that extra constraint.
1468      * (trie stclasses are too expensive to use here, we are better off to
1469      * leave it to regmatch itself) */
1470
1471     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1472         const U8* const str = (U8*)STRING(progi->regstclass);
1473
1474         /* XXX this value could be pre-computed */
1475         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1476                     ?  (reginfo->is_utf8_pat
1477                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1478                         : STR_LEN(progi->regstclass))
1479                     : 1);
1480         char * endpos;
1481         char *s;
1482         /* latest pos that a matching float substr constrains rx start to */
1483         char *rx_max_float = NULL;
1484
1485         /* if the current rx_origin is anchored, either by satisfying an
1486          * anchored substring constraint, or a /^.../m constraint, then we
1487          * can reject the current origin if the start class isn't found
1488          * at the current position. If we have a float-only match, then
1489          * rx_origin is constrained to a range; so look for the start class
1490          * in that range. if neither, then look for the start class in the
1491          * whole rest of the string */
1492
1493         /* XXX DAPM it's not clear what the minlen test is for, and why
1494          * it's not used in the floating case. Nothing in the test suite
1495          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1496          * Here are some old comments, which may or may not be correct:
1497          *
1498          *   minlen == 0 is possible if regstclass is \b or \B,
1499          *   and the fixed substr is ''$.
1500          *   Since minlen is already taken into account, rx_origin+1 is
1501          *   before strend; accidentally, minlen >= 1 guaranties no false
1502          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1503          *   0) below assumes that regstclass does not come from lookahead...
1504          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1505          *   This leaves EXACTF-ish only, which are dealt with in
1506          *   find_byclass().
1507          */
1508
1509         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1510             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1511         else if (prog->float_substr || prog->float_utf8) {
1512             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1513             endpos = HOP3clim(rx_max_float, cl_l, strend);
1514         }
1515         else
1516             endpos= strend;
1517
1518         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1519             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1520             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1521               (IV)start_shift, (IV)(check_at - strbeg),
1522               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1523
1524         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1525                             reginfo);
1526         if (!s) {
1527             if (endpos == strend) {
1528                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1529                                 "  Could not match STCLASS...\n") );
1530                 goto fail;
1531             }
1532             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1533                                "  This position contradicts STCLASS...\n") );
1534             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1535                         && !(prog->intflags & PREGf_IMPLICIT))
1536                 goto fail;
1537
1538             /* Contradict one of substrings */
1539             if (prog->anchored_substr || prog->anchored_utf8) {
1540                 if (prog->substrs->check_ix == 1) { /* check is float */
1541                     /* Have both, check_string is floating */
1542                     assert(rx_origin + start_shift <= check_at);
1543                     if (rx_origin + start_shift != check_at) {
1544                         /* not at latest position float substr could match:
1545                          * Recheck anchored substring, but not floating.
1546                          * The condition above is in bytes rather than
1547                          * chars for efficiency. It's conservative, in
1548                          * that it errs on the side of doing 'goto
1549                          * do_other_substr'. In this case, at worst,
1550                          * an extra anchored search may get done, but in
1551                          * practice the extra fbm_instr() is likely to
1552                          * get skipped anyway. */
1553                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1554                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1555                             (long)(other_last - strbeg),
1556                             (IV)(rx_origin - strbeg)
1557                         ));
1558                         goto do_other_substr;
1559                     }
1560                 }
1561             }
1562             else {
1563                 /* float-only */
1564
1565                 if (ml_anch) {
1566                     /* In the presence of ml_anch, we might be able to
1567                      * find another \n without breaking the current float
1568                      * constraint. */
1569
1570                     /* strictly speaking this should be HOP3c(..., 1, ...),
1571                      * but since we goto a block of code that's going to
1572                      * search for the next \n if any, its safe here */
1573                     rx_origin++;
1574                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1575                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1576                               PL_colors[0], PL_colors[1],
1577                               (long)(rx_origin - strbeg)) );
1578                     goto postprocess_substr_matches;
1579                 }
1580
1581                 /* strictly speaking this can never be true; but might
1582                  * be if we ever allow intuit without substrings */
1583                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1584                     goto fail;
1585
1586                 rx_origin = rx_max_float;
1587             }
1588
1589             /* at this point, any matching substrings have been
1590              * contradicted. Start again... */
1591
1592             rx_origin = HOP3c(rx_origin, 1, strend);
1593
1594             /* uses bytes rather than char calculations for efficiency.
1595              * It's conservative: it errs on the side of doing 'goto restart',
1596              * where there is code that does a proper char-based test */
1597             if (rx_origin + start_shift + end_shift > strend) {
1598                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1599                                        "  Could not match STCLASS...\n") );
1600                 goto fail;
1601             }
1602             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1603                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1604                 (prog->substrs->check_ix ? "floating" : "anchored"),
1605                 (long)(rx_origin + start_shift - strbeg),
1606                 (IV)(rx_origin - strbeg)
1607             ));
1608             goto restart;
1609         }
1610
1611         /* Success !!! */
1612
1613         if (rx_origin != s) {
1614             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1615                         "  By STCLASS: moving %ld --> %ld\n",
1616                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1617                    );
1618         }
1619         else {
1620             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1621                                   "  Does not contradict STCLASS...\n");
1622                    );
1623         }
1624     }
1625
1626     /* Decide whether using the substrings helped */
1627
1628     if (rx_origin != strpos) {
1629         /* Fixed substring is found far enough so that the match
1630            cannot start at strpos. */
1631
1632         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1633         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1634     }
1635     else {
1636         /* The found rx_origin position does not prohibit matching at
1637          * strpos, so calling intuit didn't gain us anything. Decrement
1638          * the BmUSEFUL() count on the check substring, and if we reach
1639          * zero, free it.  */
1640         if (!(prog->intflags & PREGf_NAUGHTY)
1641             && (utf8_target ? (
1642                 prog->check_utf8                /* Could be deleted already */
1643                 && --BmUSEFUL(prog->check_utf8) < 0
1644                 && (prog->check_utf8 == prog->float_utf8)
1645             ) : (
1646                 prog->check_substr              /* Could be deleted already */
1647                 && --BmUSEFUL(prog->check_substr) < 0
1648                 && (prog->check_substr == prog->float_substr)
1649             )))
1650         {
1651             /* If flags & SOMETHING - do not do it many times on the same match */
1652             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1653             /* XXX Does the destruction order has to change with utf8_target? */
1654             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1655             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1656             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1657             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1658             check = NULL;                       /* abort */
1659             /* XXXX This is a remnant of the old implementation.  It
1660                     looks wasteful, since now INTUIT can use many
1661                     other heuristics. */
1662             prog->extflags &= ~RXf_USE_INTUIT;
1663         }
1664     }
1665
1666     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1667             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1668              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1669
1670     return rx_origin;
1671
1672   fail_finish:                          /* Substring not found */
1673     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1674         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1675   fail:
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1677                           PL_colors[4], PL_colors[5]));
1678     return NULL;
1679 }
1680
1681
1682 #define DECL_TRIE_TYPE(scan) \
1683     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1684                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1685                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1686                     trie_type = ((scan->flags == EXACT)                             \
1687                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1688                                  : (scan->flags == EXACTL)                          \
1689                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1690                                     : (scan->flags == EXACTFAA)                     \
1691                                       ? (utf8_target                                \
1692                                          ? trie_utf8_exactfa_fold                   \
1693                                          : trie_latin_utf8_exactfa_fold)            \
1694                                       : (scan->flags == EXACTFLU8                   \
1695                                          ? (utf8_target                             \
1696                                            ? trie_flu8                              \
1697                                            : trie_flu8_latin)                       \
1698                                          : (utf8_target                             \
1699                                            ? trie_utf8_fold                         \
1700                                            : trie_latin_utf8_fold)))
1701
1702 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1703  * 'foldbuf+sizeof(foldbuf)' */
1704 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1705 STMT_START {                                                                        \
1706     STRLEN skiplen;                                                                 \
1707     U8 flags = FOLD_FLAGS_FULL;                                                     \
1708     switch (trie_type) {                                                            \
1709     case trie_flu8:                                                                 \
1710         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1711         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1712             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1713         }                                                                           \
1714         goto do_trie_utf8_fold;                                                     \
1715     case trie_utf8_exactfa_fold:                                                    \
1716         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1717         /* FALLTHROUGH */                                                           \
1718     case trie_utf8_fold:                                                            \
1719       do_trie_utf8_fold:                                                            \
1720         if ( foldlen>0 ) {                                                          \
1721             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1722             foldlen -= len;                                                         \
1723             uscan += len;                                                           \
1724             len=0;                                                                  \
1725         } else {                                                                    \
1726             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1727                                                                             flags); \
1728             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1729             skiplen = UVCHR_SKIP( uvc );                                            \
1730             foldlen -= skiplen;                                                     \
1731             uscan = foldbuf + skiplen;                                              \
1732         }                                                                           \
1733         break;                                                                      \
1734     case trie_flu8_latin:                                                           \
1735         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1736         goto do_trie_latin_utf8_fold;                                               \
1737     case trie_latin_utf8_exactfa_fold:                                              \
1738         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1739         /* FALLTHROUGH */                                                           \
1740     case trie_latin_utf8_fold:                                                      \
1741       do_trie_latin_utf8_fold:                                                      \
1742         if ( foldlen>0 ) {                                                          \
1743             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1744             foldlen -= len;                                                         \
1745             uscan += len;                                                           \
1746             len=0;                                                                  \
1747         } else {                                                                    \
1748             len = 1;                                                                \
1749             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1750             skiplen = UVCHR_SKIP( uvc );                                            \
1751             foldlen -= skiplen;                                                     \
1752             uscan = foldbuf + skiplen;                                              \
1753         }                                                                           \
1754         break;                                                                      \
1755     case trie_utf8l:                                                                \
1756         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1757         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1758             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1759         }                                                                           \
1760         /* FALLTHROUGH */                                                           \
1761     case trie_utf8:                                                                 \
1762         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1763         break;                                                                      \
1764     case trie_plain:                                                                \
1765         uvc = (UV)*uc;                                                              \
1766         len = 1;                                                                    \
1767     }                                                                               \
1768     if (uvc < 256) {                                                                \
1769         charid = trie->charmap[ uvc ];                                              \
1770     }                                                                               \
1771     else {                                                                          \
1772         charid = 0;                                                                 \
1773         if (widecharmap) {                                                          \
1774             SV** const svpp = hv_fetch(widecharmap,                                 \
1775                         (char*)&uvc, sizeof(UV), 0);                                \
1776             if (svpp)                                                               \
1777                 charid = (U16)SvIV(*svpp);                                          \
1778         }                                                                           \
1779     }                                                                               \
1780 } STMT_END
1781
1782 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1783     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1784                 startpos, doutf8, depth)
1785
1786 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1787     STMT_START {                                            \
1788         while (s < strend) {                                \
1789             CODE                                            \
1790             s += ((UTF8)                                    \
1791                   ? UTF8_SAFE_SKIP(s, reginfo->strend)      \
1792                   : 1);                                     \
1793         }                                                   \
1794     } STMT_END
1795
1796 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1797     STMT_START {                                            \
1798         while (s < strend) {                                \
1799             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1800         }                                                   \
1801     } STMT_END
1802
1803 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1804     if (COND) {                                                \
1805         FBC_CHECK_AND_TRY                                      \
1806         s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
1807         previous_occurrence_end = s;                           \
1808     }                                                          \
1809     else {                                                     \
1810         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1811     }
1812
1813 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1814     if (utf8_target) {                                         \
1815         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1816     }                                                          \
1817     else {                                                     \
1818         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1819     }
1820
1821 /* We keep track of where the next character should start after an occurrence
1822  * of the one we're looking for.  Knowing that, we can see right away if the
1823  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1824  * don't accept the 2nd and succeeding adjacent occurrences */
1825 #define FBC_CHECK_AND_TRY                                           \
1826         if (   (   doevery                                          \
1827                 || s != previous_occurrence_end)                    \
1828             && (   reginfo->intuit                                  \
1829                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1830         {                                                           \
1831             goto got_it;                                            \
1832         }
1833
1834
1835 /* This differs from the above macros in that it calls a function which returns
1836  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1837  * there is no such occurrence. */
1838 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1839     while (s < strend) {                                    \
1840         s = (f);                                            \
1841         if (s >= strend) {                                  \
1842             break;                                          \
1843         }                                                   \
1844                                                             \
1845         FBC_CHECK_AND_TRY                                   \
1846         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1847         previous_occurrence_end = s;                        \
1848     }
1849
1850 /* This differs from the above macros in that it is passed a single byte that
1851  * is known to begin the next occurrence of the thing being looked for in 's'.
1852  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1853  * at that position. */
1854 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
1855     while (s < strend) {                                    \
1856         s = (char *) memchr(s, byte, strend -s);            \
1857         if (s == NULL) {                                    \
1858             s = (char *) strend;                            \
1859             break;                                          \
1860         }                                                   \
1861                                                             \
1862         if (COND) {                                         \
1863             FBC_CHECK_AND_TRY                               \
1864             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1865             previous_occurrence_end = s;                    \
1866         }                                                   \
1867         else {                                              \
1868             s += UTF8SKIP(s);                               \
1869         }                                                   \
1870     }
1871
1872 /* The three macros below are slightly different versions of the same logic.
1873  *
1874  * The first is for /a and /aa when the target string is UTF-8.  This can only
1875  * match ascii, but it must advance based on UTF-8.   The other two handle the
1876  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1877  * for the boundary (or non-boundary) between a word and non-word character.
1878  * The utf8 and non-utf8 cases have the same logic, but the details must be
1879  * different.  Find the "wordness" of the character just prior to this one, and
1880  * compare it with the wordness of this one.  If they differ, we have a
1881  * boundary.  At the beginning of the string, pretend that the previous
1882  * character was a new-line.
1883  *
1884  * All these macros uncleanly have side-effects with each other and outside
1885  * variables.  So far it's been too much trouble to clean-up
1886  *
1887  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1888  *               a word character or not.
1889  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1890  *               word/non-word
1891  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1892  *
1893  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1894  * are looking for a boundary or for a non-boundary.  If we are looking for a
1895  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1896  * see if this tentative match actually works, and if so, to quit the loop
1897  * here.  And vice-versa if we are looking for a non-boundary.
1898  *
1899  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1900  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1901  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1902  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1903  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1904  * complement.  But in that branch we complement tmp, meaning that at the
1905  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1906  * which means at the top of the loop in the next iteration, it is
1907  * TEST_NON_UTF8(s-1) */
1908 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1909     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1910     tmp = TEST_NON_UTF8(tmp);                                                  \
1911     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1912         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1913             tmp = !tmp;                                                        \
1914             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1915         }                                                                      \
1916         else {                                                                 \
1917             IF_FAIL;                                                           \
1918         }                                                                      \
1919     );                                                                         \
1920
1921 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1922  * TEST_UTF8 is a macro that for the same input code points returns identically
1923  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1924 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1925     if (s == reginfo->strbeg) {                                                \
1926         tmp = '\n';                                                            \
1927     }                                                                          \
1928     else { /* Back-up to the start of the previous character */                \
1929         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1930         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1931                                                        0, UTF8_ALLOW_DEFAULT); \
1932     }                                                                          \
1933     tmp = TEST_UV(tmp);                                                        \
1934     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1935         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1936             tmp = !tmp;                                                        \
1937             IF_SUCCESS;                                                        \
1938         }                                                                      \
1939         else {                                                                 \
1940             IF_FAIL;                                                           \
1941         }                                                                      \
1942     );
1943
1944 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1945  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1946  * macros below */
1947 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1948     if (utf8_target) {                                                         \
1949         UTF8_CODE                                                              \
1950     }                                                                          \
1951     else {  /* Not utf8 */                                                     \
1952         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1953         tmp = TEST_NON_UTF8(tmp);                                              \
1954         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
1955             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1956                 IF_SUCCESS;                                                    \
1957                 tmp = !tmp;                                                    \
1958             }                                                                  \
1959             else {                                                             \
1960                 IF_FAIL;                                                       \
1961             }                                                                  \
1962         );                                                                     \
1963     }                                                                          \
1964     /* Here, things have been set up by the previous code so that tmp is the   \
1965      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1966      * utf8ness of the target).  We also have to check if this matches against \
1967      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1968      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1969      * string */                                                               \
1970     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1971         IF_SUCCESS;                                                            \
1972     }                                                                          \
1973     else {                                                                     \
1974         IF_FAIL;                                                               \
1975     }
1976
1977 /* This is the macro to use when we want to see if something that looks like it
1978  * could match, actually does, and if so exits the loop.  It needs to be used
1979  * only for bounds checking macros, as it allows for matching beyond the end of
1980  * string (which should be zero length without having to look at the string
1981  * contents) */
1982 #define REXEC_FBC_TRYIT                                                     \
1983     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
1984         goto got_it
1985
1986 /* The only difference between the BOUND and NBOUND cases is that
1987  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1988  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1989  * with the other one being empty (PLACEHOLDER is defined as empty).
1990  *
1991  * The TEST_FOO parameters are for operating on different forms of input, but
1992  * all should be ones that return identically for the same underlying code
1993  * points */
1994 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1995     FBC_BOUND_COMMON(                                                          \
1996           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1997           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1998
1999 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2000     FBC_BOUND_COMMON(                                                          \
2001             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2002             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2003
2004 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2005     FBC_BOUND_COMMON(                                                          \
2006           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2007           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2008
2009 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2010     FBC_BOUND_COMMON(                                                          \
2011             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2012             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2013
2014 #ifdef DEBUGGING
2015 static IV
2016 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2017   IV cp_out = _invlist_search(invlist, cp_in);
2018   assert(cp_out >= 0);
2019   return cp_out;
2020 }
2021 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2022         invmap[S_get_break_val_cp_checked(invlist, cp)]
2023 #else
2024 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2025         invmap[_invlist_search(invlist, cp)]
2026 #endif
2027
2028 /* Takes a pointer to an inversion list, a pointer to its corresponding
2029  * inversion map, and a code point, and returns the code point's value
2030  * according to the two arrays.  It assumes that all code points have a value.
2031  * This is used as the base macro for macros for particular properties */
2032 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2033         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2034
2035 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2036  * of a code point, returning the value for the first code point in the string.
2037  * And it takes the particular macro name that finds the desired value given a
2038  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2039 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2040              (__ASSERT_(pos < strend)                                          \
2041                  /* Note assumes is valid UTF-8 */                             \
2042              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2043
2044 /* Returns the GCB value for the input code point */
2045 #define getGCB_VAL_CP(cp)                                                      \
2046           _generic_GET_BREAK_VAL_CP(                                           \
2047                                     PL_GCB_invlist,                            \
2048                                     _Perl_GCB_invmap,                          \
2049                                     (cp))
2050
2051 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2052  * bounded by pos and strend */
2053 #define getGCB_VAL_UTF8(pos, strend)                                           \
2054     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2055
2056 /* Returns the LB value for the input code point */
2057 #define getLB_VAL_CP(cp)                                                       \
2058           _generic_GET_BREAK_VAL_CP(                                           \
2059                                     PL_LB_invlist,                             \
2060                                     _Perl_LB_invmap,                           \
2061                                     (cp))
2062
2063 /* Returns the LB value for the first code point in the UTF-8 encoded string
2064  * bounded by pos and strend */
2065 #define getLB_VAL_UTF8(pos, strend)                                            \
2066     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2067
2068
2069 /* Returns the SB value for the input code point */
2070 #define getSB_VAL_CP(cp)                                                       \
2071           _generic_GET_BREAK_VAL_CP(                                           \
2072                                     PL_SB_invlist,                             \
2073                                     _Perl_SB_invmap,                     \
2074                                     (cp))
2075
2076 /* Returns the SB value for the first code point in the UTF-8 encoded string
2077  * bounded by pos and strend */
2078 #define getSB_VAL_UTF8(pos, strend)                                            \
2079     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2080
2081 /* Returns the WB value for the input code point */
2082 #define getWB_VAL_CP(cp)                                                       \
2083           _generic_GET_BREAK_VAL_CP(                                           \
2084                                     PL_WB_invlist,                             \
2085                                     _Perl_WB_invmap,                         \
2086                                     (cp))
2087
2088 /* Returns the WB value for the first code point in the UTF-8 encoded string
2089  * bounded by pos and strend */
2090 #define getWB_VAL_UTF8(pos, strend)                                            \
2091     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2092
2093 /* We know what class REx starts with.  Try to find this position... */
2094 /* if reginfo->intuit, its a dryrun */
2095 /* annoyingly all the vars in this routine have different names from their counterparts
2096    in regmatch. /grrr */
2097 STATIC char *
2098 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2099     const char *strend, regmatch_info *reginfo)
2100 {
2101     dVAR;
2102
2103     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2104     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2105
2106     char *pat_string;   /* The pattern's exactish string */
2107     char *pat_end;          /* ptr to end char of pat_string */
2108     re_fold_t folder;   /* Function for computing non-utf8 folds */
2109     const U8 *fold_array;   /* array for folding ords < 256 */
2110     STRLEN ln;
2111     STRLEN lnc;
2112     U8 c1;
2113     U8 c2;
2114     char *e = NULL;
2115
2116     /* In some cases we accept only the first occurence of 'x' in a sequence of
2117      * them.  This variable points to just beyond the end of the previous
2118      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2119      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2120      * hop back.) */
2121     char * previous_occurrence_end = 0;
2122
2123     I32 tmp;            /* Scratch variable */
2124     const bool utf8_target = reginfo->is_utf8_target;
2125     UV utf8_fold_flags = 0;
2126     const bool is_utf8_pat = reginfo->is_utf8_pat;
2127     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2128                                    with a result inverts that result, as 0^1 =
2129                                    1 and 1^1 = 0 */
2130     _char_class_number classnum;
2131
2132     RXi_GET_DECL(prog,progi);
2133
2134     PERL_ARGS_ASSERT_FIND_BYCLASS;
2135
2136     /* We know what class it must start with. */
2137     switch (OP(c)) {
2138     case ANYOFPOSIXL:
2139     case ANYOFL:
2140         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2141
2142         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2143             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2144         }
2145
2146         /* FALLTHROUGH */
2147     case ANYOFD:
2148     case ANYOF:
2149         if (utf8_target) {
2150             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2151                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2152         }
2153         else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2154             /* We know that s is in the bitmap range since the target isn't
2155              * UTF-8, so what happens for out-of-range values is not relevant,
2156              * so exclude that from the flags */
2157             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2158         }
2159         else {
2160             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2161         }
2162         break;
2163
2164     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2165         /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
2166          * use 0 */
2167         REXEC_FBC_FIND_NEXT_SCAN(0,
2168          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2169                                    (U8) ARG(c), FLAGS(c)));
2170         break;
2171
2172     case NANYOFM:   /* UTF-8ness does matter because can match UTF-8 variants.
2173                      */
2174         REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
2175          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2176                                    (U8) ARG(c), FLAGS(c)));
2177         break;
2178
2179     case ANYOFH:
2180         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2181             REXEC_FBC_CLASS_SCAN(TRUE,
2182                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2183         }
2184         break;
2185
2186     case ANYOFHb:
2187         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2188
2189             /* We know what the first byte of any matched string should be */
2190             U8 first_byte = FLAGS(c);
2191
2192             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2193                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2194         }
2195         break;
2196
2197     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2198         assert(! is_utf8_pat);
2199         /* FALLTHROUGH */
2200     case EXACTFAA:
2201         if (is_utf8_pat) {
2202             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2203                              |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
2204             goto do_exactf_utf8;
2205         }
2206         else if (utf8_target) {
2207
2208             /* Here, and elsewhere in this file, the reason we can't consider a
2209              * non-UTF-8 pattern already folded in the presence of a UTF-8
2210              * target is because any MICRO SIGN in the pattern won't be folded.
2211              * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
2212              * can consider a non-UTF-8 pattern folded when matching a
2213              * non-UTF-8 target */
2214             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2215             goto do_exactf_utf8;
2216         }
2217
2218         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2219          * which these functions don't handle anyway */
2220         fold_array = PL_fold_latin1;
2221         folder = foldEQ_latin1_s2_folded;
2222         goto do_exactf_non_utf8;
2223
2224     case EXACTF:   /* This node only generated for non-utf8 patterns */
2225         assert(! is_utf8_pat);
2226         if (utf8_target) {
2227             goto do_exactf_utf8;
2228         }
2229         fold_array = PL_fold;
2230         folder = foldEQ;
2231         goto do_exactf_non_utf8;
2232
2233     case EXACTFL:
2234         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2235         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2236             utf8_fold_flags = FOLDEQ_LOCALE;
2237             goto do_exactf_utf8;
2238         }
2239         fold_array = PL_fold_locale;
2240         folder = foldEQ_locale;
2241         goto do_exactf_non_utf8;
2242
2243     case EXACTFUP:      /* Problematic even though pattern isn't UTF-8.  Use
2244                            full functionality normally not done except for
2245                            UTF-8 */
2246         assert(! is_utf8_pat);
2247         goto do_exactf_utf8;
2248
2249     case EXACTFLU8:
2250             if (! utf8_target) {    /* All code points in this node require
2251                                        UTF-8 to express.  */
2252                 break;
2253             }
2254             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2255                                              | FOLDEQ_S2_FOLDS_SANE;
2256             goto do_exactf_utf8;
2257
2258     case EXACTFU_ONLY8:
2259         if (! utf8_target) {
2260             break;
2261         }
2262         assert(is_utf8_pat);
2263         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2264         goto do_exactf_utf8;
2265
2266     case EXACTFU:
2267         if (is_utf8_pat || utf8_target) {
2268             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2269             goto do_exactf_utf8;
2270         }
2271
2272         /* Any 'ss' in the pattern should have been replaced by regcomp,
2273          * so we don't have to worry here about this single special case
2274          * in the Latin1 range */
2275         fold_array = PL_fold_latin1;
2276         folder = foldEQ_latin1_s2_folded;
2277
2278         /* FALLTHROUGH */
2279
2280       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2281                            are no glitches with fold-length differences
2282                            between the target string and pattern */
2283
2284         /* The idea in the non-utf8 EXACTF* cases is to first find the
2285          * first character of the EXACTF* node and then, if necessary,
2286          * case-insensitively compare the full text of the node.  c1 is the
2287          * first character.  c2 is its fold.  This logic will not work for
2288          * Unicode semantics and the german sharp ss, which hence should
2289          * not be compiled into a node that gets here. */
2290         pat_string = STRING(c);
2291         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2292
2293         /* We know that we have to match at least 'ln' bytes (which is the
2294          * same as characters, since not utf8).  If we have to match 3
2295          * characters, and there are only 2 availabe, we know without
2296          * trying that it will fail; so don't start a match past the
2297          * required minimum number from the far end */
2298         e = HOP3c(strend, -((SSize_t)ln), s);
2299         if (e < s)
2300             break;
2301
2302         c1 = *pat_string;
2303         c2 = fold_array[c1];
2304         if (c1 == c2) { /* If char and fold are the same */
2305             while (s <= e) {
2306                 s = (char *) memchr(s, c1, e + 1 - s);
2307                 if (s == NULL) {
2308                     break;
2309                 }
2310
2311                 /* Check that the rest of the node matches */
2312                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2313                     && (reginfo->intuit || regtry(reginfo, &s)) )
2314                 {
2315                     goto got_it;
2316                 }
2317                 s++;
2318             }
2319         }
2320         else {
2321             U8 bits_differing = c1 ^ c2;
2322
2323             /* If the folds differ in one bit position only, we can mask to
2324              * match either of them, and can use this faster find method.  Both
2325              * ASCII and EBCDIC tend to have their case folds differ in only
2326              * one position, so this is very likely */
2327             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2328                 bits_differing = ~ bits_differing;
2329                 while (s <= e) {
2330                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2331                                         (c1 & bits_differing), bits_differing);
2332                     if (s > e) {
2333                         break;
2334                     }
2335
2336                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2337                         && (reginfo->intuit || regtry(reginfo, &s)) )
2338                     {
2339                         goto got_it;
2340                     }
2341                     s++;
2342                 }
2343             }
2344             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2345                        should actually happen only in EXACTFL nodes */
2346                 while (s <= e) {
2347                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2348                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2349                         && (reginfo->intuit || regtry(reginfo, &s)) )
2350                     {
2351                         goto got_it;
2352                     }
2353                     s++;
2354                 }
2355             }
2356         }
2357         break;
2358
2359       do_exactf_utf8:
2360       {
2361         unsigned expansion;
2362
2363         /* If one of the operands is in utf8, we can't use the simpler folding
2364          * above, due to the fact that many different characters can have the
2365          * same fold, or portion of a fold, or different- length fold */
2366         pat_string = STRING(c);
2367         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2368         pat_end = pat_string + ln;
2369         lnc = is_utf8_pat       /* length to match in characters */
2370                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2371                 : ln;
2372
2373         /* We have 'lnc' characters to match in the pattern, but because of
2374          * multi-character folding, each character in the target can match
2375          * up to 3 characters (Unicode guarantees it will never exceed
2376          * this) if it is utf8-encoded; and up to 2 if not (based on the
2377          * fact that the Latin 1 folds are already determined, and the
2378          * only multi-char fold in that range is the sharp-s folding to
2379          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2380          * string character.  Adjust lnc accordingly, rounding up, so that
2381          * if we need to match at least 4+1/3 chars, that really is 5. */
2382         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2383         lnc = (lnc + expansion - 1) / expansion;
2384
2385         /* As in the non-UTF8 case, if we have to match 3 characters, and
2386          * only 2 are left, it's guaranteed to fail, so don't start a
2387          * match that would require us to go beyond the end of the string
2388          */
2389         e = HOP3c(strend, -((SSize_t)lnc), s);
2390
2391         /* XXX Note that we could recalculate e to stop the loop earlier,
2392          * as the worst case expansion above will rarely be met, and as we
2393          * go along we would usually find that e moves further to the left.
2394          * This would happen only after we reached the point in the loop
2395          * where if there were no expansion we should fail.  Unclear if
2396          * worth the expense */
2397
2398         while (s <= e) {
2399             char *my_strend= (char *)strend;
2400             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2401                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2402                 && (reginfo->intuit || regtry(reginfo, &s)) )
2403             {
2404                 goto got_it;
2405             }
2406             s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2407         }
2408         break;
2409     }
2410
2411     case BOUNDL:
2412         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2413         if (FLAGS(c) != TRADITIONAL_BOUND) {
2414             if (! IN_UTF8_CTYPE_LOCALE) {
2415                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2416                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2417             }
2418             goto do_boundu;
2419         }
2420
2421         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2422         break;
2423
2424     case NBOUNDL:
2425         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2426         if (FLAGS(c) != TRADITIONAL_BOUND) {
2427             if (! IN_UTF8_CTYPE_LOCALE) {
2428                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2429                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2430             }
2431             goto do_nboundu;
2432         }
2433
2434         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2435         break;
2436
2437     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2438                    meaning */
2439         assert(FLAGS(c) == TRADITIONAL_BOUND);
2440
2441         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2442         break;
2443
2444     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2445                    meaning */
2446         assert(FLAGS(c) == TRADITIONAL_BOUND);
2447
2448         FBC_BOUND_A(isWORDCHAR_A);
2449         break;
2450
2451     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2452                    meaning */
2453         assert(FLAGS(c) == TRADITIONAL_BOUND);
2454
2455         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2456         break;
2457
2458     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2459                    meaning */
2460         assert(FLAGS(c) == TRADITIONAL_BOUND);
2461
2462         FBC_NBOUND_A(isWORDCHAR_A);
2463         break;
2464
2465     case NBOUNDU:
2466         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2467             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2468             break;
2469         }
2470
2471       do_nboundu:
2472
2473         to_complement = 1;
2474         /* FALLTHROUGH */
2475
2476     case BOUNDU:
2477       do_boundu:
2478         switch((bound_type) FLAGS(c)) {
2479             case TRADITIONAL_BOUND:
2480                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2481                 break;
2482             case GCB_BOUND:
2483                 if (s == reginfo->strbeg) {
2484                     if (reginfo->intuit || regtry(reginfo, &s))
2485                     {
2486                         goto got_it;
2487                     }
2488
2489                     /* Didn't match.  Try at the next position (if there is one) */
2490                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2491                     if (UNLIKELY(s >= reginfo->strend)) {
2492                         break;
2493                     }
2494                 }
2495
2496                 if (utf8_target) {
2497                     GCB_enum before = getGCB_VAL_UTF8(
2498                                                reghop3((U8*)s, -1,
2499                                                        (U8*)(reginfo->strbeg)),
2500                                                (U8*) reginfo->strend);
2501                     while (s < strend) {
2502                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2503                                                         (U8*) reginfo->strend);
2504                         if (   (to_complement ^ isGCB(before,
2505                                                       after,
2506                                                       (U8*) reginfo->strbeg,
2507                                                       (U8*) s,
2508                                                       utf8_target))
2509                             && (reginfo->intuit || regtry(reginfo, &s)))
2510                         {
2511                             goto got_it;
2512                         }
2513                         before = after;
2514                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2515                     }
2516                 }
2517                 else {  /* Not utf8.  Everything is a GCB except between CR and
2518                            LF */
2519                     while (s < strend) {
2520                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2521                                               || UCHARAT(s) != '\n'))
2522                             && (reginfo->intuit || regtry(reginfo, &s)))
2523                         {
2524                             goto got_it;
2525                         }
2526                         s++;
2527                     }
2528                 }
2529
2530                 /* And, since this is a bound, it can match after the final
2531                  * character in the string */
2532                 if (   reginfo->intuit
2533                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2534                 {
2535                     goto got_it;
2536                 }
2537                 break;
2538
2539             case LB_BOUND:
2540                 if (s == reginfo->strbeg) {
2541                     if (reginfo->intuit || regtry(reginfo, &s)) {
2542                         goto got_it;
2543                     }
2544                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2545                     if (UNLIKELY(s >= reginfo->strend)) {
2546                         break;
2547                     }
2548                 }
2549
2550                 if (utf8_target) {
2551                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2552                                                                -1,
2553                                                                (U8*)(reginfo->strbeg)),
2554                                                        (U8*) reginfo->strend);
2555                     while (s < strend) {
2556                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2557                         if (to_complement ^ isLB(before,
2558                                                  after,
2559                                                  (U8*) reginfo->strbeg,
2560                                                  (U8*) s,
2561                                                  (U8*) reginfo->strend,
2562                                                  utf8_target)
2563                             && (reginfo->intuit || regtry(reginfo, &s)))
2564                         {
2565                             goto got_it;
2566                         }
2567                         before = after;
2568                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2569                     }
2570                 }
2571                 else {  /* Not utf8. */
2572                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2573                     while (s < strend) {
2574                         LB_enum after = getLB_VAL_CP((U8) *s);
2575                         if (to_complement ^ isLB(before,
2576                                                  after,
2577                                                  (U8*) reginfo->strbeg,
2578                                                  (U8*) s,
2579                                                  (U8*) reginfo->strend,
2580                                                  utf8_target)
2581                             && (reginfo->intuit || regtry(reginfo, &s)))
2582                         {
2583                             goto got_it;
2584                         }
2585                         before = after;
2586                         s++;
2587                     }
2588                 }
2589
2590                 if (   reginfo->intuit
2591                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2592                 {
2593                     goto got_it;
2594                 }
2595
2596                 break;
2597
2598             case SB_BOUND:
2599                 if (s == reginfo->strbeg) {
2600                     if (reginfo->intuit || regtry(reginfo, &s)) {
2601                         goto got_it;
2602                     }
2603                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2604                     if (UNLIKELY(s >= reginfo->strend)) {
2605                         break;
2606                     }
2607                 }
2608
2609                 if (utf8_target) {
2610                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2611                                                         -1,
2612                                                         (U8*)(reginfo->strbeg)),
2613                                                       (U8*) reginfo->strend);
2614                     while (s < strend) {
2615                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2616                                                          (U8*) reginfo->strend);
2617                         if ((to_complement ^ isSB(before,
2618                                                   after,
2619                                                   (U8*) reginfo->strbeg,
2620                                                   (U8*) s,
2621                                                   (U8*) reginfo->strend,
2622                                                   utf8_target))
2623                             && (reginfo->intuit || regtry(reginfo, &s)))
2624                         {
2625                             goto got_it;
2626                         }
2627                         before = after;
2628                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2629                     }
2630                 }
2631                 else {  /* Not utf8. */
2632                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2633                     while (s < strend) {
2634                         SB_enum after = getSB_VAL_CP((U8) *s);
2635                         if ((to_complement ^ isSB(before,
2636                                                   after,
2637                                                   (U8*) reginfo->strbeg,
2638                                                   (U8*) s,
2639                                                   (U8*) reginfo->strend,
2640                                                   utf8_target))
2641                             && (reginfo->intuit || regtry(reginfo, &s)))
2642                         {
2643                             goto got_it;
2644                         }
2645                         before = after;
2646                         s++;
2647                     }
2648                 }
2649
2650                 /* Here are at the final position in the target string.  The SB
2651                  * value is always true here, so matches, depending on other
2652                  * constraints */
2653                 if (   reginfo->intuit
2654                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2655                 {
2656                     goto got_it;
2657                 }
2658
2659                 break;
2660
2661             case WB_BOUND:
2662                 if (s == reginfo->strbeg) {
2663                     if (reginfo->intuit || regtry(reginfo, &s)) {
2664                         goto got_it;
2665                     }
2666                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2667                     if (UNLIKELY(s >= reginfo->strend)) {
2668                         break;
2669                     }
2670                 }
2671
2672                 if (utf8_target) {
2673                     /* We are at a boundary between char_sub_0 and char_sub_1.
2674                      * We also keep track of the value for char_sub_-1 as we
2675                      * loop through the line.   Context may be needed to make a
2676                      * determination, and if so, this can save having to
2677                      * recalculate it */
2678                     WB_enum previous = WB_UNKNOWN;
2679                     WB_enum before = getWB_VAL_UTF8(
2680                                               reghop3((U8*)s,
2681                                                       -1,
2682                                                       (U8*)(reginfo->strbeg)),
2683                                               (U8*) reginfo->strend);
2684                     while (s < strend) {
2685                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2686                                                         (U8*) reginfo->strend);
2687                         if ((to_complement ^ isWB(previous,
2688                                                   before,
2689                                                   after,
2690                                                   (U8*) reginfo->strbeg,
2691                                                   (U8*) s,
2692                                                   (U8*) reginfo->strend,
2693                                                   utf8_target))
2694                             && (reginfo->intuit || regtry(reginfo, &s)))
2695                         {
2696                             goto got_it;
2697                         }
2698                         previous = before;
2699                         before = after;
2700                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2701                     }
2702                 }
2703                 else {  /* Not utf8. */
2704                     WB_enum previous = WB_UNKNOWN;
2705                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2706                     while (s < strend) {
2707                         WB_enum after = getWB_VAL_CP((U8) *s);
2708                         if ((to_complement ^ isWB(previous,
2709                                                   before,
2710                                                   after,
2711                                                   (U8*) reginfo->strbeg,
2712                                                   (U8*) s,
2713                                                   (U8*) reginfo->strend,
2714                                                   utf8_target))
2715                             && (reginfo->intuit || regtry(reginfo, &s)))
2716                         {
2717                             goto got_it;
2718                         }
2719                         previous = before;
2720                         before = after;
2721                         s++;
2722                     }
2723                 }
2724
2725                 if (   reginfo->intuit
2726                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2727                 {
2728                     goto got_it;
2729                 }
2730         }
2731         break;
2732
2733     case LNBREAK:
2734         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2735                         is_LNBREAK_latin1_safe(s, strend)
2736         );
2737         break;
2738
2739     /* The argument to all the POSIX node types is the class number to pass to
2740      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2741
2742     case NPOSIXL:
2743         to_complement = 1;
2744         /* FALLTHROUGH */
2745
2746     case POSIXL:
2747         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2748         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2749                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2750         break;
2751
2752     case NPOSIXD:
2753         to_complement = 1;
2754         /* FALLTHROUGH */
2755
2756     case POSIXD:
2757         if (utf8_target) {
2758             goto posix_utf8;
2759         }
2760         goto posixa;
2761
2762     case NPOSIXA:
2763         if (utf8_target) {
2764             /* The complement of something that matches only ASCII matches all
2765              * non-ASCII, plus everything in ASCII that isn't in the class. */
2766             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2767                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2768             break;
2769         }
2770
2771         to_complement = 1;
2772         goto posixa;
2773
2774     case POSIXA:
2775         /* Don't need to worry about utf8, as it can match only a single
2776          * byte invariant character.  But we do anyway for performance reasons,
2777          * as otherwise we would have to examine all the continuation
2778          * characters */
2779         if (utf8_target) {
2780             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2781             break;
2782         }
2783
2784       posixa:
2785         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2786                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2787         break;
2788
2789     case NPOSIXU:
2790         to_complement = 1;
2791         /* FALLTHROUGH */
2792
2793     case POSIXU:
2794         if (! utf8_target) {
2795             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2796                                  to_complement ^ cBOOL(_generic_isCC(*s,
2797                                                                     FLAGS(c))));
2798         }
2799         else {
2800
2801           posix_utf8:
2802             classnum = (_char_class_number) FLAGS(c);
2803             switch (classnum) {
2804                 default:
2805                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2806                         to_complement ^ cBOOL(_invlist_contains_cp(
2807                                               PL_XPosix_ptrs[classnum],
2808                                               utf8_to_uvchr_buf((U8 *) s,
2809                                                                 (U8 *) strend,
2810                                                                 NULL))));
2811                     break;
2812                 case _CC_ENUM_SPACE:
2813                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2814                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2815                     break;
2816
2817                 case _CC_ENUM_BLANK:
2818                     REXEC_FBC_CLASS_SCAN(1,
2819                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2820                     break;
2821
2822                 case _CC_ENUM_XDIGIT:
2823                     REXEC_FBC_CLASS_SCAN(1,
2824                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2825                     break;
2826
2827                 case _CC_ENUM_VERTSPACE:
2828                     REXEC_FBC_CLASS_SCAN(1,
2829                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2830                     break;
2831
2832                 case _CC_ENUM_CNTRL:
2833                     REXEC_FBC_CLASS_SCAN(1,
2834                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2835                     break;
2836             }
2837         }
2838         break;
2839
2840     case AHOCORASICKC:
2841     case AHOCORASICK:
2842         {
2843             DECL_TRIE_TYPE(c);
2844             /* what trie are we using right now */
2845             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2846             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2847             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2848
2849             const char *last_start = strend - trie->minlen;
2850 #ifdef DEBUGGING
2851             const char *real_start = s;
2852 #endif
2853             STRLEN maxlen = trie->maxlen;
2854             SV *sv_points;
2855             U8 **points; /* map of where we were in the input string
2856                             when reading a given char. For ASCII this
2857                             is unnecessary overhead as the relationship
2858                             is always 1:1, but for Unicode, especially
2859                             case folded Unicode this is not true. */
2860             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2861             U8 *bitmap=NULL;
2862
2863
2864             GET_RE_DEBUG_FLAGS_DECL;
2865
2866             /* We can't just allocate points here. We need to wrap it in
2867              * an SV so it gets freed properly if there is a croak while
2868              * running the match */
2869             ENTER;
2870             SAVETMPS;
2871             sv_points=newSV(maxlen * sizeof(U8 *));
2872             SvCUR_set(sv_points,
2873                 maxlen * sizeof(U8 *));
2874             SvPOK_on(sv_points);
2875             sv_2mortal(sv_points);
2876             points=(U8**)SvPV_nolen(sv_points );
2877             if ( trie_type != trie_utf8_fold
2878                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2879             {
2880                 if (trie->bitmap)
2881                     bitmap=(U8*)trie->bitmap;
2882                 else
2883                     bitmap=(U8*)ANYOF_BITMAP(c);
2884             }
2885             /* this is the Aho-Corasick algorithm modified a touch
2886                to include special handling for long "unknown char" sequences.
2887                The basic idea being that we use AC as long as we are dealing
2888                with a possible matching char, when we encounter an unknown char
2889                (and we have not encountered an accepting state) we scan forward
2890                until we find a legal starting char.
2891                AC matching is basically that of trie matching, except that when
2892                we encounter a failing transition, we fall back to the current
2893                states "fail state", and try the current char again, a process
2894                we repeat until we reach the root state, state 1, or a legal
2895                transition. If we fail on the root state then we can either
2896                terminate if we have reached an accepting state previously, or
2897                restart the entire process from the beginning if we have not.
2898
2899              */
2900             while (s <= last_start) {
2901                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2902                 U8 *uc = (U8*)s;
2903                 U16 charid = 0;
2904                 U32 base = 1;
2905                 U32 state = 1;
2906                 UV uvc = 0;
2907                 STRLEN len = 0;
2908                 STRLEN foldlen = 0;
2909                 U8 *uscan = (U8*)NULL;
2910                 U8 *leftmost = NULL;
2911 #ifdef DEBUGGING
2912                 U32 accepted_word= 0;
2913 #endif
2914                 U32 pointpos = 0;
2915
2916                 while ( state && uc <= (U8*)strend ) {
2917                     int failed=0;
2918                     U32 word = aho->states[ state ].wordnum;
2919
2920                     if( state==1 ) {
2921                         if ( bitmap ) {
2922                             DEBUG_TRIE_EXECUTE_r(
2923                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2924                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2925                                         (char *)uc, utf8_target, 0 );
2926                                     Perl_re_printf( aTHX_
2927                                         " Scanning for legal start char...\n");
2928                                 }
2929                             );
2930                             if (utf8_target) {
2931                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2932                                     uc += UTF8SKIP(uc);
2933                                 }
2934                             } else {
2935                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2936                                     uc++;
2937                                 }
2938                             }
2939                             s= (char *)uc;
2940                         }
2941                         if (uc >(U8*)last_start) break;
2942                     }
2943
2944                     if ( word ) {
2945                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2946                         if (!leftmost || lpos < leftmost) {
2947                             DEBUG_r(accepted_word=word);
2948                             leftmost= lpos;
2949                         }
2950                         if (base==0) break;
2951
2952                     }
2953                     points[pointpos++ % maxlen]= uc;
2954                     if (foldlen || uc < (U8*)strend) {
2955                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2956                                              (U8 *) strend, uscan, len, uvc,
2957                                              charid, foldlen, foldbuf,
2958                                              uniflags);
2959                         DEBUG_TRIE_EXECUTE_r({
2960                             dump_exec_pos( (char *)uc, c, strend,
2961                                         real_start, s, utf8_target, 0);
2962                             Perl_re_printf( aTHX_
2963                                 " Charid:%3u CP:%4" UVxf " ",
2964                                  charid, uvc);
2965                         });
2966                     }
2967                     else {
2968                         len = 0;
2969                         charid = 0;
2970                     }
2971
2972
2973                     do {
2974 #ifdef DEBUGGING
2975                         word = aho->states[ state ].wordnum;
2976 #endif
2977                         base = aho->states[ state ].trans.base;
2978
2979                         DEBUG_TRIE_EXECUTE_r({
2980                             if (failed)
2981                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2982                                     s,   utf8_target, 0 );
2983                             Perl_re_printf( aTHX_
2984                                 "%sState: %4" UVxf ", word=%" UVxf,
2985                                 failed ? " Fail transition to " : "",
2986                                 (UV)state, (UV)word);
2987                         });
2988                         if ( base ) {
2989                             U32 tmp;
2990                             I32 offset;
2991                             if (charid &&
2992                                  ( ((offset = base + charid
2993                                     - 1 - trie->uniquecharcount)) >= 0)
2994                                  && ((U32)offset < trie->lasttrans)
2995                                  && trie->trans[offset].check == state
2996                                  && (tmp=trie->trans[offset].next))
2997                             {
2998                                 DEBUG_TRIE_EXECUTE_r(
2999                                     Perl_re_printf( aTHX_ " - legal\n"));
3000                                 state = tmp;
3001                                 break;
3002                             }
3003                             else {
3004                                 DEBUG_TRIE_EXECUTE_r(
3005                                     Perl_re_printf( aTHX_ " - fail\n"));
3006                                 failed = 1;
3007                                 state = aho->fail[state];
3008                             }
3009                         }
3010                         else {
3011                             /* we must be accepting here */
3012                             DEBUG_TRIE_EXECUTE_r(
3013                                     Perl_re_printf( aTHX_ " - accepting\n"));
3014                             failed = 1;
3015                             break;
3016                         }
3017                     } while(state);
3018                     uc += len;
3019                     if (failed) {
3020                         if (leftmost)
3021                             break;
3022                         if (!state) state = 1;
3023                     }
3024                 }
3025                 if ( aho->states[ state ].wordnum ) {
3026                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3027                     if (!leftmost || lpos < leftmost) {
3028                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3029                         leftmost = lpos;
3030                     }
3031                 }
3032                 if (leftmost) {
3033                     s = (char*)leftmost;
3034                     DEBUG_TRIE_EXECUTE_r({
3035                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3036                             (UV)accepted_word, (IV)(s - real_start)
3037                         );
3038                     });
3039                     if (reginfo->intuit || regtry(reginfo, &s)) {
3040                         FREETMPS;
3041                         LEAVE;
3042                         goto got_it;
3043                     }
3044                     if (s < reginfo->strend) {
3045                         s = HOPc(s,1);
3046                     }
3047                     DEBUG_TRIE_EXECUTE_r({
3048                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3049                     });
3050                 } else {
3051                     DEBUG_TRIE_EXECUTE_r(
3052                         Perl_re_printf( aTHX_ "No match.\n"));
3053                     break;
3054                 }
3055             }
3056             FREETMPS;
3057             LEAVE;
3058         }
3059         break;
3060     default:
3061         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3062     }
3063     return 0;
3064   got_it:
3065     return s;
3066 }
3067
3068 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3069  * flags have same meanings as with regexec_flags() */
3070
3071 static void
3072 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3073                             char *strbeg,
3074                             char *strend,
3075                             SV *sv,
3076                             U32 flags,
3077                             bool utf8_target)
3078 {
3079     struct regexp *const prog = ReANY(rx);
3080
3081     if (flags & REXEC_COPY_STR) {
3082 #ifdef PERL_ANY_COW
3083         if (SvCANCOW(sv)) {
3084             DEBUG_C(Perl_re_printf( aTHX_
3085                               "Copy on write: regexp capture, type %d\n",
3086                                     (int) SvTYPE(sv)));
3087             /* Create a new COW SV to share the match string and store
3088              * in saved_copy, unless the current COW SV in saved_copy
3089              * is valid and suitable for our purpose */
3090             if ((   prog->saved_copy
3091                  && SvIsCOW(prog->saved_copy)
3092                  && SvPOKp(prog->saved_copy)
3093                  && SvIsCOW(sv)
3094                  && SvPOKp(sv)
3095                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3096             {
3097                 /* just reuse saved_copy SV */
3098                 if (RXp_MATCH_COPIED(prog)) {
3099                     Safefree(prog->subbeg);
3100                     RXp_MATCH_COPIED_off(prog);
3101                 }
3102             }
3103             else {
3104                 /* create new COW SV to share string */
3105                 RXp_MATCH_COPY_FREE(prog);
3106                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3107             }
3108             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3109             assert (SvPOKp(prog->saved_copy));
3110             prog->sublen  = strend - strbeg;
3111             prog->suboffset = 0;
3112             prog->subcoffset = 0;
3113         } else
3114 #endif
3115         {
3116             SSize_t min = 0;
3117             SSize_t max = strend - strbeg;
3118             SSize_t sublen;
3119
3120             if (    (flags & REXEC_COPY_SKIP_POST)
3121                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3122                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3123             ) { /* don't copy $' part of string */
3124                 U32 n = 0;
3125                 max = -1;
3126                 /* calculate the right-most part of the string covered
3127                  * by a capture. Due to lookahead, this may be to
3128                  * the right of $&, so we have to scan all captures */
3129                 while (n <= prog->lastparen) {
3130                     if (prog->offs[n].end > max)
3131                         max = prog->offs[n].end;
3132                     n++;
3133                 }
3134                 if (max == -1)
3135                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3136                             ? prog->offs[0].start
3137                             : 0;
3138                 assert(max >= 0 && max <= strend - strbeg);
3139             }
3140
3141             if (    (flags & REXEC_COPY_SKIP_PRE)
3142                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3143                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3144             ) { /* don't copy $` part of string */
3145                 U32 n = 0;
3146                 min = max;
3147                 /* calculate the left-most part of the string covered
3148                  * by a capture. Due to lookbehind, this may be to
3149                  * the left of $&, so we have to scan all captures */
3150                 while (min && n <= prog->lastparen) {
3151                     if (   prog->offs[n].start != -1
3152                         && prog->offs[n].start < min)
3153                     {
3154                         min = prog->offs[n].start;
3155                     }
3156                     n++;
3157                 }
3158                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3159                     && min >  prog->offs[0].end
3160                 )
3161                     min = prog->offs[0].end;
3162
3163             }
3164
3165             assert(min >= 0 && min <= max && min <= strend - strbeg);
3166             sublen = max - min;
3167
3168             if (RXp_MATCH_COPIED(prog)) {
3169                 if (sublen > prog->sublen)
3170                     prog->subbeg =
3171                             (char*)saferealloc(prog->subbeg, sublen+1);
3172             }
3173             else
3174                 prog->subbeg = (char*)safemalloc(sublen+1);
3175             Copy(strbeg + min, prog->subbeg, sublen, char);
3176             prog->subbeg[sublen] = '\0';
3177             prog->suboffset = min;
3178             prog->sublen = sublen;
3179             RXp_MATCH_COPIED_on(prog);
3180         }
3181         prog->subcoffset = prog->suboffset;
3182         if (prog->suboffset && utf8_target) {
3183             /* Convert byte offset to chars.
3184              * XXX ideally should only compute this if @-/@+
3185              * has been seen, a la PL_sawampersand ??? */
3186
3187             /* If there's a direct correspondence between the
3188              * string which we're matching and the original SV,
3189              * then we can use the utf8 len cache associated with
3190              * the SV. In particular, it means that under //g,
3191              * sv_pos_b2u() will use the previously cached
3192              * position to speed up working out the new length of
3193              * subcoffset, rather than counting from the start of
3194              * the string each time. This stops
3195              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3196              * from going quadratic */
3197             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3198                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3199                                                 SV_GMAGIC|SV_CONST_RETURN);
3200             else
3201                 prog->subcoffset = utf8_length((U8*)strbeg,
3202                                     (U8*)(strbeg+prog->suboffset));
3203         }
3204     }
3205     else {
3206         RXp_MATCH_COPY_FREE(prog);
3207         prog->subbeg = strbeg;
3208         prog->suboffset = 0;
3209         prog->subcoffset = 0;
3210         prog->sublen = strend - strbeg;
3211     }
3212 }
3213
3214
3215
3216
3217 /*
3218  - regexec_flags - match a regexp against a string
3219  */
3220 I32
3221 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3222               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3223 /* stringarg: the point in the string at which to begin matching */
3224 /* strend:    pointer to null at end of string */
3225 /* strbeg:    real beginning of string */
3226 /* minend:    end of match must be >= minend bytes after stringarg. */
3227 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3228  *            itself is accessed via the pointers above */
3229 /* data:      May be used for some additional optimizations.
3230               Currently unused. */
3231 /* flags:     For optimizations. See REXEC_* in regexp.h */
3232
3233 {
3234     struct regexp *const prog = ReANY(rx);
3235     char *s;
3236     regnode *c;
3237     char *startpos;
3238     SSize_t minlen;             /* must match at least this many chars */
3239     SSize_t dontbother = 0;     /* how many characters not to try at end */
3240     const bool utf8_target = cBOOL(DO_UTF8(sv));
3241     I32 multiline;
3242     RXi_GET_DECL(prog,progi);
3243     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3244     regmatch_info *const reginfo = &reginfo_buf;
3245     regexp_paren_pair *swap = NULL;
3246     I32 oldsave;
3247     GET_RE_DEBUG_FLAGS_DECL;
3248
3249     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3250     PERL_UNUSED_ARG(data);
3251
3252     /* Be paranoid... */
3253     if (prog == NULL) {
3254         Perl_croak(aTHX_ "NULL regexp parameter");
3255     }
3256
3257     DEBUG_EXECUTE_r(
3258         debug_start_match(rx, utf8_target, stringarg, strend,
3259         "Matching");
3260     );
3261
3262     startpos = stringarg;
3263
3264     /* set these early as they may be used by the HOP macros below */
3265     reginfo->strbeg = strbeg;
3266     reginfo->strend = strend;
3267     reginfo->is_utf8_target = cBOOL(utf8_target);
3268
3269     if (prog->intflags & PREGf_GPOS_SEEN) {
3270         MAGIC *mg;
3271
3272         /* set reginfo->ganch, the position where \G can match */
3273
3274         reginfo->ganch =
3275             (flags & REXEC_IGNOREPOS)
3276             ? stringarg /* use start pos rather than pos() */
3277             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3278               /* Defined pos(): */
3279             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3280             : strbeg; /* pos() not defined; use start of string */
3281
3282         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3283             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3284
3285         /* in the presence of \G, we may need to start looking earlier in
3286          * the string than the suggested start point of stringarg:
3287          * if prog->gofs is set, then that's a known, fixed minimum
3288          * offset, such as
3289          * /..\G/:   gofs = 2
3290          * /ab|c\G/: gofs = 1
3291          * or if the minimum offset isn't known, then we have to go back
3292          * to the start of the string, e.g. /w+\G/
3293          */
3294
3295         if (prog->intflags & PREGf_ANCH_GPOS) {
3296             if (prog->gofs) {
3297                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3298                 if (!startpos ||
3299                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3300                 {
3301                     DEBUG_r(Perl_re_printf( aTHX_
3302                             "fail: ganch-gofs before earliest possible start\n"));
3303                     return 0;
3304                 }
3305             }
3306             else
3307                 startpos = reginfo->ganch;
3308         }
3309         else if (prog->gofs) {
3310             startpos = HOPBACKc(startpos, prog->gofs);
3311             if (!startpos)
3312                 startpos = strbeg;
3313         }
3314         else if (prog->intflags & PREGf_GPOS_FLOAT)
3315             startpos = strbeg;
3316     }
3317
3318     minlen = prog->minlen;
3319     if ((startpos + minlen) > strend || startpos < strbeg) {
3320         DEBUG_r(Perl_re_printf( aTHX_
3321                     "Regex match can't succeed, so not even tried\n"));
3322         return 0;
3323     }
3324
3325     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3326      * which will call destuctors to reset PL_regmatch_state, free higher
3327      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3328      * regmatch_info_aux_eval */
3329
3330     oldsave = PL_savestack_ix;
3331
3332     s = startpos;
3333
3334     if ((prog->extflags & RXf_USE_INTUIT)
3335         && !(flags & REXEC_CHECKED))
3336     {
3337         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3338                                     flags, NULL);
3339         if (!s)
3340             return 0;
3341
3342         if (prog->extflags & RXf_CHECK_ALL) {
3343             /* we can match based purely on the result of INTUIT.
3344              * Set up captures etc just for $& and $-[0]
3345              * (an intuit-only match wont have $1,$2,..) */
3346             assert(!prog->nparens);
3347
3348             /* s/// doesn't like it if $& is earlier than where we asked it to
3349              * start searching (which can happen on something like /.\G/) */
3350             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3351                     && (s < stringarg))
3352             {
3353                 /* this should only be possible under \G */
3354                 assert(prog->intflags & PREGf_GPOS_SEEN);
3355                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3356                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3357                 goto phooey;
3358             }
3359
3360             /* match via INTUIT shouldn't have any captures.
3361              * Let @-, @+, $^N know */
3362             prog->lastparen = prog->lastcloseparen = 0;
3363             RXp_MATCH_UTF8_set(prog, utf8_target);
3364             prog->offs[0].start = s - strbeg;
3365             prog->offs[0].end = utf8_target
3366                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3367                 : s - strbeg + prog->minlenret;
3368             if ( !(flags & REXEC_NOT_FIRST) )
3369                 S_reg_set_capture_string(aTHX_ rx,
3370                                         strbeg, strend,
3371                                         sv, flags, utf8_target);
3372
3373             return 1;
3374         }
3375     }
3376
3377     multiline = prog->extflags & RXf_PMf_MULTILINE;
3378
3379     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3380         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3381                               "String too short [regexec_flags]...\n"));
3382         goto phooey;
3383     }
3384
3385     /* Check validity of program. */
3386     if (UCHARAT(progi->program) != REG_MAGIC) {
3387         Perl_croak(aTHX_ "corrupted regexp program");
3388     }
3389
3390     RXp_MATCH_TAINTED_off(prog);
3391     RXp_MATCH_UTF8_set(prog, utf8_target);
3392
3393     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3394     reginfo->intuit = 0;
3395     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3396     reginfo->warned = FALSE;
3397     reginfo->sv = sv;
3398     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3399     /* see how far we have to get to not match where we matched before */
3400     reginfo->till = stringarg + minend;
3401
3402     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3403         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3404            S_cleanup_regmatch_info_aux has executed (registered by
3405            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3406            magic belonging to this SV.
3407            Not newSVsv, either, as it does not COW.
3408         */
3409         reginfo->sv = newSV(0);
3410         SvSetSV_nosteal(reginfo->sv, sv);
3411         SAVEFREESV(reginfo->sv);
3412     }
3413
3414     /* reserve next 2 or 3 slots in PL_regmatch_state:
3415      * slot N+0: may currently be in use: skip it
3416      * slot N+1: use for regmatch_info_aux struct
3417      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3418      * slot N+3: ready for use by regmatch()
3419      */
3420
3421     {
3422         regmatch_state *old_regmatch_state;
3423         regmatch_slab  *old_regmatch_slab;
3424         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3425
3426         /* on first ever match, allocate first slab */
3427         if (!PL_regmatch_slab) {
3428             Newx(PL_regmatch_slab, 1, regmatch_slab);
3429             PL_regmatch_slab->prev = NULL;
3430             PL_regmatch_slab->next = NULL;
3431             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3432         }
3433
3434         old_regmatch_state = PL_regmatch_state;
3435         old_regmatch_slab  = PL_regmatch_slab;
3436
3437         for (i=0; i <= max; i++) {
3438             if (i == 1)
3439                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3440             else if (i ==2)
3441                 reginfo->info_aux_eval =
3442                 reginfo->info_aux->info_aux_eval =
3443                             &(PL_regmatch_state->u.info_aux_eval);
3444
3445             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3446                 PL_regmatch_state = S_push_slab(aTHX);
3447         }
3448
3449         /* note initial PL_regmatch_state position; at end of match we'll
3450          * pop back to there and free any higher slabs */
3451
3452         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3453         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3454         reginfo->info_aux->poscache = NULL;
3455
3456         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3457
3458         if ((prog->extflags & RXf_EVAL_SEEN))
3459             S_setup_eval_state(aTHX_ reginfo);
3460         else
3461             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3462     }
3463
3464     /* If there is a "must appear" string, look for it. */
3465
3466     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3467         /* We have to be careful. If the previous successful match
3468            was from this regex we don't want a subsequent partially
3469            successful match to clobber the old results.
3470            So when we detect this possibility we add a swap buffer
3471            to the re, and switch the buffer each match. If we fail,
3472            we switch it back; otherwise we leave it swapped.
3473         */
3474         swap = prog->offs;
3475         /* avoid leak if we die, or clean up anyway if match completes */
3476         SAVEFREEPV(swap);
3477         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3478         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3479             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3480             0,
3481             PTR2UV(prog),
3482             PTR2UV(swap),
3483             PTR2UV(prog->offs)
3484         ));
3485     }
3486
3487     if (prog->recurse_locinput)
3488         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3489
3490     /* Simplest case: anchored match need be tried only once, or with
3491      * MBOL, only at the beginning of each line.
3492      *
3493      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3494      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3495      * match at the start of the string then it won't match anywhere else
3496      * either; while with /.*.../, if it doesn't match at the beginning,
3497      * the earliest it could match is at the start of the next line */
3498
3499     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3500         char *end;
3501
3502         if (regtry(reginfo, &s))
3503             goto got_it;
3504
3505         if (!(prog->intflags & PREGf_ANCH_MBOL))
3506             goto phooey;
3507
3508         /* didn't match at start, try at other newline positions */
3509
3510         if (minlen)
3511             dontbother = minlen - 1;
3512         end = HOP3c(strend, -dontbother, strbeg) - 1;
3513
3514         /* skip to next newline */
3515
3516         while (s <= end) { /* note it could be possible to match at the end of the string */
3517             /* NB: newlines are the same in unicode as they are in latin */
3518             if (*s++ != '\n')
3519                 continue;
3520             if (prog->check_substr || prog->check_utf8) {
3521             /* note that with PREGf_IMPLICIT, intuit can only fail
3522              * or return the start position, so it's of limited utility.
3523              * Nevertheless, I made the decision that the potential for
3524              * quick fail was still worth it - DAPM */
3525                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3526                 if (!s)
3527                     goto phooey;
3528             }
3529             if (regtry(reginfo, &s))
3530                 goto got_it;
3531         }
3532         goto phooey;
3533     } /* end anchored search */
3534
3535     if (prog->intflags & PREGf_ANCH_GPOS)
3536     {
3537         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3538         assert(prog->intflags & PREGf_GPOS_SEEN);
3539         /* For anchored \G, the only position it can match from is
3540          * (ganch-gofs); we already set startpos to this above; if intuit
3541          * moved us on from there, we can't possibly succeed */
3542         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3543         if (s == startpos && regtry(reginfo, &s))
3544             goto got_it;
3545         goto phooey;
3546     }
3547
3548     /* Messy cases:  unanchored match. */
3549     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3550         /* we have /x+whatever/ */
3551         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3552         char ch;
3553 #ifdef DEBUGGING
3554         int did_match = 0;
3555 #endif
3556         if (utf8_target) {
3557             if (! prog->anchored_utf8) {
3558                 to_utf8_substr(prog);
3559             }
3560             ch = SvPVX_const(prog->anchored_utf8)[0];
3561             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3562                 if (*s == ch) {
3563                     DEBUG_EXECUTE_r( did_match = 1 );
3564                     if (regtry(reginfo, &s)) goto got_it;
3565                     s += UTF8_SAFE_SKIP(s, strend);
3566                     while (s < strend && *s == ch)
3567                         s += UTF8SKIP(s);
3568                 }
3569             );
3570
3571         }
3572         else {
3573             if (! prog->anchored_substr) {
3574                 if (! to_byte_substr(prog)) {
3575                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3576                 }
3577             }
3578             ch = SvPVX_const(prog->anchored_substr)[0];
3579             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3580                 if (*s == ch) {
3581                     DEBUG_EXECUTE_r( did_match = 1 );
3582                     if (regtry(reginfo, &s)) goto got_it;
3583                     s++;
3584                     while (s < strend && *s == ch)
3585                         s++;
3586                 }
3587             );
3588         }
3589         DEBUG_EXECUTE_r(if (!did_match)
3590                 Perl_re_printf( aTHX_
3591                                   "Did not find anchored character...\n")
3592                );
3593     }
3594     else if (prog->anchored_substr != NULL
3595               || prog->anchored_utf8 != NULL
3596               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3597                   && prog->float_max_offset < strend - s)) {
3598         SV *must;
3599         SSize_t back_max;
3600         SSize_t back_min;
3601         char *last;
3602         char *last1;            /* Last position checked before */
3603 #ifdef DEBUGGING
3604         int did_match = 0;
3605 #endif
3606         if (prog->anchored_substr || prog->anchored_utf8) {
3607             if (utf8_target) {
3608                 if (! prog->anchored_utf8) {
3609                     to_utf8_substr(prog);
3610                 }
3611                 must = prog->anchored_utf8;
3612             }
3613             else {
3614                 if (! prog->anchored_substr) {
3615                     if (! to_byte_substr(prog)) {
3616                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3617                     }
3618                 }
3619                 must = prog->anchored_substr;
3620             }
3621             back_max = back_min = prog->anchored_offset;
3622         } else {
3623             if (utf8_target) {
3624                 if (! prog->float_utf8) {
3625                     to_utf8_substr(prog);
3626                 }
3627                 must = prog->float_utf8;
3628             }
3629             else {
3630                 if (! prog->float_substr) {
3631                     if (! to_byte_substr(prog)) {
3632                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3633                     }
3634                 }
3635                 must = prog->float_substr;
3636             }
3637             back_max = prog->float_max_offset;
3638             back_min = prog->float_min_offset;
3639         }
3640
3641         if (back_min<0) {
3642             last = strend;
3643         } else {
3644             last = HOP3c(strend,        /* Cannot start after this */
3645                   -(SSize_t)(CHR_SVLEN(must)
3646                          - (SvTAIL(must) != 0) + back_min), strbeg);
3647         }
3648         if (s > reginfo->strbeg)
3649             last1 = HOPc(s, -1);
3650         else
3651             last1 = s - 1;      /* bogus */
3652
3653         /* XXXX check_substr already used to find "s", can optimize if
3654            check_substr==must. */
3655         dontbother = 0;
3656         strend = HOPc(strend, -dontbother);
3657         while ( (s <= last) &&
3658                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3659                                   (unsigned char*)strend, must,
3660                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3661             DEBUG_EXECUTE_r( did_match = 1 );
3662             if (HOPc(s, -back_max) > last1) {
3663                 last1 = HOPc(s, -back_min);
3664                 s = HOPc(s, -back_max);
3665             }
3666             else {
3667                 char * const t = (last1 >= reginfo->strbeg)
3668                                     ? HOPc(last1, 1) : last1 + 1;
3669
3670                 last1 = HOPc(s, -back_min);
3671                 s = t;
3672             }
3673             if (utf8_target) {
3674                 while (s <= last1) {
3675                     if (regtry(reginfo, &s))
3676                         goto got_it;
3677                     if (s >= last1) {
3678                         s++; /* to break out of outer loop */
3679                         break;
3680                     }
3681                     s += UTF8SKIP(s);
3682                 }
3683             }
3684             else {
3685                 while (s <= last1) {
3686                     if (regtry(reginfo, &s))
3687                         goto got_it;
3688                     s++;
3689                 }
3690             }
3691         }
3692         DEBUG_EXECUTE_r(if (!did_match) {
3693             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3694                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3695             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3696                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3697                                ? "anchored" : "floating"),
3698                 quoted, RE_SV_TAIL(must));
3699         });
3700         goto phooey;
3701     }
3702     else if ( (c = progi->regstclass) ) {
3703         if (minlen) {
3704             const OPCODE op = OP(progi->regstclass);
3705             /* don't bother with what can't match */
3706             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3707                 strend = HOPc(strend, -(minlen - 1));
3708         }
3709         DEBUG_EXECUTE_r({
3710             SV * const prop = sv_newmortal();
3711             regprop(prog, prop, c, reginfo, NULL);
3712             {
3713                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3714                     s,strend-s,PL_dump_re_max_len);
3715                 Perl_re_printf( aTHX_
3716                     "Matching stclass %.*s against %s (%d bytes)\n",
3717                     (int)SvCUR(prop), SvPVX_const(prop),
3718                      quoted, (int)(strend - s));
3719             }
3720         });
3721         if (find_byclass(prog, c, s, strend, reginfo))
3722             goto got_it;
3723         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3724     }
3725     else {
3726         dontbother = 0;
3727         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3728             /* Trim the end. */
3729             char *last= NULL;
3730             SV* float_real;
3731             STRLEN len;
3732             const char *little;
3733
3734             if (utf8_target) {
3735                 if (! prog->float_utf8) {
3736                     to_utf8_substr(prog);
3737                 }
3738                 float_real = prog->float_utf8;
3739             }
3740             else {
3741                 if (! prog->float_substr) {
3742                     if (! to_byte_substr(prog)) {
3743                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3744                     }
3745                 }
3746                 float_real = prog->float_substr;
3747             }
3748
3749             little = SvPV_const(float_real, len);
3750             if (SvTAIL(float_real)) {
3751                     /* This means that float_real contains an artificial \n on
3752                      * the end due to the presence of something like this:
3753                      * /foo$/ where we can match both "foo" and "foo\n" at the
3754                      * end of the string.  So we have to compare the end of the
3755                      * string first against the float_real without the \n and
3756                      * then against the full float_real with the string.  We
3757                      * have to watch out for cases where the string might be
3758                      * smaller than the float_real or the float_real without
3759                      * the \n. */
3760                     char *checkpos= strend - len;
3761                     DEBUG_OPTIMISE_r(
3762                         Perl_re_printf( aTHX_
3763                             "%sChecking for float_real.%s\n",
3764                             PL_colors[4], PL_colors[5]));
3765                     if (checkpos + 1 < strbeg) {
3766                         /* can't match, even if we remove the trailing \n
3767                          * string is too short to match */
3768                         DEBUG_EXECUTE_r(
3769                             Perl_re_printf( aTHX_
3770                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3771                                 PL_colors[4], PL_colors[5]));
3772                         goto phooey;
3773                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3774                         /* can match, the end of the string matches without the
3775                          * "\n" */
3776                         last = checkpos + 1;
3777                     } else if (checkpos < strbeg) {
3778                         /* cant match, string is too short when the "\n" is
3779                          * included */
3780                         DEBUG_EXECUTE_r(
3781                             Perl_re_printf( aTHX_
3782                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3783                                 PL_colors[4], PL_colors[5]));
3784                         goto phooey;
3785                     } else if (!multiline) {
3786                         /* non multiline match, so compare with the "\n" at the
3787                          * end of the string */
3788                         if (memEQ(checkpos, little, len)) {
3789                             last= checkpos;
3790                         } else {
3791                             DEBUG_EXECUTE_r(
3792                                 Perl_re_printf( aTHX_
3793                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3794                                     PL_colors[4], PL_colors[5]));
3795                             goto phooey;
3796                         }
3797                     } else {
3798                         /* multiline match, so we have to search for a place
3799                          * where the full string is located */
3800                         goto find_last;
3801                     }
3802             } else {
3803                   find_last:
3804                     if (len)
3805                         last = rninstr(s, strend, little, little + len);
3806                     else
3807                         last = strend;  /* matching "$" */
3808             }
3809             if (!last) {
3810                 /* at one point this block contained a comment which was
3811                  * probably incorrect, which said that this was a "should not
3812                  * happen" case.  Even if it was true when it was written I am
3813                  * pretty sure it is not anymore, so I have removed the comment
3814                  * and replaced it with this one. Yves */
3815                 DEBUG_EXECUTE_r(
3816                     Perl_re_printf( aTHX_
3817                         "%sString does not contain required substring, cannot match.%s\n",
3818                         PL_colors[4], PL_colors[5]
3819                     ));
3820                 goto phooey;
3821             }
3822             dontbother = strend - last + prog->float_min_offset;
3823         }
3824         if (minlen && (dontbother < minlen))
3825             dontbother = minlen - 1;
3826         strend -= dontbother;              /* this one's always in bytes! */
3827         /* We don't know much -- general case. */
3828         if (utf8_target) {
3829             for (;;) {
3830                 if (regtry(reginfo, &s))
3831                     goto got_it;
3832                 if (s >= strend)
3833                     break;
3834                 s += UTF8SKIP(s);
3835             };
3836         }
3837         else {
3838             do {
3839                 if (regtry(reginfo, &s))
3840                     goto got_it;
3841             } while (s++ < strend);
3842         }
3843     }
3844
3845     /* Failure. */
3846     goto phooey;
3847
3848   got_it:
3849     /* s/// doesn't like it if $& is earlier than where we asked it to
3850      * start searching (which can happen on something like /.\G/) */
3851     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3852             && (prog->offs[0].start < stringarg - strbeg))
3853     {
3854         /* this should only be possible under \G */
3855         assert(prog->intflags & PREGf_GPOS_SEEN);
3856         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3857             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3858         goto phooey;
3859     }
3860
3861     /* clean up; this will trigger destructors that will free all slabs
3862      * above the current one, and cleanup the regmatch_info_aux
3863      * and regmatch_info_aux_eval sructs */
3864
3865     LEAVE_SCOPE(oldsave);
3866
3867     if (RXp_PAREN_NAMES(prog))
3868         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3869
3870     /* make sure $`, $&, $', and $digit will work later */
3871     if ( !(flags & REXEC_NOT_FIRST) )
3872         S_reg_set_capture_string(aTHX_ rx,
3873                                     strbeg, reginfo->strend,
3874                                     sv, flags, utf8_target);
3875
3876     return 1;
3877
3878   phooey:
3879     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3880                           PL_colors[4], PL_colors[5]));
3881
3882     if (swap) {
3883         /* we failed :-( roll it back.
3884          * Since the swap buffer will be freed on scope exit which follows
3885          * shortly, restore the old captures by copying 'swap's original
3886          * data to the new offs buffer
3887          */
3888         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3889             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
3890             0,
3891             PTR2UV(prog),
3892             PTR2UV(prog->offs),
3893             PTR2UV(swap)
3894         ));
3895
3896         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
3897     }
3898
3899     /* clean up; this will trigger destructors that will free all slabs
3900      * above the current one, and cleanup the regmatch_info_aux
3901      * and regmatch_info_aux_eval sructs */
3902
3903     LEAVE_SCOPE(oldsave);
3904
3905     return 0;
3906 }
3907
3908
3909 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3910  * Do inc before dec, in case old and new rex are the same */
3911 #define SET_reg_curpm(Re2)                          \
3912     if (reginfo->info_aux_eval) {                   \
3913         (void)ReREFCNT_inc(Re2);                    \
3914         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3915         PM_SETRE((PL_reg_curpm), (Re2));            \
3916     }
3917
3918
3919 /*
3920  - regtry - try match at specific point
3921  */
3922 STATIC bool                     /* 0 failure, 1 success */
3923 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3924 {
3925     CHECKPOINT lastcp;
3926     REGEXP *const rx = reginfo->prog;
3927     regexp *const prog = ReANY(rx);
3928     SSize_t result;
3929 #ifdef DEBUGGING
3930     U32 depth = 0; /* used by REGCP_SET */
3931 #endif
3932     RXi_GET_DECL(prog,progi);
3933     GET_RE_DEBUG_FLAGS_DECL;
3934
3935     PERL_ARGS_ASSERT_REGTRY;
3936
3937     reginfo->cutpoint=NULL;
3938
3939     prog->offs[0].start = *startposp - reginfo->strbeg;
3940     prog->lastparen = 0;
3941     prog->lastcloseparen = 0;
3942
3943     /* XXXX What this code is doing here?!!!  There should be no need
3944        to do this again and again, prog->lastparen should take care of
3945        this!  --ilya*/
3946
3947     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3948      * Actually, the code in regcppop() (which Ilya may be meaning by
3949      * prog->lastparen), is not needed at all by the test suite
3950      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3951      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3952      * Meanwhile, this code *is* needed for the
3953      * above-mentioned test suite tests to succeed.  The common theme
3954      * on those tests seems to be returning null fields from matches.
3955      * --jhi updated by dapm */
3956
3957     /* After encountering a variant of the issue mentioned above I think
3958      * the point Ilya was making is that if we properly unwind whenever
3959      * we set lastparen to a smaller value then we should not need to do
3960      * this every time, only when needed. So if we have tests that fail if
3961      * we remove this, then it suggests somewhere else we are improperly
3962      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3963      * places it is called, and related regcp() routines. - Yves */
3964 #if 1
3965     if (prog->nparens) {
3966         regexp_paren_pair *pp = prog->offs;
3967         I32 i;
3968         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3969             ++pp;
3970             pp->start = -1;
3971             pp->end = -1;
3972         }
3973     }
3974 #endif
3975     REGCP_SET(lastcp);
3976     result = regmatch(reginfo, *startposp, progi->program + 1);
3977     if (result != -1) {
3978         prog->offs[0].end = result;
3979         return 1;
3980     }
3981     if (reginfo->cutpoint)
3982         *startposp= reginfo->cutpoint;
3983     REGCP_UNWIND(lastcp);
3984     return 0;
3985 }
3986
3987 /* this is used to determine how far from the left messages like
3988    'failed...' are printed in regexec.c. It should be set such that
3989    messages are inline with the regop output that created them.
3990 */
3991 #define REPORT_CODE_OFF 29
3992 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3993 #ifdef DEBUGGING
3994 int
3995 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3996 {
3997     va_list ap;
3998     int result;
3999     PerlIO *f= Perl_debug_log;
4000     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4001     va_start(ap, depth);
4002     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4003     result = PerlIO_vprintf(f, fmt, ap);
4004     va_end(ap);
4005     return result;
4006 }
4007 #endif /* DEBUGGING */
4008
4009 /* grab a new slab and return the first slot in it */
4010
4011 STATIC regmatch_state *
4012 S_push_slab(pTHX)
4013 {
4014     regmatch_slab *s = PL_regmatch_slab->next;
4015     if (!s) {
4016         Newx(s, 1, regmatch_slab);
4017         s->prev = PL_regmatch_slab;
4018         s->next = NULL;
4019         PL_regmatch_slab->next = s;
4020     }
4021     PL_regmatch_slab = s;
4022     return SLAB_FIRST(s);
4023 }
4024
4025 #ifdef DEBUGGING
4026
4027 STATIC void
4028 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4029     const char *start, const char *end, const char *blurb)
4030 {
4031     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4032
4033     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4034
4035     if (!PL_colorset)
4036             reginitcolors();
4037     {
4038         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4039             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4040
4041         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4042             start, end - start, PL_dump_re_max_len);
4043
4044         Perl_re_printf( aTHX_
4045             "%s%s REx%s %s against %s\n",
4046                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4047
4048         if (utf8_target||utf8_pat)
4049             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4050                 utf8_pat ? "pattern" : "",
4051                 utf8_pat && utf8_target ? " and " : "",
4052                 utf8_target ? "string" : ""
4053             );
4054     }
4055 }
4056
4057 STATIC void
4058 S_dump_exec_pos(pTHX_ const char *locinput,
4059                       const regnode *scan,
4060                       const char *loc_regeol,
4061                       const char *loc_bostr,
4062                       const char *loc_reg_starttry,
4063                       const bool utf8_target,
4064                       const U32 depth
4065                 )
4066 {
4067     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4068     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4069     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4070     /* The part of the string before starttry has one color
4071        (pref0_len chars), between starttry and current
4072        position another one (pref_len - pref0_len chars),
4073        after the current position the third one.
4074        We assume that pref0_len <= pref_len, otherwise we
4075        decrease pref0_len.  */
4076     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4077         ? (5 + taill) - l : locinput - loc_bostr;
4078     int pref0_len;
4079
4080     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4081
4082     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4083         pref_len++;
4084     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4085     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4086         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4087               ? (5 + taill) - pref_len : loc_regeol - locinput);
4088     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4089         l--;
4090     if (pref0_len < 0)
4091         pref0_len = 0;
4092     if (pref0_len > pref_len)
4093         pref0_len = pref_len;
4094     {
4095         const int is_uni = utf8_target ? 1 : 0;
4096
4097         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4098             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4099
4100         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4101                     (locinput - pref_len + pref0_len),
4102                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4103
4104         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4105                     locinput, loc_regeol - locinput, 10, 0, 1);
4106
4107         const STRLEN tlen=len0+len1+len2;
4108         Perl_re_printf( aTHX_
4109                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4110                     (IV)(locinput - loc_bostr),
4111                     len0, s0,
4112                     len1, s1,
4113                     (docolor ? "" : "> <"),
4114                     len2, s2,
4115                     (int)(tlen > 19 ? 0 :  19 - tlen),
4116                     "",
4117                     depth);
4118     }
4119 }
4120
4121 #endif
4122
4123 /* reg_check_named_buff_matched()
4124  * Checks to see if a named buffer has matched. The data array of
4125  * buffer numbers corresponding to the buffer is expected to reside
4126  * in the regexp->data->data array in the slot stored in the ARG() of
4127  * node involved. Note that this routine doesn't actually care about the
4128  * name, that information is not preserved from compilation to execution.
4129  * Returns the index of the leftmost defined buffer with the given name
4130  * or 0 if non of the buffers matched.
4131  */
4132 STATIC I32
4133 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4134 {
4135     I32 n;
4136     RXi_GET_DECL(rex,rexi);
4137     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4138     I32 *nums=(I32*)SvPVX(sv_dat);
4139
4140     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4141
4142     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4143         if ((I32)rex->lastparen >= nums[n] &&
4144             rex->offs[nums[n]].end != -1)
4145         {
4146             return nums[n];
4147         }
4148     }
4149     return 0;
4150 }
4151
4152 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4153 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4154 #define CHRTEST_NOT_A_CP_1 -999
4155 #define CHRTEST_NOT_A_CP_2 -998
4156
4157 static bool
4158 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4159         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4160 {
4161     /* This function determines if there are zero, one, two, or more characters
4162      * that match the first character of the passed-in EXACTish node
4163      * <text_node>, and if there are one or two, it returns them in the
4164      * passed-in pointers.
4165      *
4166      * If it determines that no possible character in the target string can
4167      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4168      * the first character in <text_node> requires UTF-8 to represent, and the
4169      * target string isn't in UTF-8.)
4170      *
4171      * If there are more than two characters that could match the beginning of
4172      * <text_node>, or if more context is required to determine a match or not,
4173      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4174      *
4175      * The motiviation behind this function is to allow the caller to set up
4176      * tight loops for matching.  If <text_node> is of type EXACT, there is
4177      * only one possible character that can match its first character, and so
4178      * the situation is quite simple.  But things get much more complicated if
4179      * folding is involved.  It may be that the first character of an EXACTFish
4180      * node doesn't participate in any possible fold, e.g., punctuation, so it
4181      * can be matched only by itself.  The vast majority of characters that are
4182      * in folds match just two things, their lower and upper-case equivalents.
4183      * But not all are like that; some have multiple possible matches, or match
4184      * sequences of more than one character.  This function sorts all that out.
4185      *
4186      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4187      * loop of trying to match A*, we know we can't exit where the thing
4188      * following it isn't a B.  And something can't be a B unless it is the
4189      * beginning of B.  By putting a quick test for that beginning in a tight
4190      * loop, we can rule out things that can't possibly be B without having to
4191      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4192      * character, we can make a tight loop matching A*, using the outputs of
4193      * this function.
4194      *
4195      * If the target string to match isn't in UTF-8, and there aren't
4196      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4197      * the one or two possible octets (which are characters in this situation)
4198      * that can match.  In all cases, if there is only one character that can
4199      * match, *<c1p> and *<c2p> will be identical.
4200      *
4201      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4202      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4203      * can match the beginning of <text_node>.  They should be declared with at
4204      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4205      * undefined what these contain.)  If one or both of the buffers are
4206      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4207      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4208      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4209      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4210      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4211
4212     const bool utf8_target = reginfo->is_utf8_target;
4213
4214     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4215     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4216     bool use_chrtest_void = FALSE;
4217     const bool is_utf8_pat = reginfo->is_utf8_pat;
4218
4219     /* Used when we have both utf8 input and utf8 output, to avoid converting
4220      * to/from code points */
4221     bool utf8_has_been_setup = FALSE;
4222
4223     dVAR;
4224
4225     U8 *pat = (U8*)STRING(text_node);
4226     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4227
4228     if (   OP(text_node) == EXACT
4229         || OP(text_node) == EXACT_ONLY8
4230         || OP(text_node) == EXACTL)
4231     {
4232
4233         /* In an exact node, only one thing can be matched, that first
4234          * character.  If both the pat and the target are UTF-8, we can just
4235          * copy the input to the output, avoiding finding the code point of
4236          * that character */
4237         if (!is_utf8_pat) {
4238             assert(OP(text_node) != EXACT_ONLY8);
4239             c2 = c1 = *pat;
4240         }
4241         else if (utf8_target) {
4242             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4243             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4244             utf8_has_been_setup = TRUE;
4245         }
4246         else if (OP(text_node) == EXACT_ONLY8) {
4247             return FALSE;   /* Can only match UTF-8 target */
4248         }
4249         else {
4250             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4251         }
4252     }
4253     else { /* an EXACTFish node */
4254         U8 *pat_end = pat + STR_LEN(text_node);
4255
4256         /* An EXACTFL node has at least some characters unfolded, because what
4257          * they match is not known until now.  So, now is the time to fold
4258          * the first few of them, as many as are needed to determine 'c1' and
4259          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4260          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4261          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4262          * need to fold as many characters as a single character can fold to,
4263          * so that later we can check if the first ones are such a multi-char
4264          * fold.  But, in such a pattern only locale-problematic characters
4265          * aren't folded, so we can skip this completely if the first character
4266          * in the node isn't one of the tricky ones */
4267         if (OP(text_node) == EXACTFL) {
4268
4269             if (! is_utf8_pat) {
4270                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4271                 {
4272                     folded[0] = folded[1] = 's';
4273                     pat = folded;
4274                     pat_end = folded + 2;
4275                 }
4276             }
4277             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4278                 U8 *s = pat;
4279                 U8 *d = folded;
4280                 int i;
4281
4282                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4283                     if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
4284                         *(d++) = (U8) toFOLD_LC(*s);
4285                         s++;
4286                     }
4287                     else {
4288                         STRLEN len;
4289                         _toFOLD_utf8_flags(s,
4290                                            pat_end,
4291                                            d,
4292                                            &len,
4293                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4294                         d += len;
4295                         s += UTF8SKIP(s);
4296                     }
4297                 }
4298
4299                 pat = folded;
4300                 pat_end = d;
4301             }
4302         }
4303
4304         if (    ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4305              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4306         {
4307             /* Multi-character folds require more context to sort out.  Also
4308              * PL_utf8_foldclosures used below doesn't handle them, so have to
4309              * be handled outside this routine */
4310             use_chrtest_void = TRUE;
4311         }
4312         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4313             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4314
4315             if (   UNLIKELY(PL_in_utf8_turkic_locale)
4316                 && OP(text_node) == EXACTFL
4317                 && UNLIKELY(   c1 == 'i' || c1 == 'I'
4318                             || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
4319                             || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
4320             {   /* Hard-coded Turkish locale rules for these 4 characters
4321                    override normal rules */
4322                 if (c1 == 'i') {
4323                     c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4324                 }
4325                 else if (c1 == 'I') {
4326                     c2 = LATIN_SMALL_LETTER_DOTLESS_I;
4327                 }
4328                 else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
4329                     c2 = 'i';
4330                 }
4331                 else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
4332                     c2 = 'I';
4333                 }
4334             }
4335             else if (c1 > 255) {
4336                 const unsigned int * remaining_folds;
4337                 unsigned int first_fold;
4338
4339                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4340                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4341                 Size_t folds_count = _inverse_folds(c1, &first_fold,
4342                                                        &remaining_folds);
4343                 if (folds_count == 0) {
4344                     c2 = c1;    /* there is only a single character that could
4345                                    match */
4346                 }
4347                 else if (folds_count != 1) {
4348                     /* If there aren't exactly two folds to this (itself and
4349                      * another), it is outside the scope of this function */
4350                     use_chrtest_void = TRUE;
4351                 }
4352                 else {  /* There are two.  We already have one, get the other */
4353                     c2 = first_fold;
4354
4355                     /* Folds that cross the 255/256 boundary are forbidden if
4356                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4357                      * ASCIII.  The only other match to c1 is c2, and since c1
4358                      * is above 255, c2 better be as well under these
4359                      * circumstances.  If it isn't, it means the only legal
4360                      * match of c1 is itself. */
4361                     if (    c2 < 256
4362                         && (   (   OP(text_node) == EXACTFL
4363                                 && ! IN_UTF8_CTYPE_LOCALE)
4364                             || ((     OP(text_node) == EXACTFAA
4365                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4366                                 && (isASCII(c1) || isASCII(c2)))))
4367                     {
4368                         c2 = c1;
4369                     }
4370                 }
4371             }
4372             else /* Here, c1 is <= 255 */
4373                 if (   utf8_target
4374                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4375                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4376                     && (   (   OP(text_node) != EXACTFAA
4377                             && OP(text_node) != EXACTFAA_NO_TRIE)
4378                         ||   ! isASCII(c1)))
4379             {
4380                 /* Here, there could be something above Latin1 in the target
4381                  * which folds to this character in the pattern.  All such
4382                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4383                  * than two characters involved in their folds, so are outside
4384                  * the scope of this function */
4385                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4386                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4387                 }
4388                 else {
4389                     use_chrtest_void = TRUE;
4390                 }
4391             }
4392             else { /* Here nothing above Latin1 can fold to the pattern
4393                       character */
4394                 switch (OP(text_node)) {
4395
4396                     case EXACTFL:   /* /l rules */
4397                         c2 = PL_fold_locale[c1];
4398                         break;
4399
4400                     case EXACTF:   /* This node only generated for non-utf8
4401                                     patterns */
4402                         assert(! is_utf8_pat);
4403                         if (! utf8_target) {    /* /d rules */
4404                             c2 = PL_fold[c1];
4405                             break;
4406                         }
4407                         /* FALLTHROUGH */
4408                         /* /u rules for all these.  This happens to work for
4409                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4410                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4411                                                 non-utf8 patterns */
4412                         assert(! is_utf8_pat);
4413                         /* FALLTHROUGH */
4414                     case EXACTFAA:
4415                     case EXACTFUP:
4416                     case EXACTFU:
4417                         c2 = PL_fold_latin1[c1];
4418                         break;
4419                     case EXACTFU_ONLY8:
4420                         return FALSE;
4421                         NOT_REACHED; /* NOTREACHED */
4422
4423                     default:
4424                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4425                         NOT_REACHED; /* NOTREACHED */
4426                 }
4427             }
4428         }
4429     }
4430
4431     /* Here have figured things out.  Set up the returns */
4432     if (use_chrtest_void) {
4433         *c2p = *c1p = CHRTEST_VOID;
4434     }
4435     else if (utf8_target) {
4436         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4437             uvchr_to_utf8(c1_utf8, c1);
4438             uvchr_to_utf8(c2_utf8, c2);
4439         }
4440
4441         /* Invariants are stored in both the utf8 and byte outputs; Use
4442          * negative numbers otherwise for the byte ones.  Make sure that the
4443          * byte ones are the same iff the utf8 ones are the same */
4444         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4445         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4446                 ? *c2_utf8
4447                 : (c1 == c2)
4448                   ? CHRTEST_NOT_A_CP_1
4449                   : CHRTEST_NOT_A_CP_2;
4450     }
4451     else if (c1 > 255) {
4452        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4453                            can represent */
4454            return FALSE;
4455        }
4456
4457        *c1p = *c2p = c2;    /* c2 is the only representable value */
4458     }
4459     else {  /* c1 is representable; see about c2 */
4460        *c1p = c1;
4461        *c2p = (c2 < 256) ? c2 : c1;
4462     }
4463
4464     return TRUE;
4465 }
4466
4467 STATIC bool
4468 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4469 {
4470     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4471      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4472
4473     PERL_ARGS_ASSERT_ISGCB;
4474
4475     switch (GCB_table[before][after]) {
4476         case GCB_BREAKABLE:
4477             return TRUE;
4478
4479         case GCB_NOBREAK:
4480             return FALSE;
4481
4482         case GCB_RI_then_RI:
4483             {
4484                 int RI_count = 1;
4485                 U8 * temp_pos = (U8 *) curpos;
4486
4487                 /* Do not break within emoji flag sequences. That is, do not
4488                  * break between regional indicator (RI) symbols if there is an
4489                  * odd number of RI characters before the break point.
4490                  *  GB12   sot (RI RI)* RI × RI
4491                  *  GB13 [^RI] (RI RI)* RI × RI */
4492
4493                 while (backup_one_GCB(strbeg,
4494                                     &temp_pos,
4495                                     utf8_target) == GCB_Regional_Indicator)
4496                 {
4497                     RI_count++;
4498                 }
4499
4500                 return RI_count % 2 != 1;
4501             }
4502
4503         case GCB_EX_then_EM:
4504
4505             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4506             {
4507                 U8 * temp_pos = (U8 *) curpos;
4508                 GCB_enum prev;
4509
4510                 do {
4511                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4512                 }
4513                 while (prev == GCB_Extend);
4514
4515                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4516             }
4517
4518         case GCB_Maybe_Emoji_NonBreak:
4519
4520             {
4521
4522             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4523               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4524               */
4525                 U8 * temp_pos = (U8 *) curpos;
4526                 GCB_enum prev;
4527
4528                 do {
4529                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4530                 }
4531                 while (prev == GCB_Extend);
4532
4533                 return prev != GCB_XPG_XX;
4534             }
4535
4536         default:
4537             break;
4538     }
4539
4540 #ifdef DEBUGGING
4541     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4542                                   before, after, GCB_table[before][after]);
4543     assert(0);
4544 #endif
4545     return TRUE;
4546 }
4547
4548 STATIC GCB_enum
4549 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4550 {
4551     dVAR;
4552     GCB_enum gcb;
4553
4554     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4555
4556     if (*curpos < strbeg) {
4557         return GCB_EDGE;
4558     }
4559
4560     if (utf8_target) {
4561         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4562         U8 * prev_prev_char_pos;
4563
4564         if (! prev_char_pos) {
4565             return GCB_EDGE;
4566         }
4567
4568         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4569             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4570             *curpos = prev_char_pos;
4571             prev_char_pos = prev_prev_char_pos;
4572         }
4573         else {
4574             *curpos = (U8 *) strbeg;
4575             return GCB_EDGE;
4576         }
4577     }
4578     else {
4579         if (*curpos - 2 < strbeg) {
4580             *curpos = (U8 *) strbeg;
4581             return GCB_EDGE;
4582         }
4583         (*curpos)--;
4584         gcb = getGCB_VAL_CP(*(*curpos - 1));
4585     }
4586
4587     return gcb;
4588 }
4589
4590 /* Combining marks attach to most classes that precede them, but this defines
4591  * the exceptions (from TR14) */
4592 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4593                                      || prev == LB_Mandatory_Break      \
4594                                      || prev == LB_Carriage_Return      \
4595                                      || prev == LB_Line_Feed            \
4596                                      || prev == LB_Next_Line            \
4597                                      || prev == LB_Space                \
4598                                      || prev == LB_ZWSpace))
4599
4600 STATIC bool
4601 S_isLB(pTHX_ LB_enum before,
4602              LB_enum after,
4603              const U8 * const strbeg,
4604              const U8 * const curpos,
4605              const U8 * const strend,
4606              const bool utf8_target)
4607 {
4608     U8 * temp_pos = (U8 *) curpos;
4609     LB_enum prev = before;
4610
4611     /* Is the boundary between 'before' and 'after' line-breakable?
4612      * Most of this is just a table lookup of a generated table from Unicode
4613      * rules.  But some rules require context to decide, and so have to be
4614      * implemented in code */
4615
4616     PERL_ARGS_ASSERT_ISLB;
4617
4618     /* Rule numbers in the comments below are as of Unicode 9.0 */
4619
4620   redo:
4621     before = prev;
4622     switch (LB_table[before][after]) {
4623         case LB_BREAKABLE:
4624             return TRUE;
4625
4626         case LB_NOBREAK:
4627         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4628             return FALSE;
4629
4630         case LB_SP_foo + LB_BREAKABLE:
4631         case LB_SP_foo + LB_NOBREAK:
4632         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4633
4634             /* When we have something following a SP, we have to look at the
4635              * context in order to know what to do.
4636              *
4637              * SP SP should not reach here because LB7: Do not break before
4638              * spaces.  (For two spaces in a row there is nothing that
4639              * overrides that) */
4640             assert(after != LB_Space);
4641
4642             /* Here we have a space followed by a non-space.  Mostly this is a
4643              * case of LB18: "Break after spaces".  But there are complications
4644              * as the handling of spaces is somewhat tricky.  They are in a
4645              * number of rules, which have to be applied in priority order, but
4646              * something earlier in the string can cause a rule to be skipped
4647              * and a lower priority rule invoked.  A prime example is LB7 which
4648              * says don't break before a space.  But rule LB8 (lower priority)
4649              * says that the first break opportunity after a ZW is after any
4650              * span of spaces immediately after it.  If a ZW comes before a SP
4651              * in the input, rule LB8 applies, and not LB7.  Other such rules
4652              * involve combining marks which are rules 9 and 10, but they may
4653              * override higher priority rules if they come earlier in the
4654              * string.  Since we're doing random access into the middle of the
4655              * string, we have to look for rules that should get applied based
4656              * on both string position and priority.  Combining marks do not
4657              * attach to either ZW nor SP, so we don't have to consider them
4658              * until later.
4659              *
4660              * To check for LB8, we have to find the first non-space character
4661              * before this span of spaces */
4662             do {
4663                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4664             }
4665             while (prev == LB_Space);
4666
4667             /* LB8 Break before any character following a zero-width space,
4668              * even if one or more spaces intervene.
4669              *      ZW SP* ÷
4670              * So if we have a ZW just before this span, and to get here this
4671              * is the final space in the span. */
4672             if (prev == LB_ZWSpace) {
4673                 return TRUE;
4674             }
4675
4676             /* Here, not ZW SP+.  There are several rules that have higher
4677              * priority than LB18 and can be resolved now, as they don't depend
4678              * on anything earlier in the string (except ZW, which we have
4679              * already handled).  One of these rules is LB11 Do not break
4680              * before Word joiner, but we have specially encoded that in the
4681              * lookup table so it is caught by the single test below which
4682              * catches the other ones. */
4683             if (LB_table[LB_Space][after] - LB_SP_foo
4684                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4685             {
4686                 return FALSE;
4687             }
4688
4689             /* If we get here, we have to XXX consider combining marks. */
4690             if (prev == LB_Combining_Mark) {
4691
4692                 /* What happens with these depends on the character they
4693                  * follow.  */
4694                 do {
4695                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4696                 }
4697                 while (prev == LB_Combining_Mark);
4698
4699                 /* Most times these attach to and inherit the characteristics
4700                  * of that character, but not always, and when not, they are to
4701                  * be treated as AL by rule LB10. */
4702                 if (! LB_CM_ATTACHES_TO(prev)) {
4703                     prev = LB_Alphabetic;
4704                 }
4705             }
4706
4707             /* Here, we have the character preceding the span of spaces all set
4708              * up.  We follow LB18: "Break after spaces" unless the table shows
4709              * that is overriden */
4710             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4711
4712         case LB_CM_ZWJ_foo:
4713
4714             /* We don't know how to treat the CM except by looking at the first
4715              * non-CM character preceding it.  ZWJ is treated as CM */
4716             do {
4717                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4718             }
4719             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4720
4721             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4722              * attatches to it, then it inherits the behavior of 'prev'.  If it
4723              * doesn't attach, it is to be treated as an AL */
4724             if (! LB_CM_ATTACHES_TO(prev)) {
4725                 prev = LB_Alphabetic;
4726             }
4727
4728             goto redo;
4729
4730         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4731         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4732
4733             /* LB21a Don't break after Hebrew + Hyphen.
4734              * HL (HY | BA) × */
4735
4736             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4737                                                           == LB_Hebrew_Letter)
4738             {
4739                 return FALSE;
4740             }
4741
4742             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4743
4744         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4745         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4746
4747             /* LB25a (PR | PO) × ( OP | HY )? NU */
4748             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4749                 return FALSE;
4750             }
4751
4752             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4753                                                                 == LB_BREAKABLE;
4754
4755         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4756         case LB_SY_or_IS_then_various + LB_NOBREAK:
4757         {
4758             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4759
4760             LB_enum temp = prev;
4761             do {
4762                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4763             }
4764             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4765             if (temp == LB_Numeric) {
4766                 return FALSE;
4767             }
4768
4769             return LB_table[prev][after] - LB_SY_or_IS_then_various
4770                                                                == LB_BREAKABLE;
4771         }
4772
4773         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4774         case LB_various_then_PO_or_PR + LB_NOBREAK:
4775         {
4776             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4777
4778             LB_enum temp = prev;
4779             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4780             {
4781                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4782             }
4783             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4784                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4785             }
4786             if (temp == LB_Numeric) {
4787                 return FALSE;
4788             }
4789             return LB_various_then_PO_or_PR;
4790         }
4791
4792         case LB_RI_then_RI + LB_NOBREAK:
4793         case LB_RI_then_RI + LB_BREAKABLE:
4794             {
4795                 int RI_count = 1;
4796
4797                 /* LB30a Break between two regional indicator symbols if and
4798                  * only if there are an even number of regional indicators
4799                  * preceding the position of the break.
4800                  *
4801                  *    sot (RI RI)* RI × RI
4802                  *  [^RI] (RI RI)* RI × RI */
4803
4804                 while (backup_one_LB(strbeg,
4805                                      &temp_pos,
4806                                      utf8_target) == LB_Regional_Indicator)
4807                 {
4808                     RI_count++;
4809                 }
4810
4811                 return RI_count % 2 == 0;
4812             }
4813
4814         default:
4815             break;
4816     }
4817
4818 #ifdef DEBUGGING
4819     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4820                                   before, after, LB_table[before][after]);
4821     assert(0);
4822 #endif
4823     return TRUE;
4824 }
4825
4826 STATIC LB_enum
4827 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4828 {
4829     dVAR;
4830
4831     LB_enum lb;
4832
4833     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4834
4835     if (*curpos >= strend) {
4836         return LB_EDGE;
4837     }
4838
4839     if (utf8_target) {
4840         *curpos += UTF8SKIP(*curpos);
4841         if (*curpos >= strend) {
4842             return LB_EDGE;
4843         }
4844         lb = getLB_VAL_UTF8(*curpos, strend);
4845     }
4846     else {
4847         (*curpos)++;
4848         if (*curpos >= strend) {
4849             return LB_EDGE;
4850         }
4851         lb = getLB_VAL_CP(**curpos);
4852     }
4853
4854     return lb;
4855 }
4856
4857 STATIC LB_enum
4858 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4859 {
4860     dVAR;
4861     LB_enum lb;
4862
4863     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4864
4865     if (*curpos < strbeg) {
4866         return LB_EDGE;
4867     }
4868
4869     if (utf8_target) {
4870         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4871         U8 * prev_prev_char_pos;
4872
4873         if (! prev_char_pos) {
4874             return LB_EDGE;
4875         }
4876
4877         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4878             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4879             *curpos = prev_char_pos;
4880             prev_char_pos = prev_prev_char_pos;
4881         }
4882         else {
4883             *curpos = (U8 *) strbeg;
4884             return LB_EDGE;
4885         }
4886     }
4887     else {
4888         if (*curpos - 2 < strbeg) {
4889             *curpos = (U8 *) strbeg;
4890             return LB_EDGE;
4891         }
4892         (*curpos)--;
4893         lb = getLB_VAL_CP(*(*curpos - 1));
4894     }
4895
4896     return lb;
4897 }
4898
4899 STATIC bool
4900 S_isSB(pTHX_ SB_enum before,
4901              SB_enum after,
4902              const U8 * const strbeg,
4903              const U8 * const curpos,
4904              const U8 * const strend,
4905              const bool utf8_target)
4906 {
4907     /* returns a boolean indicating if there is a Sentence Boundary Break
4908      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4909
4910     U8 * lpos = (U8 *) curpos;
4911     bool has_para_sep = FALSE;
4912     bool has_sp = FALSE;
4913
4914     PERL_ARGS_ASSERT_ISSB;
4915
4916     /* Break at the start and end of text.
4917         SB1.  sot  ÷
4918         SB2.  ÷  eot
4919       But unstated in Unicode is don't break if the text is empty */
4920     if (before == SB_EDGE || after == SB_EDGE) {
4921         return before != after;
4922     }
4923
4924     /* SB 3: Do not break within CRLF. */
4925     if (before == SB_CR && after == SB_LF) {
4926         return FALSE;
4927     }
4928
4929     /* Break after paragraph separators.  CR and LF are considered
4930      * so because Unicode views text as like word processing text where there
4931      * are no newlines except between paragraphs, and the word processor takes
4932      * care of wrapping without there being hard line-breaks in the text *./
4933        SB4.  Sep | CR | LF  ÷ */
4934     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4935         return TRUE;
4936     }
4937
4938     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4939      * (See Section 6.2, Replacing Ignore Rules.)
4940         SB5.  X (Extend | Format)*  →  X */
4941     if (after == SB_Extend || after == SB_Format) {
4942
4943         /* Implied is that the these characters attach to everything
4944          * immediately prior to them except for those separator-type
4945          * characters.  And the rules earlier have already handled the case
4946          * when one of those immediately precedes the extend char */
4947         return FALSE;
4948     }
4949
4950     if (before == SB_Extend || before == SB_Format) {
4951         U8 * temp_pos = lpos;
4952         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4953         if (   backup != SB_EDGE
4954             && backup != SB_Sep
4955             && backup != SB_CR
4956             && backup != SB_LF)
4957         {
4958             before = backup;
4959             lpos = temp_pos;
4960         }
4961
4962         /* Here, both 'before' and 'backup' are these types; implied is that we
4963          * don't break between them */
4964         if (backup == SB_Extend || backup == SB_Format) {
4965             return FALSE;
4966         }
4967     }
4968
4969     /* Do not break after ambiguous terminators like period, if they are
4970      * immediately followed by a number or lowercase letter, if they are
4971      * between uppercase letters, if the first following letter (optionally
4972      * after certain punctuation) is lowercase, or if they are followed by
4973      * "continuation" punctuation such as comma, colon, or semicolon. For
4974      * example, a period may be an abbreviation or numeric period, and thus may
4975      * not mark the end of a sentence.
4976
4977      * SB6. ATerm  ×  Numeric */
4978     if (before == SB_ATerm && after == SB_Numeric) {
4979         return FALSE;
4980     }
4981
4982     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4983     if (before == SB_ATerm && after == SB_Upper) {
4984         U8 * temp_pos = lpos;
4985         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4986         if (backup == SB_Upper || backup == SB_Lower) {
4987             return FALSE;
4988         }
4989     }
4990
4991     /* The remaining rules that aren't the final one, all require an STerm or
4992      * an ATerm after having backed up over some Close* Sp*, and in one case an
4993      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4994      * So do that backup now, setting flags if either Sp or a paragraph
4995      * separator are found */
4996
4997     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4998         has_para_sep = TRUE;
4999         before = backup_one_SB(strbeg, &lpos, utf8_target);
5000     }
5001
5002     if (before == SB_Sp) {
5003         has_sp = TRUE;
5004         do {
5005             before = backup_one_SB(strbeg, &lpos, utf8_target);
5006         }
5007         while (before == SB_Sp);
5008     }
5009
5010     while (before == SB_Close) {
5011         before = backup_one_SB(strbeg, &lpos, utf8_target);
5012     }
5013
5014     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5015      * most cases an STerm */
5016     if (before == SB_STerm || before == SB_ATerm) {
5017
5018         /* So, here the lhs matches
5019          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5020          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5021          * The rules that apply here are:
5022          *
5023          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5024                                            | LF | STerm | ATerm) )* Lower
5025            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5026            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5027            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5028            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5029          */
5030
5031         /* And all but SB11 forbid having seen a paragraph separator */
5032         if (! has_para_sep) {
5033             if (before == SB_ATerm) {          /* SB8 */
5034                 U8 * rpos = (U8 *) curpos;
5035                 SB_enum later = after;
5036
5037                 while (    later != SB_OLetter
5038                         && later != SB_Upper
5039                         && later != SB_Lower
5040                         && later != SB_Sep
5041                         && later != SB_CR
5042                         && later != SB_LF
5043                         && later != SB_STerm
5044                         && later != SB_ATerm
5045                         && later != SB_EDGE)
5046                 {
5047                     later = advance_one_SB(&rpos, strend, utf8_target);
5048                 }
5049                 if (later == SB_Lower) {
5050                     return FALSE;
5051                 }
5052             }
5053
5054             if (   after == SB_SContinue    /* SB8a */
5055                 || after == SB_STerm
5056                 || after == SB_ATerm)
5057             {
5058                 return FALSE;
5059             }
5060
5061             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5062                 if (   after == SB_Close
5063                     || after == SB_Sp
5064                     || after == SB_Sep
5065                     || after == SB_CR
5066                     || after == SB_LF)
5067                 {
5068                     return FALSE;
5069                 }
5070             }
5071
5072             /* SB10.  This and SB9 could probably be combined some way, but khw
5073              * has decided to follow the Unicode rule book precisely for
5074              * simplified maintenance */
5075             if (   after == SB_Sp
5076                 || after == SB_Sep
5077                 || after == SB_CR
5078                 || after == SB_LF)
5079             {
5080                 return FALSE;
5081             }
5082         }
5083
5084         /* SB11.  */
5085         return TRUE;
5086     }
5087
5088     /* Otherwise, do not break.
5089     SB12.  Any  ×  Any */
5090
5091     return FALSE;
5092 }
5093
5094 STATIC SB_enum
5095 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5096 {
5097     dVAR;
5098     SB_enum sb;
5099
5100     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5101
5102     if (*curpos >= strend) {
5103         return SB_EDGE;
5104     }
5105
5106     if (utf8_target) {
5107         do {
5108             *curpos += UTF8SKIP(*curpos);
5109             if (*curpos >= strend) {
5110                 return SB_EDGE;
5111             }
5112             sb = getSB_VAL_UTF8(*curpos, strend);
5113         } while (sb == SB_Extend || sb == SB_Format);
5114     }
5115     else {
5116         do {
5117             (*curpos)++;
5118             if (*curpos >= strend) {
5119                 return SB_EDGE;
5120             }
5121             sb = getSB_VAL_CP(**curpos);
5122         } while (sb == SB_Extend || sb == SB_Format);
5123     }
5124
5125     return sb;
5126 }
5127
5128 STATIC SB_enum
5129 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5130 {
5131     dVAR;
5132     SB_enum sb;
5133
5134     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5135
5136     if (*curpos < strbeg) {
5137         return SB_EDGE;
5138     }
5139
5140     if (utf8_target) {
5141         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5142         if (! prev_char_pos) {
5143             return SB_EDGE;
5144         }
5145
5146         /* Back up over Extend and Format.  curpos is always just to the right
5147          * of the characater whose value we are getting */
5148         do {
5149             U8 * prev_prev_char_pos;
5150             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5151                                                                       strbeg)))
5152             {
5153                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5154                 *curpos = prev_char_pos;
5155                 prev_char_pos = prev_prev_char_pos;
5156             }
5157             else {
5158                 *curpos = (U8 *) strbeg;
5159                 return SB_EDGE;
5160             }
5161         } while (sb == SB_Extend || sb == SB_Format);
5162     }
5163     else {
5164         do {
5165             if (*curpos - 2 < strbeg) {
5166                 *curpos = (U8 *) strbeg;
5167                 return SB_EDGE;
5168             }
5169             (*curpos)--;
5170             sb = getSB_VAL_CP(*(*curpos - 1));
5171         } while (sb == SB_Extend || sb == SB_Format);
5172     }
5173
5174     return sb;
5175 }
5176
5177 STATIC bool
5178 S_isWB(pTHX_ WB_enum previous,
5179              WB_enum before,
5180              WB_enum after,
5181              const U8 * const strbeg,
5182              const U8 * const curpos,
5183              const U8 * const strend,
5184              const bool utf8_target)
5185 {
5186     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5187      *  a Unicode word break, using their published algorithm, but tailored for
5188      *  Perl by treating spans of white space as one unit.  Context may be
5189      *  needed to make this determination.  If the value for the character
5190      *  before 'before' is known, it is passed as 'previous'; otherwise that
5191      *  should be set to WB_UNKNOWN.  The other input parameters give the
5192      *  boundaries and current position in the matching of the string.  That
5193      *  is, 'curpos' marks the position where the character whose wb value is
5194      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5195
5196     U8 * before_pos = (U8 *) curpos;
5197     U8 * after_pos = (U8 *) curpos;
5198     WB_enum prev = before;
5199     WB_enum next;
5200
5201     PERL_ARGS_ASSERT_ISWB;
5202
5203     /* Rule numbers in the comments below are as of Unicode 9.0 */
5204
5205   redo:
5206     before = prev;
5207     switch (WB_table[before][after]) {
5208         case WB_BREAKABLE:
5209             return TRUE;
5210
5211         case WB_NOBREAK:
5212             return FALSE;
5213
5214         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5215             next = advance_one_WB(&after_pos, strend, utf8_target,
5216                                  FALSE /* Don't skip Extend nor Format */ );
5217             /* A space immediately preceeding an Extend or Format is attached
5218              * to by them, and hence gets separated from previous spaces.
5219              * Otherwise don't break between horizontal white space */
5220             return next == WB_Extend || next == WB_Format;
5221
5222         /* WB4 Ignore Format and Extend characters, except when they appear at
5223          * the beginning of a region of text.  This code currently isn't
5224          * general purpose, but it works as the rules are currently and likely
5225          * to be laid out.  The reason it works is that when 'they appear at
5226          * the beginning of a region of text', the rule is to break before
5227          * them, just like any other character.  Therefore, the default rule
5228          * applies and we don't have to look in more depth.  Should this ever
5229          * change, we would have to have 2 'case' statements, like in the rules
5230          * below, and backup a single character (not spacing over the extend
5231          * ones) and then see if that is one of the region-end characters and
5232          * go from there */
5233         case WB_Ex_or_FO_or_ZWJ_then_foo:
5234             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5235             goto redo;
5236
5237         case WB_DQ_then_HL + WB_BREAKABLE:
5238         case WB_DQ_then_HL + WB_NOBREAK:
5239
5240             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5241
5242             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5243                                                             == WB_Hebrew_Letter)
5244             {
5245                 return FALSE;
5246             }
5247
5248              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5249
5250         case WB_HL_then_DQ + WB_BREAKABLE:
5251         case WB_HL_then_DQ + WB_NOBREAK:
5252
5253             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5254
5255             if (advance_one_WB(&after_pos, strend, utf8_target,
5256                                        TRUE /* Do skip Extend and Format */ )
5257                                                             == WB_Hebrew_Letter)
5258             {
5259                 return FALSE;
5260             }
5261
5262             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5263
5264         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5265         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5266
5267             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5268              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5269
5270             next = advance_one_WB(&after_pos, strend, utf8_target,
5271                                        TRUE /* Do skip Extend and Format */ );
5272
5273             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5274             {
5275                 return FALSE;
5276             }
5277
5278             return WB_table[before][after]
5279                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5280
5281         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5282         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5283
5284             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5285              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5286
5287             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5288             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5289             {
5290                 return FALSE;
5291             }
5292
5293             return WB_table[before][after]
5294                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5295
5296         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5297         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5298
5299             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5300              * */
5301
5302             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5303                                                             == WB_Numeric)
5304             {
5305                 return FALSE;
5306             }
5307
5308             return WB_table[before][after]
5309                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5310
5311         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5312         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5313
5314             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5315
5316             if (advance_one_WB(&after_pos, strend, utf8_target,
5317                                        TRUE /* Do skip Extend and Format */ )
5318                                                             == WB_Numeric)
5319             {
5320                 return FALSE;
5321             }
5322
5323             return WB_table[before][after]
5324                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5325
5326         case WB_RI_then_RI + WB_NOBREAK:
5327         case WB_RI_then_RI + WB_BREAKABLE:
5328             {
5329                 int RI_count = 1;
5330
5331                 /* Do not break within emoji flag sequences. That is, do not
5332                  * break between regional indicator (RI) symbols if there is an
5333                  * odd number of RI characters before the potential break
5334                  * point.
5335                  *
5336                  * WB15   sot (RI RI)* RI × RI
5337                  * WB16 [^RI] (RI RI)* RI × RI */
5338
5339                 while (backup_one_WB(&previous,
5340                                      strbeg,
5341                                      &before_pos,
5342                                      utf8_target) == WB_Regional_Indicator)
5343                 {
5344                     RI_count++;
5345                 }
5346
5347                 return RI_count % 2 != 1;
5348             }
5349
5350         default:
5351             break;
5352     }
5353
5354 #ifdef DEBUGGING
5355     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5356                                   before, after, WB_table[before][after]);
5357     assert(0);
5358 #endif
5359     return TRUE;
5360 }
5361
5362 STATIC WB_enum
5363 S_advance_one_WB(pTHX_ U8 ** curpos,
5364                        const U8 * const strend,
5365                        const bool utf8_target,
5366                        const bool skip_Extend_Format)
5367 {
5368     dVAR;
5369     WB_enum wb;
5370
5371     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5372
5373     if (*curpos >= strend) {
5374         return WB_EDGE;
5375     }
5376
5377     if (utf8_target) {
5378
5379         /* Advance over Extend and Format */
5380         do {
5381             *curpos += UTF8SKIP(*curpos);
5382             if (*curpos >= strend) {
5383                 return WB_EDGE;
5384             }
5385             wb = getWB_VAL_UTF8(*curpos, strend);
5386         } while (    skip_Extend_Format
5387                  && (wb == WB_Extend || wb == WB_Format));
5388     }
5389     else {
5390         do {
5391             (*curpos)++;
5392             if (*curpos >= strend) {
5393                 return WB_EDGE;
5394             }
5395             wb = getWB_VAL_CP(**curpos);
5396         } while (    skip_Extend_Format
5397                  && (wb == WB_Extend || wb == WB_Format));
5398     }
5399
5400     return wb;
5401 }
5402
5403 STATIC WB_enum
5404 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5405 {
5406     dVAR;
5407     WB_enum wb;
5408
5409     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5410
5411     /* If we know what the previous character's break value is, don't have
5412         * to look it up */
5413     if (*previous != WB_UNKNOWN) {
5414         wb = *previous;
5415
5416         /* But we need to move backwards by one */
5417         if (utf8_target) {
5418             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5419             if (! *curpos) {
5420                 *previous = WB_EDGE;
5421                 *curpos = (U8 *) strbeg;
5422             }
5423             else {
5424                 *previous = WB_UNKNOWN;
5425             }
5426         }
5427         else {
5428             (*curpos)--;
5429             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5430         }
5431
5432         /* And we always back up over these three types */
5433         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5434             return wb;
5435         }
5436     }
5437
5438     if (*curpos < strbeg) {
5439         return WB_EDGE;
5440     }
5441
5442     if (utf8_target) {
5443         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5444         if (! prev_char_pos) {
5445             return WB_EDGE;
5446         }
5447
5448         /* Back up over Extend and Format.  curpos is always just to the right
5449          * of the characater whose value we are getting */
5450         do {
5451             U8 * prev_prev_char_pos;
5452             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5453                                                    -1,
5454                                                    strbeg)))
5455             {
5456                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5457                 *curpos = prev_char_pos;
5458                 prev_char_pos = prev_prev_char_pos;
5459             }
5460             else {
5461                 *curpos = (U8 *) strbeg;
5462                 return WB_EDGE;
5463             }
5464         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5465     }
5466     else {
5467         do {
5468             if (*curpos - 2 < strbeg) {
5469                 *curpos = (U8 *) strbeg;
5470                 return WB_EDGE;
5471             }
5472             (*curpos)--;
5473             wb = getWB_VAL_CP(*(*curpos - 1));
5474         } while (wb == WB_Extend || wb == WB_Format);
5475     }
5476
5477     return wb;
5478 }
5479
5480 /* Macros for regmatch(), using its internal variables */
5481 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
5482 #define NEXTCHR_IS_EOS (nextchr < 0)
5483
5484 #define SET_nextchr \
5485     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
5486
5487 #define SET_locinput(p) \
5488     locinput = (p);  \
5489     SET_nextchr
5490
5491 #define sayYES goto yes
5492 #define sayNO goto no
5493 #define sayNO_SILENT goto no_silent
5494
5495 /* we dont use STMT_START/END here because it leads to
5496    "unreachable code" warnings, which are bogus, but distracting. */
5497 #define CACHEsayNO \
5498     if (ST.cache_mask) \
5499        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
5500     sayNO
5501
5502 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5503 (                                                           \
5504     (   ( st )                                         ) && \
5505     (   ( st )->u.eval.close_paren                     ) && \
5506     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5507 )
5508
5509 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5510 (                                                           \
5511     (   ( st )                                         ) && \
5512     (   ( st )->u.eval.close_paren                     ) && \
5513     (   ( expr )                                       ) && \
5514     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5515 )
5516
5517
5518 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5519     (st)->u.eval.close_paren = ( (expr) + 1 )
5520
5521 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5522     (st)->u.eval.close_paren = 0
5523
5524 /* push a new state then goto it */
5525
5526 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
5527     pushinput = input; \
5528     pusheol = eol; \
5529     pushsr0 = sr0; \
5530     scan = node; \
5531     st->resume_state = state; \
5532     goto push_state;
5533
5534 /* push a new state with success backtracking, then goto it */
5535
5536 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
5537     pushinput = input; \
5538     pusheol = eol;     \
5539     pushsr0 = sr0; \
5540     scan = node; \
5541     st->resume_state = state; \
5542     goto push_yes_state;
5543
5544 #define DEBUG_STATE_pp(pp)                                  \
5545     DEBUG_STATE_r({                                         \
5546         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
5547         Perl_re_printf( aTHX_                               \
5548             "%*s" pp " %s%s%s%s%s\n",                       \
5549             INDENT_CHARS(depth), "",                        \
5550             PL_reg_name[st->resume_state],                  \
5551             ((st==yes_state||st==mark_state) ? "[" : ""),   \
5552             ((st==yes_state) ? "Y" : ""),                   \
5553             ((st==mark_state) ? "M" : ""),                  \
5554             ((st==yes_state||st==mark_state) ? "]" : "")    \
5555         );                                                  \
5556     });
5557
5558 /*
5559
5560 regmatch() - main matching routine
5561
5562 This is basically one big switch statement in a loop. We execute an op,
5563 set 'next' to point the next op, and continue. If we come to a point which
5564 we may need to backtrack to on failure such as (A|B|C), we push a
5565 backtrack state onto the backtrack stack. On failure, we pop the top
5566 state, and re-enter the loop at the state indicated. If there are no more
5567 states to pop, we return failure.
5568
5569 Sometimes we also need to backtrack on success; for example /A+/, where
5570 after successfully matching one A, we need to go back and try to
5571 match another one; similarly for lookahead assertions: if the assertion
5572 completes successfully, we backtrack to the state just before the assertion
5573 and then carry on.  In these cases, the pushed state is marked as
5574 'backtrack on success too'. This marking is in fact done by a chain of
5575 pointers, each pointing to the previous 'yes' state. On success, we pop to
5576 the nearest yes state, discarding any intermediate failure-only states.
5577 Sometimes a yes state is pushed just to force some cleanup code to be
5578 called at the end of a successful match or submatch; e.g. (??{$re}) uses
5579 it to free the inner regex.
5580
5581 Note that failure backtracking rewinds the cursor position, while
5582 success backtracking leaves it alone.
5583
5584 A pattern is complete when the END op is executed, while a subpattern
5585 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
5586 ops trigger the "pop to last yes state if any, otherwise return true"
5587 behaviour.
5588
5589 A common convention in this function is to use A and B to refer to the two
5590 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
5591 the subpattern to be matched possibly multiple times, while B is the entire
5592 rest of the pattern. Variable and state names reflect this convention.
5593
5594 The states in the main switch are the union of ops and failure/success of
5595 substates associated with with that op.  For example, IFMATCH is the op
5596 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
5597 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
5598 successfully matched A and IFMATCH_A_fail is a state saying that we have
5599 just failed to match A. Resume states always come in pairs. The backtrack
5600 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
5601 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
5602 on success or failure.
5603
5604 The struct that holds a backtracking state is actually a big union, with
5605 one variant for each major type of op. The variable st points to the
5606 top-most backtrack struct. To make the code clearer, within each
5607 block of code we #define ST to alias the relevant union.
5608
5609 Here's a concrete example of a (vastly oversimplified) IFMATCH
5610 implementation:
5611
5612     switch (state) {
5613     ....
5614
5615 #define ST st->u.ifmatch
5616
5617     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5618         ST.foo = ...; // some state we wish to save
5619         ...
5620         // push a yes backtrack state with a resume value of
5621         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
5622         // first node of A:
5623         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
5624         // NOTREACHED
5625
5626     case IFMATCH_A: // we have successfully executed A; now continue with B
5627         next = B;
5628         bar = ST.foo; // do something with the preserved value
5629         break;
5630
5631     case IFMATCH_A_fail: // A failed, so the assertion failed
5632         ...;   // do some housekeeping, then ...
5633         sayNO; // propagate the failure
5634
5635 #undef ST
5636
5637     ...
5638     }
5639
5640 For any old-timers reading this who are familiar with the old recursive
5641 approach, the code above is equivalent to:
5642
5643     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5644     {
5645         int foo = ...
5646         ...
5647         if (regmatch(A)) {
5648             next = B;
5649             bar = foo;
5650             break;
5651         }
5652         ...;   // do some housekeeping, then ...
5653         sayNO; // propagate the failure
5654     }
5655
5656 The topmost backtrack state, pointed to by st, is usually free. If you
5657 want to claim it, populate any ST.foo fields in it with values you wish to
5658 save, then do one of
5659
5660         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
5661         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
5662
5663 which sets that backtrack state's resume value to 'resume_state', pushes a
5664 new free entry to the top of the backtrack stack, then goes to 'node'.
5665 On backtracking, the free slot is popped, and the saved state becomes the
5666 new free state. An ST.foo field in this new top state can be temporarily
5667 accessed to retrieve values, but once the main loop is re-entered, it
5668 becomes available for reuse.
5669
5670 Note that the depth of the backtrack stack constantly increases during the
5671 left-to-right execution of the pattern, rather than going up and down with
5672 the pattern nesting. For example the stack is at its maximum at Z at the
5673 end of the pattern, rather than at X in the following:
5674
5675     /(((X)+)+)+....(Y)+....Z/
5676
5677 The only exceptions to this are lookahead/behind assertions and the cut,
5678 (?>A), which pop all the backtrack states associated with A before
5679 continuing.
5680
5681 Backtrack state structs are allocated in slabs of about 4K in size.
5682 PL_regmatch_state and st always point to the currently active state,
5683 and PL_regmatch_slab points to the slab currently containing
5684 PL_regmatch_state.  The first time regmatch() is called, the first slab is
5685 allocated, and is never freed until interpreter destruction. When the slab
5686 is full, a new one is allocated and chained to the end. At exit from
5687 regmatch(), slabs allocated since entry are freed.
5688
5689 In order to work with variable length lookbehinds, an upper limit is placed on
5690 lookbehinds which is set to where the match position is at the end of where the
5691 lookbehind would get to.  Nothing in the lookbehind should match above that,
5692 except we should be able to look beyond if for things like \b, which need the
5693 next character in the string to be able to determine if this is a boundary or
5694 not.  We also can't match the end of string/line unless we are also at the end
5695 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
5696 that match a width, we have to add a condition that they are within the legal
5697 bounds of our window into the string.
5698
5699 */
5700
5701 /* returns -1 on failure, $+[0] on success */
5702 STATIC SSize_t
5703 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5704 {
5705     dVAR;
5706     const bool utf8_target = reginfo->is_utf8_target;
5707     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5708     REGEXP *rex_sv = reginfo->prog;
5709     regexp *rex = ReANY(rex_sv);
5710     RXi_GET_DECL(rex,rexi);
5711     /* the current state. This is a cached copy of PL_regmatch_state */
5712     regmatch_state *st;
5713     /* cache heavy used fields of st in registers */
5714     regnode *scan;
5715     regnode *next;
5716     U32 n = 0;  /* general value; init to avoid compiler warning */
5717     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5718     SSize_t endref = 0; /* offset of end of backref when ln is start */
5719     char *locinput = startpos;
5720     char *loceol = reginfo->strend;
5721     char *pushinput; /* where to continue after a PUSH */
5722     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
5723     U8   *pushsr0;   /* save starting pos of script run */
5724     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5725
5726     bool result = 0;        /* return value of S_regmatch */
5727     U32 depth = 0;            /* depth of backtrack stack */
5728     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5729     const U32 max_nochange_depth =
5730         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5731         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5732     regmatch_state *yes_state = NULL; /* state to pop to on success of
5733                                                             subpattern */
5734     /* mark_state piggy backs on the yes_state logic so that when we unwind
5735        the stack on success we can update the mark_state as we go */
5736     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5737     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5738     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5739     U32 state_num;
5740     bool no_final = 0;      /* prevent failure from backtracking? */
5741     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5742     char *startpoint = locinput;
5743     SV *popmark = NULL;     /* are we looking for a mark? */
5744     SV *sv_commit = NULL;   /* last mark name seen in failure */
5745     SV *sv_yes_mark = NULL; /* last mark name we have seen
5746                                during a successful match */
5747     U32 lastopen = 0;       /* last open we saw */
5748     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5749     SV* const oreplsv = GvSVn(PL_replgv);
5750     /* these three flags are set by various ops to signal information to
5751      * the very next op. They have a useful lifetime of exactly one loop
5752      * iteration, and are not preserved or restored by state pushes/pops
5753      */
5754     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5755     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5756     int logical = 0;        /* the following EVAL is:
5757                                 0: (?{...})
5758                                 1: (?(?{...})X|Y)
5759                                 2: (??{...})
5760                                or the following IFMATCH/UNLESSM is:
5761                                 false: plain (?=foo)
5762                                 true:  used as a condition: (?(?=foo))
5763                             */
5764     PAD* last_pad = NULL;
5765     dMULTICALL;
5766     U8 gimme = G_SCALAR;
5767     CV *caller_cv = NULL;       /* who called us */
5768     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5769     U32 maxopenparen = 0;       /* max '(' index seen so far */
5770     int to_complement;  /* Invert the result? */
5771     _char_class_number classnum;
5772     bool is_utf8_pat = reginfo->is_utf8_pat;
5773     bool match = FALSE;
5774     I32 orig_savestack_ix = PL_savestack_ix;
5775     U8 * script_run_begin = NULL;
5776
5777 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5778 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5779 #  define SOLARIS_BAD_OPTIMIZER
5780     const U32 *pl_charclass_dup = PL_charclass;
5781 #  define PL_charclass pl_charclass_dup
5782 #endif
5783
5784 #ifdef DEBUGGING
5785     GET_RE_DEBUG_FLAGS_DECL;
5786 #endif
5787
5788     /* protect against undef(*^R) */
5789     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5790
5791     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5792     multicall_oldcatch = 0;
5793     PERL_UNUSED_VAR(multicall_cop);
5794
5795     PERL_ARGS_ASSERT_REGMATCH;
5796
5797     st = PL_regmatch_state;
5798
5799     /* Note that nextchr is a byte even in UTF */
5800     SET_nextchr;
5801     scan = prog;
5802
5803     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5804             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5805             Perl_re_printf( aTHX_ "regmatch start\n" );
5806     }));
5807
5808     while (scan != NULL) {
5809         next = scan + NEXT_OFF(scan);
5810         if (next == scan)
5811             next = NULL;
5812         state_num = OP(scan);
5813
5814       reenter_switch:
5815         DEBUG_EXECUTE_r(
5816             if (state_num <= REGNODE_MAX) {
5817                 SV * const prop = sv_newmortal();
5818                 regnode *rnext = regnext(scan);
5819
5820                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5821                 regprop(rex, prop, scan, reginfo, NULL);
5822                 Perl_re_printf( aTHX_
5823                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5824                     INDENT_CHARS(depth), "",
5825                     (IV)(scan - rexi->program),
5826                     SvPVX_const(prop),
5827                     (PL_regkind[OP(scan)] == END || !rnext) ?
5828                         0 : (IV)(rnext - rexi->program));
5829             }
5830         );
5831
5832         to_complement = 0;
5833
5834         SET_nextchr;
5835         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5836
5837         switch (state_num) {
5838         case SBOL: /*  /^../ and /\A../  */
5839             if (locinput == reginfo->strbeg)
5840                 break;
5841             sayNO;
5842
5843         case MBOL: /*  /^../m  */
5844             if (locinput == reginfo->strbeg ||
5845                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5846             {
5847                 break;
5848             }
5849             sayNO;
5850
5851         case GPOS: /*  \G  */
5852             if (locinput == reginfo->ganch)
5853                 break;
5854             sayNO;
5855
5856         case KEEPS: /*   \K  */
5857             /* update the startpoint */
5858             st->u.keeper.val = rex->offs[0].start;
5859             rex->offs[0].start = locinput - reginfo->strbeg;
5860             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
5861                             script_run_begin);
5862             NOT_REACHED; /* NOTREACHED */
5863
5864         case KEEPS_next_fail:
5865             /* rollback the start point change */
5866             rex->offs[0].start = st->u.keeper.val;
5867             sayNO_SILENT;
5868             NOT_REACHED; /* NOTREACHED */
5869
5870         case MEOL: /* /..$/m  */
5871             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5872                 sayNO;
5873             break;
5874
5875         case SEOL: /* /..$/  */
5876             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5877                 sayNO;
5878             if (reginfo->strend - locinput > 1)
5879                 sayNO;
5880             break;
5881
5882         case EOS: /*  \z  */
5883             if (!NEXTCHR_IS_EOS)
5884                 sayNO;
5885             break;
5886
5887         case SANY: /*  /./s  */
5888             if (NEXTCHR_IS_EOS || locinput >= loceol)
5889                 sayNO;
5890             goto increment_locinput;
5891
5892         case REG_ANY: /*  /./  */
5893             if (   NEXTCHR_IS_EOS
5894                 || locinput >= loceol
5895                 || nextchr == '\n')
5896             {
5897                 sayNO;
5898             }
5899             goto increment_locinput;
5900
5901
5902 #undef  ST
5903 #define ST st->u.trie
5904         case TRIEC: /* (ab|cd) with known charclass */
5905             /* In this case the charclass data is available inline so
5906                we can fail fast without a lot of extra overhead.
5907              */
5908             if ( !   NEXTCHR_IS_EOS
5909                 &&   locinput < loceol
5910                 && ! ANYOF_BITMAP_TEST(scan, nextchr))
5911             {
5912                 DEBUG_EXECUTE_r(
5913                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5914                               depth, PL_colors[4], PL_colors[5])
5915                 );
5916                 sayNO_SILENT;
5917                 NOT_REACHED; /* NOTREACHED */
5918             }
5919             /* FALLTHROUGH */
5920         case TRIE:  /* (ab|cd)  */
5921             /* the basic plan of execution of the trie is:
5922              * At the beginning, run though all the states, and
5923              * find the longest-matching word. Also remember the position
5924              * of the shortest matching word. For example, this pattern:
5925              *    1  2 3 4    5
5926              *    ab|a|x|abcd|abc
5927              * when matched against the string "abcde", will generate
5928              * accept states for all words except 3, with the longest
5929              * matching word being 4, and the shortest being 2 (with
5930              * the position being after char 1 of the string).
5931              *
5932              * Then for each matching word, in word order (i.e. 1,2,4,5),
5933              * we run the remainder of the pattern; on each try setting
5934              * the current position to the character following the word,
5935              * returning to try the next word on failure.
5936              *
5937              * We avoid having to build a list of words at runtime by
5938              * using a compile-time structure, wordinfo[].prev, which
5939              * gives, for each word, the previous accepting word (if any).
5940              * In the case above it would contain the mappings 1->2, 2->0,
5941              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5942              * the longest word (4 above), a list of all words, by
5943              * following the list of prev pointers; this gives us the
5944              * unordered list 4,5,1,2. Then given the current word we have
5945              * just tried, we can go through the list and find the
5946              * next-biggest word to try (so if we just failed on word 2,
5947              * the next in the list is 4).
5948              *
5949              * Since at runtime we don't record the matching position in
5950              * the string for each word, we have to work that out for
5951              * each word we're about to process. The wordinfo table holds
5952              * the character length of each word; given that we recorded
5953              * at the start: the position of the shortest word and its
5954              * length in chars, we just need to move the pointer the
5955              * difference between the two char lengths. Depending on
5956              * Unicode status and folding, that's cheap or expensive.
5957              *
5958              * This algorithm is optimised for the case where are only a
5959              * small number of accept states, i.e. 0,1, or maybe 2.
5960              * With lots of accepts states, and having to try all of them,
5961              * it becomes quadratic on number of accept states to find all
5962              * the next words.
5963              */
5964
5965             {
5966                 /* what type of TRIE am I? (utf8 makes this contextual) */
5967                 DECL_TRIE_TYPE(scan);
5968
5969                 /* what trie are we using right now */
5970                 reg_trie_data * const trie
5971                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5972                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5973                 U32 state = trie->startstate;
5974
5975                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5976                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5977                     if (utf8_target
5978                         && ! NEXTCHR_IS_EOS
5979                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5980                         && scan->flags == EXACTL)
5981                     {
5982                         /* We only output for EXACTL, as we let the folder
5983                          * output this message for EXACTFLU8 to avoid
5984                          * duplication */
5985                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5986                                                                reginfo->strend);
5987                     }
5988                 }
5989                 if (   trie->bitmap
5990                     && (     NEXTCHR_IS_EOS
5991                         ||   locinput >= loceol
5992                         || ! TRIE_BITMAP_TEST(trie, nextchr)))
5993                 {
5994                     if (trie->states[ state ].wordnum) {
5995                          DEBUG_EXECUTE_r(
5996                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5997                                           depth, PL_colors[4], PL_colors[5])
5998                         );
5999                         if (!trie->jump)
6000                             break;
6001                     } else {
6002                         DEBUG_EXECUTE_r(
6003                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6004                                           depth, PL_colors[4], PL_colors[5])
6005                         );
6006                         sayNO_SILENT;
6007                    }
6008                 }
6009
6010             {
6011                 U8 *uc = ( U8* )locinput;
6012
6013                 STRLEN len = 0;
6014                 STRLEN foldlen = 0;
6015                 U8 *uscan = (U8*)NULL;
6016                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6017                 U32 charcount = 0; /* how many input chars we have matched */
6018                 U32 accepted = 0; /* have we seen any accepting states? */
6019
6020                 ST.jump = trie->jump;
6021                 ST.me = scan;
6022                 ST.firstpos = NULL;
6023                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6024                 ST.nextword = 0;
6025
6026                 /* fully traverse the TRIE; note the position of the
6027                    shortest accept state and the wordnum of the longest
6028                    accept state */
6029
6030                 while ( state && uc <= (U8*)(loceol) ) {
6031                     U32 base = trie->states[ state ].trans.base;
6032                     UV uvc = 0;
6033                     U16 charid = 0;
6034                     U16 wordnum;
6035                     wordnum = trie->states[ state ].wordnum;
6036
6037                     if (wordnum) { /* it's an accept state */
6038                         if (!accepted) {
6039                             accepted = 1;
6040                             /* record first match position */
6041                             if (ST.longfold) {
6042                                 ST.firstpos = (U8*)locinput;
6043                                 ST.firstchars = 0;
6044                             }
6045                             else {
6046                                 ST.firstpos = uc;
6047                                 ST.firstchars = charcount;
6048                             }
6049                         }
6050                         if (!ST.nextword || wordnum < ST.nextword)
6051                             ST.nextword = wordnum;
6052                         ST.topword = wordnum;
6053                     }
6054
6055                     DEBUG_TRIE_EXECUTE_r({
6056                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6057                                 /* HERE */
6058                                 PerlIO_printf( Perl_debug_log,
6059                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6060                                     INDENT_CHARS(depth), "", PL_colors[4],
6061                                     (UV)state, (accepted ? 'Y' : 'N'));
6062                     });
6063
6064                     /* read a char and goto next state */
6065                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6066                         I32 offset;
6067                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6068                                              (U8 *) loceol, uscan,
6069                                              len, uvc, charid, foldlen,
6070                                              foldbuf, uniflags);
6071                         charcount++;
6072                         if (foldlen>0)
6073                             ST.longfold = TRUE;
6074                         if (charid &&
6075                              ( ((offset =
6076                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6077
6078                              && ((U32)offset < trie->lasttrans)
6079                              && trie->trans[offset].check == state)
6080                         {
6081                             state = trie->trans[offset].next;
6082                         }
6083                         else {
6084                             state = 0;
6085                         }
6086                         uc += len;
6087
6088                     }
6089                     else {
6090                         state = 0;
6091                     }
6092                     DEBUG_TRIE_EXECUTE_r(
6093                         Perl_re_printf( aTHX_
6094                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6095                             charid, uvc, (UV)state, PL_colors[5] );
6096                     );
6097                 }
6098                 if (!accepted)
6099                    sayNO;
6100
6101                 /* calculate total number of accept states */
6102                 {
6103                     U16 w = ST.topword;
6104                     accepted = 0;
6105                     while (w) {
6106                         w = trie->wordinfo[w].prev;
6107                         accepted++;
6108                     }
6109                     ST.accepted = accepted;
6110                 }
6111
6112                 DEBUG_EXECUTE_r(
6113                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6114                         depth,
6115                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6116                 );
6117                 goto trie_first_try; /* jump into the fail handler */
6118             }}
6119             NOT_REACHED; /* NOTREACHED */
6120
6121         case TRIE_next_fail: /* we failed - try next alternative */
6122         {
6123             U8 *uc;
6124             if ( ST.jump ) {
6125                 /* undo any captures done in the tail part of a branch,
6126                  * e.g.
6127                  *    /(?:X(.)(.)|Y(.)).../
6128                  * where the trie just matches X then calls out to do the
6129                  * rest of the branch */
6130                 REGCP_UNWIND(ST.cp);
6131                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6132             }
6133             if (!--ST.accepted) {
6134                 DEBUG_EXECUTE_r({
6135                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6136                         depth,
6137                         PL_colors[4],
6138                         PL_colors[5] );
6139                 });
6140                 sayNO_SILENT;
6141             }
6142             {
6143                 /* Find next-highest word to process.  Note that this code
6144                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6145                 U16 min = 0;
6146                 U16 word;
6147                 U16 const nextword = ST.nextword;
6148                 reg_trie_wordinfo * const wordinfo
6149                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6150                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6151                     if (word > nextword && (!min || word < min))
6152                         min = word;
6153                 }
6154                 ST.nextword = min;
6155             }
6156
6157           trie_first_try:
6158             if (do_cutgroup) {
6159                 do_cutgroup = 0;
6160                 no_final = 0;
6161             }
6162
6163             if ( ST.jump ) {
6164                 ST.lastparen = rex->lastparen;
6165                 ST.lastcloseparen = rex->lastcloseparen;
6166                 REGCP_SET(ST.cp);
6167             }
6168
6169             /* find start char of end of current word */
6170             {
6171                 U32 chars; /* how many chars to skip */
6172                 reg_trie_data * const trie
6173                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6174
6175                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6176                             >=  ST.firstchars);
6177                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6178                             - ST.firstchars;
6179                 uc = ST.firstpos;
6180
6181                 if (ST.longfold) {
6182                     /* the hard option - fold each char in turn and find
6183                      * its folded length (which may be different */
6184                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6185                     STRLEN foldlen;
6186                     STRLEN len;
6187                     UV uvc;
6188                     U8 *uscan;
6189
6190                     while (chars) {
6191                         if (utf8_target) {
6192                             /* XXX This assumes the length is well-formed, as
6193                              * does the UTF8SKIP below */
6194                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6195                                                     uniflags);
6196                             uc += len;
6197                         }
6198                         else {
6199                             uvc = *uc;
6200                             uc++;
6201                         }
6202                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6203                         uscan = foldbuf;
6204                         while (foldlen) {
6205                             if (!--chars)
6206                                 break;
6207                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6208                                                  uniflags);
6209                             uscan += len;
6210                             foldlen -= len;
6211                         }
6212                     }
6213                 }
6214                 else {
6215                     if (utf8_target)
6216                         while (chars--)
6217                             uc += UTF8SKIP(uc);
6218                     else
6219                         uc += chars;
6220                 }
6221             }
6222
6223             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6224                             ? ST.jump[ST.nextword]
6225                             : NEXT_OFF(ST.me));
6226
6227             DEBUG_EXECUTE_r({
6228                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6229                     depth,
6230                     PL_colors[4],
6231                     ST.nextword,
6232                     PL_colors[5]
6233                     );
6234             });
6235
6236             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6237                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6238                                 script_run_begin);
6239                 NOT_REACHED; /* NOTREACHED */
6240             }
6241             /* only one choice left - just continue */
6242             DEBUG_EXECUTE_r({
6243                 AV *const trie_words
6244                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6245                 SV ** const tmp = trie_words
6246                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6247                 SV *sv= tmp ? sv_newmortal() : NULL;
6248
6249                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6250                     depth, PL_colors[4],
6251                     ST.nextword,
6252                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6253                             PL_colors[0], PL_colors[1],
6254                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6255                         )
6256                     : "not compiled under -Dr",
6257                     PL_colors[5] );
6258             });
6259
6260             locinput = (char*)uc;
6261             continue; /* execute rest of RE */
6262             /* NOTREACHED */
6263         }
6264 #undef  ST
6265
6266         case EXACTL:             /*  /abc/l       */
6267             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6268
6269             /* Complete checking would involve going through every character
6270              * matched by the string to see if any is above latin1.  But the
6271              * comparision otherwise might very well be a fast assembly
6272              * language routine, and I (khw) don't think slowing things down
6273              * just to check for this warning is worth it.  So this just checks
6274              * the first character */
6275             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6276                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6277             }
6278             goto do_exact;
6279         case EXACT_ONLY8:
6280             if (! utf8_target) {
6281                 sayNO;
6282             }
6283             /* FALLTHROUGH */
6284         case EXACT: {            /*  /abc/        */
6285             char *s;
6286           do_exact:
6287             s = STRING(scan);
6288             ln = STR_LEN(scan);
6289             if (utf8_target != is_utf8_pat) {
6290                 /* The target and the pattern have differing utf8ness. */
6291                 char *l = locinput;
6292                 const char * const e = s + ln;
6293
6294                 if (utf8_target) {
6295                     /* The target is utf8, the pattern is not utf8.
6296                      * Above-Latin1 code points can't match the pattern;
6297                      * invariants match exactly, and the other Latin1 ones need
6298                      * to be downgraded to a single byte in order to do the
6299                      * comparison.  (If we could be confident that the target
6300                      * is not malformed, this could be refactored to have fewer
6301                      * tests by just assuming that if the first bytes match, it
6302                      * is an invariant, but there are tests in the test suite
6303                      * dealing with (??{...}) which violate this) */
6304                     while (s < e) {
6305                         if (   l >= loceol
6306                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6307                         {
6308                             sayNO;
6309                         }
6310                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6311                             if (*l != *s) {
6312                                 sayNO;
6313                             }
6314                             l++;
6315                         }
6316                         else {
6317                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6318                             {
6319                                 sayNO;
6320                             }
6321                             l += 2;
6322                         }
6323                         s++;
6324                     }
6325                 }
6326                 else {
6327                     /* The target is not utf8, the pattern is utf8. */
6328                     while (s < e) {
6329                         if (   l >= loceol
6330                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6331                         {
6332                             sayNO;
6333                         }
6334                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6335                             if (*s != *l) {
6336                                 sayNO;
6337                             }
6338                             s++;
6339                         }
6340                         else {
6341                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6342                             {
6343                                 sayNO;
6344                             }
6345                             s += 2;
6346                         }
6347                         l++;
6348                     }
6349                 }
6350                 locinput = l;
6351             }
6352             else {
6353                 /* The target and the pattern have the same utf8ness. */
6354                 /* Inline the first character, for speed. */
6355                 if (   loceol - locinput < ln
6356                     || UCHARAT(s) != nextchr
6357                     || (ln > 1 && memNE(s, locinput, ln)))
6358                 {
6359                     sayNO;
6360                 }
6361                 locinput += ln;
6362             }
6363             break;
6364             }
6365
6366         case EXACTFL:            /*  /abc/il      */
6367           {
6368             re_fold_t folder;
6369             const U8 * fold_array;
6370             const char * s;
6371             U32 fold_utf8_flags;
6372
6373             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6374             folder = foldEQ_locale;
6375             fold_array = PL_fold_locale;
6376             fold_utf8_flags = FOLDEQ_LOCALE;
6377             goto do_exactf;
6378
6379         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6380                                       is effectively /u; hence to match, target
6381                                       must be UTF-8. */
6382             if (! utf8_target) {
6383                 sayNO;
6384             }
6385             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6386                                              | FOLDEQ_S2_FOLDS_SANE;
6387             folder = foldEQ_latin1_s2_folded;
6388             fold_array = PL_fold_latin1;
6389             goto do_exactf;
6390
6391         case EXACTFU_ONLY8:      /* /abc/iu with something in /abc/ > 255 */
6392             if (! utf8_target) {
6393                 sayNO;
6394             }
6395             assert(is_utf8_pat);
6396             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6397             goto do_exactf;
6398
6399         case EXACTFUP:          /*  /foo/iu, and something is problematic in
6400                                     'foo' so can't take shortcuts. */
6401             assert(! is_utf8_pat);
6402             folder = foldEQ_latin1;
6403             fold_array = PL_fold_latin1;
6404             fold_utf8_flags = 0;
6405             goto do_exactf;
6406
6407         case EXACTFU:            /*  /abc/iu      */
6408             folder = foldEQ_latin1_s2_folded;
6409             fold_array = PL_fold_latin1;
6410             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6411             goto do_exactf;
6412
6413         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6414                                    patterns */
6415             assert(! is_utf8_pat);
6416             /* FALLTHROUGH */
6417         case EXACTFAA:            /*  /abc/iaa     */
6418             folder = foldEQ_latin1_s2_folded;
6419             fold_array = PL_fold_latin1;
6420             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6421             if (is_utf8_pat || ! utf8_target) {
6422
6423                 /* The possible presence of a MICRO SIGN in the pattern forbids
6424                  * us to view a non-UTF-8 pattern as folded when there is a
6425                  * UTF-8 target */
6426                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
6427                                   |FOLDEQ_S2_FOLDS_SANE;
6428             }
6429             goto do_exactf;
6430
6431
6432         case EXACTF:             /*  /abc/i    This node only generated for
6433                                                non-utf8 patterns */
6434             assert(! is_utf8_pat);
6435             folder = foldEQ;
6436             fold_array = PL_fold;
6437             fold_utf8_flags = 0;
6438
6439           do_exactf:
6440             s = STRING(scan);
6441             ln = STR_LEN(scan);
6442
6443             if (   utf8_target
6444                 || is_utf8_pat
6445                 || state_num == EXACTFUP
6446                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6447             {
6448               /* Either target or the pattern are utf8, or has the issue where
6449                * the fold lengths may differ. */
6450                 const char * const l = locinput;
6451                 char *e = loceol;
6452
6453                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
6454                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
6455                 {
6456                     sayNO;
6457                 }
6458                 locinput = e;
6459                 break;
6460             }
6461
6462             /* Neither the target nor the pattern are utf8 */
6463             if (UCHARAT(s) != nextchr
6464                 && !NEXTCHR_IS_EOS
6465                 && UCHARAT(s) != fold_array[nextchr])
6466             {
6467                 sayNO;
6468             }
6469             if (loceol - locinput < ln)
6470                 sayNO;
6471             if (ln > 1 && ! folder(locinput, s, ln))
6472                 sayNO;
6473             locinput += ln;
6474             break;
6475         }
6476
6477         case NBOUNDL: /*  /\B/l  */
6478             to_complement = 1;
6479             /* FALLTHROUGH */
6480
6481         case BOUNDL:  /*  /\b/l  */
6482         {
6483             bool b1, b2;
6484             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6485
6486             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6487                 if (! IN_UTF8_CTYPE_LOCALE) {
6488                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6489                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6490                 }
6491                 goto boundu;
6492             }
6493
6494             if (utf8_target) {
6495                 if (locinput == reginfo->strbeg)
6496                     b1 = isWORDCHAR_LC('\n');
6497                 else {
6498                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6499                                                         (U8*)(reginfo->strbeg)),
6500                                                  (U8*)(reginfo->strend));
6501                 }
6502                 b2 = (NEXTCHR_IS_EOS)
6503                     ? isWORDCHAR_LC('\n')
6504                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6505                                               (U8*) reginfo->strend);
6506             }
6507             else { /* Here the string isn't utf8 */
6508                 b1 = (locinput == reginfo->strbeg)
6509                      ? isWORDCHAR_LC('\n')
6510                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6511                 b2 = (NEXTCHR_IS_EOS)
6512                     ? isWORDCHAR_LC('\n')
6513                     : isWORDCHAR_LC(nextchr);
6514             }
6515             if (to_complement ^ (b1 == b2)) {
6516                 sayNO;
6517             }
6518             break;
6519         }
6520
6521         case NBOUND:  /*  /\B/   */
6522             to_complement = 1;
6523             /* FALLTHROUGH */
6524
6525         case BOUND:   /*  /\b/   */
6526             if (utf8_target) {
6527                 goto bound_utf8;
6528             }
6529             goto bound_ascii_match_only;
6530
6531         case NBOUNDA: /*  /\B/a  */
6532             to_complement = 1;
6533             /* FALLTHROUGH */
6534
6535         case BOUNDA:  /*  /\b/a  */
6536         {
6537             bool b1, b2;
6538
6539           bound_ascii_match_only:
6540             /* Here the string isn't utf8, or is utf8 and only ascii characters
6541              * are to match \w.  In the latter case looking at the byte just
6542              * prior to the current one may be just the final byte of a
6543              * multi-byte character.  This is ok.  There are two cases:
6544              * 1) it is a single byte character, and then the test is doing
6545              *    just what it's supposed to.
6546              * 2) it is a multi-byte character, in which case the final byte is
6547              *    never mistakable for ASCII, and so the test will say it is
6548              *    not a word character, which is the correct answer. */
6549             b1 = (locinput == reginfo->strbeg)
6550                  ? isWORDCHAR_A('\n')
6551                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6552             b2 = (NEXTCHR_IS_EOS)
6553                 ? isWORDCHAR_A('\n')
6554                 : isWORDCHAR_A(nextchr);
6555             if (to_complement ^ (b1 == b2)) {
6556                 sayNO;
6557             }
6558             break;
6559         }
6560
6561         case NBOUNDU: /*  /\B/u  */
6562             to_complement = 1;
6563             /* FALLTHROUGH */
6564
6565         case BOUNDU:  /*  /\b/u  */
6566
6567           boundu:
6568             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6569                 match = FALSE;
6570             }
6571             else if (utf8_target) {
6572               bound_utf8:
6573                 switch((bound_type) FLAGS(scan)) {
6574                     case TRADITIONAL_BOUND:
6575                     {
6576                         bool b1, b2;
6577                         b1 = (locinput == reginfo->strbeg)
6578                              ? 0 /* isWORDCHAR_L1('\n') */
6579                              : isWORDCHAR_utf8_safe(
6580                                                reghop3((U8*)locinput,
6581                                                        -1,
6582                                                        (U8*)(reginfo->strbeg)),
6583                                                     (U8*) reginfo->strend);
6584                         b2 = (NEXTCHR_IS_EOS)
6585                             ? 0 /* isWORDCHAR_L1('\n') */
6586                             : isWORDCHAR_utf8_safe((U8*)locinput,
6587                                                    (U8*) reginfo->strend);
6588                         match = cBOOL(b1 != b2);
6589                         break;
6590                     }
6591                     case GCB_BOUND:
6592                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6593                             match = TRUE; /* GCB always matches at begin and
6594                                              end */
6595                         }
6596                         else {
6597                             /* Find the gcb values of previous and current
6598                              * chars, then see if is a break point */
6599                             match = isGCB(getGCB_VAL_UTF8(
6600                                                 reghop3((U8*)locinput,
6601                                                         -1,
6602                                                         (U8*)(reginfo->strbeg)),
6603                                                 (U8*) reginfo->strend),
6604                                           getGCB_VAL_UTF8((U8*) locinput,
6605                                                         (U8*) reginfo->strend),
6606                                           (U8*) reginfo->strbeg,
6607                                           (U8*) locinput,
6608                                           utf8_target);
6609                         }
6610                         break;
6611
6612                     case LB_BOUND:
6613                         if (locinput == reginfo->strbeg) {
6614                             match = FALSE;
6615                         }
6616                         else if (NEXTCHR_IS_EOS) {
6617                             match = TRUE;
6618                         }
6619                         else {
6620                             match = isLB(getLB_VAL_UTF8(
6621                                                 reghop3((U8*)locinput,
6622                                                         -1,
6623                                                         (U8*)(reginfo->strbeg)),
6624                                                 (U8*) reginfo->strend),
6625                                           getLB_VAL_UTF8((U8*) locinput,
6626                                                         (U8*) reginfo->strend),
6627                                           (U8*) reginfo->strbeg,
6628                                           (U8*) locinput,
6629                                           (U8*) reginfo->strend,
6630                                           utf8_target);
6631                         }
6632                         break;
6633
6634                     case SB_BOUND: /* Always matches at begin and end */
6635                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6636                             match = TRUE;
6637                         }
6638                         else {
6639                             match = isSB(getSB_VAL_UTF8(
6640                                                 reghop3((U8*)locinput,
6641                                                         -1,
6642                                                         (U8*)(reginfo->strbeg)),
6643                                                 (U8*) reginfo->strend),
6644                                           getSB_VAL_UTF8((U8*) locinput,
6645                                                         (U8*) reginfo->strend),
6646                                           (U8*) reginfo->strbeg,
6647                                           (U8*) locinput,
6648                                           (U8*) reginfo->strend,
6649                                           utf8_target);
6650                         }
6651                         break;
6652
6653                     case WB_BOUND:
6654                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6655                             match = TRUE;
6656                         }
6657                         else {
6658                             match = isWB(WB_UNKNOWN,
6659                                          getWB_VAL_UTF8(
6660                                                 reghop3((U8*)locinput,
6661                                                         -1,
6662                                                         (U8*)(reginfo->strbeg)),
6663                                                 (U8*) reginfo->strend),
6664                                           getWB_VAL_UTF8((U8*) locinput,
6665                                                         (U8*) reginfo->strend),
6666                                           (U8*) reginfo->strbeg,
6667                                           (U8*) locinput,
6668                                           (U8*) reginfo->strend,
6669                                           utf8_target);
6670                         }
6671                         break;
6672                 }
6673             }
6674             else {  /* Not utf8 target */
6675                 switch((bound_type) FLAGS(scan)) {
6676                     case TRADITIONAL_BOUND:
6677                     {
6678                         bool b1, b2;
6679                         b1 = (locinput == reginfo->strbeg)
6680                             ? 0 /* isWORDCHAR_L1('\n') */
6681                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6682                         b2 = (NEXTCHR_IS_EOS)
6683                             ? 0 /* isWORDCHAR_L1('\n') */
6684                             : isWORDCHAR_L1(nextchr);
6685                         match = cBOOL(b1 != b2);
6686                         break;
6687                     }
6688
6689                     case GCB_BOUND:
6690                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6691                             match = TRUE; /* GCB always matches at begin and
6692                                              end */
6693                         }
6694                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6695                                    range */
6696                             match =    UCHARAT(locinput - 1) != '\r'
6697                                     || UCHARAT(locinput) != '\n';
6698                         }
6699                         break;
6700
6701                     case LB_BOUND:
6702                         if (locinput == reginfo->strbeg) {
6703                             match = FALSE;
6704                         }
6705                         else if (NEXTCHR_IS_EOS) {
6706                             match = TRUE;
6707                         }
6708                         else {
6709                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6710                                          getLB_VAL_CP(UCHARAT(locinput)),
6711                                          (U8*) reginfo->strbeg,
6712                                          (U8*) locinput,
6713                                          (U8*) reginfo->strend,
6714                                          utf8_target);
6715                         }
6716                         break;
6717
6718                     case SB_BOUND: /* Always matches at begin and end */
6719                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6720                             match = TRUE;
6721                         }
6722                         else {
6723                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6724                                          getSB_VAL_CP(UCHARAT(locinput)),
6725                                          (U8*) reginfo->strbeg,
6726                                          (U8*) locinput,
6727                                          (U8*) reginfo->strend,
6728                                          utf8_target);
6729                         }
6730                         break;
6731
6732                     case WB_BOUND:
6733                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6734                             match = TRUE;
6735                         }
6736                         else {
6737                             match = isWB(WB_UNKNOWN,
6738                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6739                                          getWB_VAL_CP(UCHARAT(locinput)),
6740                                          (U8*) reginfo->strbeg,
6741                                          (U8*) locinput,
6742                                          (U8*) reginfo->strend,
6743                                          utf8_target);
6744                         }
6745                         break;
6746                 }
6747             }
6748
6749             if (to_complement ^ ! match) {
6750                 sayNO;
6751             }
6752             break;
6753
6754         case ANYOFPOSIXL:
6755         case ANYOFL:  /*  /[abc]/l      */
6756             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6757
6758             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6759             {
6760               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6761             }
6762             /* FALLTHROUGH */
6763         case ANYOFD:  /*   /[abc]/d       */
6764         case ANYOF:  /*   /[abc]/       */
6765             if (NEXTCHR_IS_EOS || locinput >= loceol)
6766                 sayNO;
6767             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
6768                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
6769             {
6770                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
6771                     sayNO;
6772                 }
6773                 locinput++;
6774             }
6775             else {
6776                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6777                                                                    utf8_target))
6778                 {
6779                     sayNO;
6780                 }
6781                 goto increment_locinput;
6782             }
6783             break;
6784
6785         case ANYOFM:
6786             if (   NEXTCHR_IS_EOS
6787                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
6788                 || locinput >= loceol)
6789             {
6790                 sayNO;
6791             }
6792             locinput++; /* ANYOFM is always single byte */
6793             break;
6794
6795         case NANYOFM:
6796             if (   NEXTCHR_IS_EOS
6797                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
6798                 || locinput >= loceol)
6799             {
6800                 sayNO;
6801             }
6802             goto increment_locinput;
6803             break;
6804
6805         case ANYOFH:
6806             if (   ! utf8_target
6807                 ||   NEXTCHR_IS_EOS
6808                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6809                                                                    utf8_target))
6810             {
6811                 sayNO;
6812             }
6813             goto increment_locinput;
6814             break;
6815
6816         case ANYOFHb:
6817             if (   ! utf8_target
6818                 ||   NEXTCHR_IS_EOS
6819                 ||   ANYOF_FLAGS(scan) != (U8) *locinput
6820                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6821                                                                   utf8_target))
6822             {
6823                 sayNO;
6824             }
6825             goto increment_locinput;
6826             break;
6827
6828         /* The argument (FLAGS) to all the POSIX node types is the class number
6829          * */
6830
6831         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6832             to_complement = 1;
6833             /* FALLTHROUGH */
6834
6835         case POSIXL:    /* \w or [:punct:] etc. under /l */
6836             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6837             if (NEXTCHR_IS_EOS || locinput >= loceol)
6838                 sayNO;
6839
6840             /* Use isFOO_lc() for characters within Latin1.  (Note that
6841              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6842              * wouldn't be invariant) */
6843             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6844                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6845                     sayNO;
6846                 }
6847
6848                 locinput++;
6849                 break;
6850             }
6851
6852             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6853                 /* An above Latin-1 code point, or malformed */
6854                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6855                                                        reginfo->strend);
6856                 goto utf8_posix_above_latin1;
6857             }
6858
6859             /* Here is a UTF-8 variant code point below 256 and the target is
6860              * UTF-8 */
6861             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6862                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6863                                             *(locinput + 1))))))
6864             {
6865                 sayNO;
6866             }
6867
6868             goto increment_locinput;
6869
6870         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6871             to_complement = 1;
6872             /* FALLTHROUGH */
6873
6874         case POSIXD:    /* \w or [:punct:] etc. under /d */
6875             if (utf8_target) {
6876                 goto utf8_posix;
6877             }
6878             goto posixa;
6879
6880         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6881
6882             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6883                 sayNO;
6884             }
6885
6886             /* All UTF-8 variants match */
6887             if (! UTF8_IS_INVARIANT(nextchr)) {
6888                 goto increment_locinput;
6889             }
6890
6891             to_complement = 1;
6892             goto join_nposixa;
6893
6894         case POSIXA:    /* \w or [:punct:] etc. under /a */
6895
6896           posixa:
6897             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6898              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6899              * character is a single byte */
6900
6901             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6902                 sayNO;
6903             }
6904
6905           join_nposixa:
6906
6907             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6908                                                                 FLAGS(scan)))))
6909             {
6910                 sayNO;
6911             }
6912
6913             /* Here we are either not in utf8, or we matched a utf8-invariant,
6914              * so the next char is the next byte */
6915             locinput++;
6916             break;
6917
6918         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6919             to_complement = 1;
6920             /* FALLTHROUGH */
6921
6922         case POSIXU:    /* \w or [:punct:] etc. under /u */
6923           utf8_posix:
6924             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6925                 sayNO;
6926             }
6927
6928             /* Use _generic_isCC() for characters within Latin1.  (Note that
6929              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6930              * wouldn't be invariant) */
6931             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6932                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6933                                                            FLAGS(scan)))))
6934                 {
6935                     sayNO;
6936                 }
6937                 locinput++;
6938             }
6939             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6940                 if (! (to_complement
6941                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6942                                                                *(locinput + 1)),
6943                                              FLAGS(scan)))))
6944                 {
6945                     sayNO;
6946                 }
6947                 locinput += 2;
6948             }
6949             else {  /* Handle above Latin-1 code points */
6950               utf8_posix_above_latin1:
6951                 classnum = (_char_class_number) FLAGS(scan);
6952                 switch (classnum) {
6953                     default:
6954                         if (! (to_complement
6955                            ^ cBOOL(_invlist_contains_cp(
6956                                       PL_XPosix_ptrs[classnum],
6957                                       utf8_to_uvchr_buf((U8 *) locinput,
6958                                                         (U8 *) reginfo->strend,
6959                                                         NULL)))))
6960                         {
6961                             sayNO;
6962                         }
6963                         break;
6964                     case _CC_ENUM_SPACE:
6965                         if (! (to_complement
6966                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6967                         {
6968                             sayNO;
6969                         }
6970                         break;
6971                     case _CC_ENUM_BLANK:
6972                         if (! (to_complement
6973                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6974                         {
6975                             sayNO;
6976                         }
6977                         break;
6978                     case _CC_ENUM_XDIGIT:
6979                         if (! (to_complement
6980                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6981                         {
6982                             sayNO;
6983                         }
6984                         break;
6985                     case _CC_ENUM_VERTSPACE:
6986                         if (! (to_complement
6987                                         ^ cBOOL(is_VERTWS_high(locinput))))
6988                         {
6989                             sayNO;
6990                         }
6991                         break;
6992                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6993                     case _CC_ENUM_ASCII:
6994                         if (! to_complement) {
6995                             sayNO;
6996                         }
6997                         break;
6998                 }
6999                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7000             }
7001             break;
7002
7003         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7004                        a Unicode extended Grapheme Cluster */
7005             if (NEXTCHR_IS_EOS || locinput >= loceol)
7006                 sayNO;
7007             if  (! utf8_target) {
7008
7009                 /* Match either CR LF  or '.', as all the other possibilities
7010                  * require utf8 */
7011                 locinput++;         /* Match the . or CR */
7012                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
7013                                        match the LF */
7014                     && locinput <  loceol
7015                     && UCHARAT(locinput) == '\n')
7016                 {
7017                     locinput++;
7018                 }
7019             }
7020             else {
7021
7022                 /* Get the gcb type for the current character */
7023                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7024                                                        (U8*) reginfo->strend);
7025
7026                 /* Then scan through the input until we get to the first
7027                  * character whose type is supposed to be a gcb with the
7028                  * current character.  (There is always a break at the
7029                  * end-of-input) */
7030                 locinput += UTF8SKIP(locinput);
7031                 while (locinput < loceol) {
7032                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7033                                                          (U8*) reginfo->strend);
7034                     if (isGCB(prev_gcb, cur_gcb,
7035                               (U8*) reginfo->strbeg, (U8*) locinput,
7036                               utf8_target))
7037                     {
7038                         break;
7039                     }
7040
7041                     prev_gcb = cur_gcb;
7042                     locinput += UTF8SKIP(locinput);
7043                 }
7044
7045
7046             }
7047             break;
7048
7049         case REFFLN:  /*  /\g{name}/il  */
7050         {   /* The capture buffer cases.  The ones beginning with N for the
7051                named buffers just convert to the equivalent numbered and
7052                pretend they were called as the corresponding numbered buffer
7053                op.  */
7054             /* don't initialize these in the declaration, it makes C++
7055                unhappy */
7056             const char *s;
7057             char type;
7058             re_fold_t folder;
7059             const U8 *fold_array;
7060             UV utf8_fold_flags;
7061
7062             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7063             folder = foldEQ_locale;
7064             fold_array = PL_fold_locale;
7065             type = REFFL;
7066             utf8_fold_flags = FOLDEQ_LOCALE;
7067             goto do_nref;
7068
7069         case REFFAN:  /*  /\g{name}/iaa  */
7070             folder = foldEQ_latin1;
7071             fold_array = PL_fold_latin1;
7072             type = REFFA;
7073             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7074             goto do_nref;
7075
7076         case REFFUN:  /*  /\g{name}/iu  */
7077             folder = foldEQ_latin1;
7078             fold_array = PL_fold_latin1;
7079             type = REFFU;
7080             utf8_fold_flags = 0;
7081             goto do_nref;
7082
7083         case REFFN:  /*  /\g{name}/i  */
7084             folder = foldEQ;
7085             fold_array = PL_fold;
7086             type = REFF;
7087             utf8_fold_flags = 0;
7088             goto do_nref;
7089
7090         case REFN:  /*  /\g{name}/   */
7091             type = REF;
7092             folder = NULL;
7093             fold_array = NULL;
7094             utf8_fold_flags = 0;
7095           do_nref:
7096
7097             /* For the named back references, find the corresponding buffer
7098              * number */
7099             n = reg_check_named_buff_matched(rex,scan);
7100
7101             if ( ! n ) {
7102                 sayNO;
7103             }
7104             goto do_nref_ref_common;
7105
7106         case REFFL:  /*  /\1/il  */
7107             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7108             folder = foldEQ_locale;
7109             fold_array = PL_fold_locale;
7110             utf8_fold_flags = FOLDEQ_LOCALE;
7111             goto do_ref;
7112
7113         case REFFA:  /*  /\1/iaa  */
7114             folder = foldEQ_latin1;
7115             fold_array = PL_fold_latin1;
7116             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7117             goto do_ref;
7118
7119         case REFFU:  /*  /\1/iu  */
7120             folder = foldEQ_latin1;
7121             fold_array = PL_fold_latin1;
7122             utf8_fold_flags = 0;
7123             goto do_ref;
7124
7125         case REFF:  /*  /\1/i  */
7126             folder = foldEQ;
7127             fold_array = PL_fold;
7128             utf8_fold_flags = 0;
7129             goto do_ref;
7130
7131         case REF:  /*  /\1/    */
7132             folder = NULL;
7133             fold_array = NULL;
7134             utf8_fold_flags = 0;
7135
7136           do_ref:
7137             type = OP(scan);
7138             n = ARG(scan);  /* which paren pair */
7139
7140           do_nref_ref_common:
7141             ln = rex->offs[n].start;
7142             endref = rex->offs[n].end;
7143             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7144             if (rex->lastparen < n || ln == -1 || endref == -1)
7145                 sayNO;                  /* Do not match unless seen CLOSEn. */
7146             if (ln == endref)
7147                 break;
7148
7149             s = reginfo->strbeg + ln;
7150             if (type != REF     /* REF can do byte comparison */
7151                 && (utf8_target || type == REFFU || type == REFFL))
7152             {
7153                 char * limit = loceol;
7154
7155                 /* This call case insensitively compares the entire buffer
7156                     * at s, with the current input starting at locinput, but
7157                     * not going off the end given by loceol, and
7158                     * returns in <limit> upon success, how much of the
7159                     * current input was matched */
7160                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7161                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7162                 {
7163                     sayNO;
7164                 }
7165                 locinput = limit;
7166                 break;
7167             }
7168
7169             /* Not utf8:  Inline the first character, for speed. */
7170             if ( ! NEXTCHR_IS_EOS
7171                 && locinput < loceol
7172                 && UCHARAT(s) != nextchr
7173                 && (   type == REF
7174                     || UCHARAT(s) != fold_array[nextchr]))
7175             {
7176                 sayNO;
7177             }
7178             ln = endref - ln;
7179             if (locinput + ln > loceol)
7180                 sayNO;
7181             if (ln > 1 && (type == REF
7182                            ? memNE(s, locinput, ln)
7183                            : ! folder(locinput, s, ln)))
7184                 sayNO;
7185             locinput += ln;
7186             break;
7187         }
7188
7189         case NOTHING: /* null op; e.g. the 'nothing' following
7190                        * the '*' in m{(a+|b)*}' */
7191             break;
7192         case TAIL: /* placeholder while compiling (A|B|C) */
7193             break;
7194
7195 #undef  ST
7196 #define ST st->u.eval
7197 #define CUR_EVAL cur_eval->u.eval
7198
7199         {
7200             SV *ret;
7201             REGEXP *re_sv;
7202             regexp *re;
7203             regexp_internal *rei;
7204             regnode *startpoint;
7205             U32 arg;
7206
7207         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7208             arg= (U32)ARG(scan);
7209             if (cur_eval && cur_eval->locinput == locinput) {
7210                 if ( ++nochange_depth > max_nochange_depth )
7211                     Perl_croak(aTHX_
7212                         "Pattern subroutine nesting without pos change"
7213                         " exceeded limit in regex");
7214             } else {
7215                 nochange_depth = 0;
7216             }
7217             re_sv = rex_sv;
7218             re = rex;
7219             rei = rexi;
7220             startpoint = scan + ARG2L(scan);
7221             EVAL_CLOSE_PAREN_SET( st, arg );
7222             /* Detect infinite recursion
7223              *
7224              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7225              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7226              * So we track the position in the string we are at each time
7227              * we recurse and if we try to enter the same routine twice from
7228              * the same position we throw an error.
7229              */
7230             if ( rex->recurse_locinput[arg] == locinput ) {
7231                 /* FIXME: we should show the regop that is failing as part
7232                  * of the error message. */
7233                 Perl_croak(aTHX_ "Infinite recursion in regex");
7234             } else {
7235                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7236                 rex->recurse_locinput[arg]= locinput;
7237
7238                 DEBUG_r({
7239                     GET_RE_DEBUG_FLAGS_DECL;
7240                     DEBUG_STACK_r({
7241                         Perl_re_exec_indentf( aTHX_
7242                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7243                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7244                         );
7245                     });
7246                 });
7247             }
7248
7249             /* Save all the positions seen so far. */
7250             ST.cp = regcppush(rex, 0, maxopenparen);
7251             REGCP_SET(ST.lastcp);
7252
7253             /* and then jump to the code we share with EVAL */
7254             goto eval_recurse_doit;
7255             /* NOTREACHED */
7256
7257         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7258             if (cur_eval && cur_eval->locinput==locinput) {
7259                 if ( ++nochange_depth > max_nochange_depth )
7260                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7261             } else {
7262                 nochange_depth = 0;
7263             }
7264             {
7265                 /* execute the code in the {...} */
7266
7267                 dSP;
7268                 IV before;
7269                 OP * const oop = PL_op;
7270                 COP * const ocurcop = PL_curcop;
7271                 OP *nop;
7272                 CV *newcv;
7273
7274                 /* save *all* paren positions */
7275                 regcppush(rex, 0, maxopenparen);
7276                 REGCP_SET(ST.lastcp);
7277
7278                 if (!caller_cv)
7279                     caller_cv = find_runcv(NULL);
7280
7281                 n = ARG(scan);
7282
7283                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7284                     newcv = (ReANY(
7285                                     (REGEXP*)(rexi->data->data[n])
7286                             ))->qr_anoncv;
7287                     nop = (OP*)rexi->data->data[n+1];
7288                 }
7289                 else if (rexi->data->what[n] == 'l') { /* literal code */
7290                     newcv = caller_cv;
7291                     nop = (OP*)rexi->data->data[n];
7292                     assert(CvDEPTH(newcv));
7293                 }
7294                 else {
7295                     /* literal with own CV */
7296                     assert(rexi->data->what[n] == 'L');
7297                     newcv = rex->qr_anoncv;
7298                     nop = (OP*)rexi->data->data[n];
7299                 }
7300
7301                 /* Some notes about MULTICALL and the context and save stacks.
7302                  *
7303                  * In something like
7304                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7305                  * since codeblocks don't introduce a new scope (so that
7306                  * local() etc accumulate), at the end of a successful
7307                  * match there will be a SAVEt_CLEARSV on the savestack
7308                  * for each of $x, $y, $z. If the three code blocks above
7309                  * happen to have come from different CVs (e.g. via
7310                  * embedded qr//s), then we must ensure that during any
7311                  * savestack unwinding, PL_comppad always points to the
7312                  * right pad at each moment. We achieve this by
7313                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7314                  * there is a change of pad.
7315                  * In theory whenever we call a code block, we should
7316                  * push a CXt_SUB context, then pop it on return from
7317                  * that code block. This causes a bit of an issue in that
7318                  * normally popping a context also clears the savestack
7319                  * back to cx->blk_oldsaveix, but here we specifically
7320                  * don't want to clear the save stack on exit from the
7321                  * code block.
7322                  * Also for efficiency we don't want to keep pushing and
7323                  * popping the single SUB context as we backtrack etc.
7324                  * So instead, we push a single context the first time
7325                  * we need, it, then hang onto it until the end of this
7326                  * function. Whenever we encounter a new code block, we
7327                  * update the CV etc if that's changed. During the times
7328                  * in this function where we're not executing a code
7329                  * block, having the SUB context still there is a bit
7330                  * naughty - but we hope that no-one notices.
7331                  * When the SUB context is initially pushed, we fake up
7332                  * cx->blk_oldsaveix to be as if we'd pushed this context
7333                  * on first entry to S_regmatch rather than at some random
7334                  * point during the regexe execution. That way if we
7335                  * croak, popping the context stack will ensure that
7336                  * *everything* SAVEd by this function is undone and then
7337                  * the context popped, rather than e.g., popping the
7338                  * context (and restoring the original PL_comppad) then
7339                  * popping more of the savestack and restoring a bad
7340                  * PL_comppad.
7341                  */
7342
7343                 /* If this is the first EVAL, push a MULTICALL. On
7344                  * subsequent calls, if we're executing a different CV, or
7345                  * if PL_comppad has got messed up from backtracking
7346                  * through SAVECOMPPADs, then refresh the context.
7347                  */
7348                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7349                 {
7350                     U8 flags = (CXp_SUB_RE |
7351                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7352                     SAVECOMPPAD();
7353                     if (last_pushed_cv) {
7354                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7355                     }
7356                     else {
7357                         PUSH_MULTICALL_FLAGS(newcv, flags);
7358                     }
7359                     /* see notes above */
7360                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7361
7362                     last_pushed_cv = newcv;
7363                 }
7364                 else {
7365                     /* these assignments are just to silence compiler
7366                      * warnings */
7367                     multicall_cop = NULL;
7368                 }
7369                 last_pad = PL_comppad;
7370
7371                 /* the initial nextstate you would normally execute
7372                  * at the start of an eval (which would cause error
7373                  * messages to come from the eval), may be optimised
7374                  * away from the execution path in the regex code blocks;
7375                  * so manually set PL_curcop to it initially */
7376                 {
7377                     OP *o = cUNOPx(nop)->op_first;
7378                     assert(o->op_type == OP_NULL);
7379                     if (o->op_targ == OP_SCOPE) {
7380                         o = cUNOPo->op_first;
7381                     }
7382                     else {
7383                         assert(o->op_targ == OP_LEAVE);
7384                         o = cUNOPo->op_first;
7385                         assert(o->op_type == OP_ENTER);
7386                         o = OpSIBLING(o);
7387                     }
7388
7389                     if (o->op_type != OP_STUB) {
7390                         assert(    o->op_type == OP_NEXTSTATE
7391                                 || o->op_type == OP_DBSTATE
7392                                 || (o->op_type == OP_NULL
7393                                     &&  (  o->op_targ == OP_NEXTSTATE
7394                                         || o->op_targ == OP_DBSTATE
7395                                         )
7396                                     )
7397                         );
7398                         PL_curcop = (COP*)o;
7399                     }
7400                 }
7401                 nop = nop->op_next;
7402
7403                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7404                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7405
7406                 rex->offs[0].end = locinput - reginfo->strbeg;
7407                 if (reginfo->info_aux_eval->pos_magic)
7408                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7409                                   reginfo->sv, reginfo->strbeg,
7410                                   locinput - reginfo->strbeg);
7411
7412                 if (sv_yes_mark) {
7413                     SV *sv_mrk = get_sv("REGMARK", 1);
7414                     sv_setsv(sv_mrk, sv_yes_mark);
7415                 }
7416
7417                 /* we don't use MULTICALL here as we want to call the
7418                  * first op of the block of interest, rather than the
7419                  * first op of the sub. Also, we don't want to free
7420                  * the savestack frame */
7421                 before = (IV)(SP-PL_stack_base);
7422                 PL_op = nop;
7423                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7424                 SPAGAIN;
7425                 if ((IV)(SP-PL_stack_base) == before)
7426                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7427                 else {
7428                     ret = POPs;
7429                     PUTBACK;
7430                 }
7431
7432                 /* before restoring everything, evaluate the returned
7433                  * value, so that 'uninit' warnings don't use the wrong
7434                  * PL_op or pad. Also need to process any magic vars
7435                  * (e.g. $1) *before* parentheses are restored */
7436
7437                 PL_op = NULL;
7438
7439                 re_sv = NULL;
7440                 if (logical == 0) {       /*   (?{})/   */
7441                     SV *replsv = save_scalar(PL_replgv);
7442                     sv_setsv(replsv, ret); /* $^R */
7443                     SvSETMAGIC(replsv);
7444                 }
7445                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7446                     sw = cBOOL(SvTRUE_NN(ret));
7447                     logical = 0;
7448                 }
7449                 else {                   /*  /(??{})  */
7450                     /*  if its overloaded, let the regex compiler handle
7451                      *  it; otherwise extract regex, or stringify  */
7452                     if (SvGMAGICAL(ret))
7453                         ret = sv_mortalcopy(ret);
7454                     if (!SvAMAGIC(ret)) {
7455                         SV *sv = ret;
7456                         if (SvROK(sv))
7457                             sv = SvRV(sv);
7458                         if (SvTYPE(sv) == SVt_REGEXP)
7459                             re_sv = (REGEXP*) sv;
7460                         else if (SvSMAGICAL(ret)) {
7461                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7462                             if (mg)
7463                                 re_sv = (REGEXP *) mg->mg_obj;
7464                         }
7465
7466                         /* force any undef warnings here */
7467                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7468                             ret = sv_mortalcopy(ret);
7469                             (void) SvPV_force_nolen(ret);
7470                         }
7471                     }
7472
7473                 }
7474
7475                 /* *** Note that at this point we don't restore
7476                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7477                  * be used again soon. This is safe as long as nothing
7478                  * in the regexp code uses the pad ! */
7479                 PL_op = oop;
7480                 PL_curcop = ocurcop;
7481                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7482                 PL_curpm_under = PL_curpm;
7483                 PL_curpm = PL_reg_curpm;
7484
7485                 if (logical != 2) {
7486                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
7487                                     script_run_begin);
7488                     /* NOTREACHED */
7489                 }
7490             }
7491
7492                 /* only /(??{})/  from now on */
7493                 logical = 0;
7494                 {
7495                     /* extract RE object from returned value; compiling if
7496                      * necessary */
7497
7498                     if (re_sv) {
7499                         re_sv = reg_temp_copy(NULL, re_sv);
7500                     }
7501                     else {
7502                         U32 pm_flags = 0;
7503
7504                         if (SvUTF8(ret) && IN_BYTES) {
7505                             /* In use 'bytes': make a copy of the octet
7506                              * sequence, but without the flag on */
7507                             STRLEN len;
7508                             const char *const p = SvPV(ret, len);
7509                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7510                         }
7511                         if (rex->intflags & PREGf_USE_RE_EVAL)
7512                             pm_flags |= PMf_USE_RE_EVAL;
7513
7514                         /* if we got here, it should be an engine which
7515                          * supports compiling code blocks and stuff */
7516                         assert(rex->engine && rex->engine->op_comp);
7517                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7518                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7519                                     rex->engine, NULL, NULL,
7520                                     /* copy /msixn etc to inner pattern */
7521                                     ARG2L(scan),
7522                                     pm_flags);
7523
7524                         if (!(SvFLAGS(ret)
7525                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7526                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7527                             /* This isn't a first class regexp. Instead, it's
7528                                caching a regexp onto an existing, Perl visible
7529                                scalar.  */
7530                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7531                         }
7532                     }
7533                     SAVEFREESV(re_sv);
7534                     re = ReANY(re_sv);
7535                 }
7536                 RXp_MATCH_COPIED_off(re);
7537                 re->subbeg = rex->subbeg;
7538                 re->sublen = rex->sublen;
7539                 re->suboffset = rex->suboffset;
7540                 re->subcoffset = rex->subcoffset;
7541                 re->lastparen = 0;
7542                 re->lastcloseparen = 0;
7543                 rei = RXi_GET(re);
7544                 DEBUG_EXECUTE_r(
7545                     debug_start_match(re_sv, utf8_target, locinput,
7546                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7547                 );
7548                 startpoint = rei->program + 1;
7549                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7550                                              * close_paren only for GOSUB */
7551                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7552                 /* Save all the seen positions so far. */
7553                 ST.cp = regcppush(rex, 0, maxopenparen);
7554                 REGCP_SET(ST.lastcp);
7555                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7556                 maxopenparen = 0;
7557                 /* run the pattern returned from (??{...}) */
7558
7559               eval_recurse_doit: /* Share code with GOSUB below this line
7560                             * At this point we expect the stack context to be
7561                             * set up correctly */
7562
7563                 /* invalidate the S-L poscache. We're now executing a
7564                  * different set of WHILEM ops (and their associated
7565                  * indexes) against the same string, so the bits in the
7566                  * cache are meaningless. Setting maxiter to zero forces
7567                  * the cache to be invalidated and zeroed before reuse.
7568                  * XXX This is too dramatic a measure. Ideally we should
7569                  * save the old cache and restore when running the outer
7570                  * pattern again */
7571                 reginfo->poscache_maxiter = 0;
7572
7573                 /* the new regexp might have a different is_utf8_pat than we do */
7574                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7575
7576                 ST.prev_rex = rex_sv;
7577                 ST.prev_curlyx = cur_curlyx;
7578                 rex_sv = re_sv;
7579                 SET_reg_curpm(rex_sv);
7580                 rex = re;
7581                 rexi = rei;
7582                 cur_curlyx = NULL;
7583                 ST.B = next;
7584                 ST.prev_eval = cur_eval;
7585                 cur_eval = st;
7586                 /* now continue from first node in postoned RE */
7587                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
7588                                     loceol, script_run_begin);
7589                 NOT_REACHED; /* NOTREACHED */
7590         }
7591
7592         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7593             /* note: this is called twice; first after popping B, then A */
7594             DEBUG_STACK_r({
7595                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7596                     depth, cur_eval, ST.prev_eval);
7597             });
7598
7599 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7600             if ( cur_eval && CUR_EVAL.close_paren ) {\
7601                 DEBUG_STACK_r({ \
7602                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7603                         depth,    \
7604                         CUR_EVAL.close_paren - 1,\
7605                         cur_eval, \
7606                         VAL);     \
7607                 });               \
7608                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7609             }
7610
7611             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7612
7613             rex_sv = ST.prev_rex;
7614             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7615             SET_reg_curpm(rex_sv);
7616             rex = ReANY(rex_sv);
7617             rexi = RXi_GET(rex);
7618             {
7619                 /* preserve $^R across LEAVE's. See Bug 121070. */
7620                 SV *save_sv= GvSV(PL_replgv);
7621                 SV *replsv;
7622                 SvREFCNT_inc(save_sv);
7623                 regcpblow(ST.cp); /* LEAVE in disguise */
7624                 /* don't move this initialization up */
7625                 replsv = GvSV(PL_replgv);
7626                 sv_setsv(replsv, save_sv);
7627                 SvSETMAGIC(replsv);
7628                 SvREFCNT_dec(save_sv);
7629             }
7630             cur_eval = ST.prev_eval;
7631             cur_curlyx = ST.prev_curlyx;
7632
7633             /* Invalidate cache. See "invalidate" comment above. */
7634             reginfo->poscache_maxiter = 0;
7635             if ( nochange_depth )
7636                 nochange_depth--;
7637
7638             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7639             sayYES;
7640
7641
7642         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7643             REGCP_UNWIND(ST.lastcp);
7644             sayNO;
7645
7646         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7647             /* note: this is called twice; first after popping B, then A */
7648             DEBUG_STACK_r({
7649                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7650                     depth, cur_eval, ST.prev_eval);
7651             });
7652
7653             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7654
7655             rex_sv = ST.prev_rex;
7656             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7657             SET_reg_curpm(rex_sv);
7658             rex = ReANY(rex_sv);
7659             rexi = RXi_GET(rex);
7660
7661             REGCP_UNWIND(ST.lastcp);
7662             regcppop(rex, &maxopenparen);
7663             cur_eval = ST.prev_eval;
7664             cur_curlyx = ST.prev_curlyx;
7665
7666             /* Invalidate cache. See "invalidate" comment above. */
7667             reginfo->poscache_maxiter = 0;
7668             if ( nochange_depth )
7669                 nochange_depth--;
7670
7671             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7672             sayNO_SILENT;
7673 #undef ST
7674
7675         case OPEN: /*  (  */
7676             n = ARG(scan);  /* which paren pair */
7677             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7678             if (n > maxopenparen)
7679                 maxopenparen = n;
7680             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7681                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7682                 depth,
7683                 PTR2UV(rex),
7684                 PTR2UV(rex->offs),
7685                 (UV)n,
7686                 (IV)rex->offs[n].start_tmp,
7687                 (UV)maxopenparen
7688             ));
7689             lastopen = n;
7690             break;
7691
7692         case SROPEN: /*  (*SCRIPT_RUN:  */
7693             script_run_begin = (U8 *) locinput;
7694             break;
7695
7696
7697         case CLOSE:  /*  )  */
7698             n = ARG(scan);  /* which paren pair */
7699             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7700                              locinput - reginfo->strbeg);
7701             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7702                 goto fake_end;
7703
7704             break;
7705
7706         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7707
7708             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7709             {
7710                 sayNO;
7711             }
7712
7713             break;
7714
7715
7716         case ACCEPT:  /*  (*ACCEPT)  */
7717             if (scan->flags)
7718                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7719             if (ARG2L(scan)){
7720                 regnode *cursor;
7721                 for (cursor=scan;
7722                      cursor && OP(cursor)!=END;
7723                      cursor=regnext(cursor))
7724                 {
7725                     if ( OP(cursor)==CLOSE ){
7726                         n = ARG(cursor);
7727                         if ( n <= lastopen ) {
7728                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7729                                              locinput - reginfo->strbeg);
7730                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7731                                 break;
7732                         }
7733                     }
7734                 }
7735             }
7736             goto fake_end;
7737             /* NOTREACHED */
7738
7739         case GROUPP:  /*  (?(1))  */
7740             n = ARG(scan);  /* which paren pair */
7741             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7742             break;
7743
7744         case GROUPPN:  /*  (?(<name>))  */
7745             /* reg_check_named_buff_matched returns 0 for no match */
7746             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7747             break;
7748
7749         case INSUBP:   /*  (?(R))  */
7750             n = ARG(scan);
7751             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7752              * of SCAN is already set up as matches a eval.close_paren */
7753             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7754             break;
7755
7756         case DEFINEP:  /*  (?(DEFINE))  */
7757             sw = 0;
7758             break;
7759
7760         case IFTHEN:   /*  (?(cond)A|B)  */
7761             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7762             if (sw)
7763                 next = NEXTOPER(NEXTOPER(scan));
7764             else {
7765                 next = scan + ARG(scan);
7766                 if (OP(next) == IFTHEN) /* Fake one. */
7767                     next = NEXTOPER(NEXTOPER(next));
7768             }
7769             break;
7770
7771         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7772             logical = scan->flags;
7773             break;
7774
7775 /*******************************************************************
7776
7777 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7778 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7779 STAR/PLUS/CURLY/CURLYN are used instead.)
7780
7781 A*B is compiled as <CURLYX><A><WHILEM><B>
7782
7783 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7784 state, which contains the current count, initialised to -1. It also sets
7785 cur_curlyx to point to this state, with any previous value saved in the
7786 state block.
7787
7788 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7789 since the pattern may possibly match zero times (i.e. it's a while {} loop
7790 rather than a do {} while loop).
7791
7792 Each entry to WHILEM represents a successful match of A. The count in the
7793 CURLYX block is incremented, another WHILEM state is pushed, and execution
7794 passes to A or B depending on greediness and the current count.
7795
7796 For example, if matching against the string a1a2a3b (where the aN are
7797 substrings that match /A/), then the match progresses as follows: (the
7798 pushed states are interspersed with the bits of strings matched so far):
7799
7800     <CURLYX cnt=-1>
7801     <CURLYX cnt=0><WHILEM>
7802     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7803     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7804     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7805     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7806
7807 (Contrast this with something like CURLYM, which maintains only a single
7808 backtrack state:
7809
7810     <CURLYM cnt=0> a1
7811     a1 <CURLYM cnt=1> a2
7812     a1 a2 <CURLYM cnt=2> a3
7813     a1 a2 a3 <CURLYM cnt=3> b
7814 )
7815
7816 Each WHILEM state block marks a point to backtrack to upon partial failure
7817 of A or B, and also contains some minor state data related to that
7818 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7819 overall state, such as the count, and pointers to the A and B ops.
7820
7821 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7822 must always point to the *current* CURLYX block, the rules are:
7823
7824 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7825 and set cur_curlyx to point the new block.
7826
7827 When popping the CURLYX block after a successful or unsuccessful match,
7828 restore the previous cur_curlyx.
7829
7830 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7831 to the outer one saved in the CURLYX block.
7832
7833 When popping the WHILEM block after a successful or unsuccessful B match,
7834 restore the previous cur_curlyx.
7835
7836 Here's an example for the pattern (AI* BI)*BO
7837 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7838
7839 cur_
7840 curlyx backtrack stack
7841 ------ ---------------
7842 NULL
7843 CO     <CO prev=NULL> <WO>
7844 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7845 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7846 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7847
7848 At this point the pattern succeeds, and we work back down the stack to
7849 clean up, restoring as we go:
7850
7851 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7852 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7853 CO     <CO prev=NULL> <WO>
7854 NULL
7855
7856 *******************************************************************/
7857
7858 #define ST st->u.curlyx
7859
7860         case CURLYX:    /* start of /A*B/  (for complex A) */
7861         {
7862             /* No need to save/restore up to this paren */
7863             I32 parenfloor = scan->flags;
7864
7865             assert(next); /* keep Coverity happy */
7866             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7867                 next += ARG(next);
7868
7869             /* XXXX Probably it is better to teach regpush to support
7870                parenfloor > maxopenparen ... */
7871             if (parenfloor > (I32)rex->lastparen)
7872                 parenfloor = rex->lastparen; /* Pessimization... */
7873
7874             ST.prev_curlyx= cur_curlyx;
7875             cur_curlyx = st;
7876             ST.cp = PL_savestack_ix;
7877
7878             /* these fields contain the state of the current curly.
7879              * they are accessed by subsequent WHILEMs */
7880             ST.parenfloor = parenfloor;
7881             ST.me = scan;
7882             ST.B = next;
7883             ST.minmod = minmod;
7884             minmod = 0;
7885             ST.count = -1;      /* this will be updated by WHILEM */
7886             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7887
7888             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
7889                                 script_run_begin);
7890             NOT_REACHED; /* NOTREACHED */
7891         }
7892
7893         case CURLYX_end: /* just finished matching all of A*B */
7894             cur_curlyx = ST.prev_curlyx;
7895             sayYES;
7896             NOT_REACHED; /* NOTREACHED */
7897
7898         case CURLYX_end_fail: /* just failed to match all of A*B */
7899             regcpblow(ST.cp);
7900             cur_curlyx = ST.prev_curlyx;
7901             sayNO;
7902             NOT_REACHED; /* NOTREACHED */
7903
7904
7905 #undef ST
7906 #define ST st->u.whilem
7907
7908         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7909         {
7910             /* see the discussion above about CURLYX/WHILEM */
7911             I32 n;
7912             int min, max;
7913             regnode *A;
7914
7915             assert(cur_curlyx); /* keep Coverity happy */
7916
7917             min = ARG1(cur_curlyx->u.curlyx.me);
7918             max = ARG2(cur_curlyx->u.curlyx.me);
7919             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7920             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7921             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7922             ST.cache_offset = 0;
7923             ST.cache_mask = 0;
7924
7925
7926             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7927                   depth, (long)n, min, max)
7928             );
7929
7930             /* First just match a string of min A's. */
7931
7932             if (n < min) {
7933                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7934                 cur_curlyx->u.curlyx.lastloc = locinput;
7935                 REGCP_SET(ST.lastcp);
7936
7937                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
7938                                 script_run_begin);
7939                 NOT_REACHED; /* NOTREACHED */
7940             }
7941
7942             /* If degenerate A matches "", assume A done. */
7943
7944             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7945                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7946                    depth)
7947                 );
7948                 goto do_whilem_B_max;
7949             }
7950
7951             /* super-linear cache processing.
7952              *
7953              * The idea here is that for certain types of CURLYX/WHILEM -
7954              * principally those whose upper bound is infinity (and
7955              * excluding regexes that have things like \1 and other very
7956              * non-regular expresssiony things), then if a pattern like
7957              * /....A*.../ fails and we backtrack to the WHILEM, then we
7958              * make a note that this particular WHILEM op was at string
7959              * position 47 (say) when the rest of pattern failed. Then, if
7960              * we ever find ourselves back at that WHILEM, and at string
7961              * position 47 again, we can just fail immediately rather than
7962              * running the rest of the pattern again.
7963              *
7964              * This is very handy when patterns start to go
7965              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7966              * with a combinatorial explosion of backtracking.
7967              *
7968              * The cache is implemented as a bit array, with one bit per
7969              * string byte position per WHILEM op (up to 16) - so its
7970              * between 0.25 and 2x the string size.
7971              *
7972              * To avoid allocating a poscache buffer every time, we do an
7973              * initially countdown; only after we have  executed a WHILEM
7974              * op (string-length x #WHILEMs) times do we allocate the
7975              * cache.
7976              *
7977              * The top 4 bits of scan->flags byte say how many different
7978              * relevant CURLLYX/WHILEM op pairs there are, while the
7979              * bottom 4-bits is the identifying index number of this
7980              * WHILEM.
7981              */
7982
7983             if (scan->flags) {
7984
7985                 if (!reginfo->poscache_maxiter) {
7986                     /* start the countdown: Postpone detection until we
7987                      * know the match is not *that* much linear. */
7988                     reginfo->poscache_maxiter
7989                         =    (reginfo->strend - reginfo->strbeg + 1)
7990                            * (scan->flags>>4);
7991                     /* possible overflow for long strings and many CURLYX's */
7992                     if (reginfo->poscache_maxiter < 0)
7993                         reginfo->poscache_maxiter = I32_MAX;
7994                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7995                 }
7996
7997                 if (reginfo->poscache_iter-- == 0) {
7998                     /* initialise cache */
7999                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8000                     regmatch_info_aux *const aux = reginfo->info_aux;
8001                     if (aux->poscache) {
8002                         if ((SSize_t)reginfo->poscache_size < size) {
8003                             Renew(aux->poscache, size, char);
8004                             reginfo->poscache_size = size;
8005                         }
8006                         Zero(aux->poscache, size, char);
8007                     }
8008                     else {
8009                         reginfo->poscache_size = size;
8010                         Newxz(aux->poscache, size, char);
8011                     }
8012                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8013       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8014                               PL_colors[4], PL_colors[5])
8015                     );
8016                 }
8017
8018                 if (reginfo->poscache_iter < 0) {
8019                     /* have we already failed at this position? */
8020                     SSize_t offset, mask;
8021
8022                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8023                     offset  = (scan->flags & 0xf) - 1
8024                                 +   (locinput - reginfo->strbeg)
8025                                   * (scan->flags>>4);
8026                     mask    = 1 << (offset % 8);
8027                     offset /= 8;
8028                     if (reginfo->info_aux->poscache[offset] & mask) {
8029                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8030                             depth)
8031                         );
8032                         cur_curlyx->u.curlyx.count--;
8033                         sayNO; /* cache records failure */
8034                     }
8035                     ST.cache_offset = offset;
8036                     ST.cache_mask   = mask;
8037                 }
8038             }
8039
8040             /* Prefer B over A for minimal matching. */
8041
8042             if (cur_curlyx->u.curlyx.minmod) {
8043                 ST.save_curlyx = cur_curlyx;
8044                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8045                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8046                                     locinput, loceol, script_run_begin);
8047                 NOT_REACHED; /* NOTREACHED */
8048             }
8049
8050             /* Prefer A over B for maximal matching. */
8051
8052             if (n < max) { /* More greed allowed? */
8053                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8054                             maxopenparen);
8055                 cur_curlyx->u.curlyx.lastloc = locinput;
8056                 REGCP_SET(ST.lastcp);
8057                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8058                                 script_run_begin);
8059                 NOT_REACHED; /* NOTREACHED */
8060             }
8061             goto do_whilem_B_max;
8062         }
8063         NOT_REACHED; /* NOTREACHED */
8064
8065         case WHILEM_B_min: /* just matched B in a minimal match */
8066         case WHILEM_B_max: /* just matched B in a maximal match */
8067             cur_curlyx = ST.save_curlyx;
8068             sayYES;
8069             NOT_REACHED; /* NOTREACHED */
8070
8071         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8072             cur_curlyx = ST.save_curlyx;
8073             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8074             cur_curlyx->u.curlyx.count--;
8075             CACHEsayNO;
8076             NOT_REACHED; /* NOTREACHED */
8077
8078         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8079             /* FALLTHROUGH */
8080         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8081             REGCP_UNWIND(ST.lastcp);
8082             regcppop(rex, &maxopenparen);
8083             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8084             cur_curlyx->u.curlyx.count--;
8085             CACHEsayNO;
8086             NOT_REACHED; /* NOTREACHED */
8087
8088         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8089             REGCP_UNWIND(ST.lastcp);
8090             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8091             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8092                 depth)
8093             );
8094           do_whilem_B_max:
8095             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8096                 && ckWARN(WARN_REGEXP)
8097                 && !reginfo->warned)
8098             {
8099                 reginfo->warned = TRUE;
8100                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8101                      "Complex regular subexpression recursion limit (%d) "
8102                      "exceeded",
8103                      REG_INFTY - 1);
8104             }
8105
8106             /* now try B */
8107             ST.save_curlyx = cur_curlyx;
8108             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8109             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8110                                 locinput, loceol, script_run_begin);
8111             NOT_REACHED; /* NOTREACHED */
8112
8113         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8114             cur_curlyx = ST.save_curlyx;
8115
8116             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8117                 /* Maximum greed exceeded */
8118                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8119                     && ckWARN(WARN_REGEXP)
8120                     && !reginfo->warned)
8121                 {
8122                     reginfo->warned     = TRUE;
8123                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8124                         "Complex regular subexpression recursion "
8125                         "limit (%d) exceeded",
8126                         REG_INFTY - 1);
8127                 }
8128                 cur_curlyx->u.curlyx.count--;
8129                 CACHEsayNO;
8130             }
8131
8132             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8133             );
8134             /* Try grabbing another A and see if it helps. */
8135             cur_curlyx->u.curlyx.lastloc = locinput;
8136             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8137                             maxopenparen);
8138             REGCP_SET(ST.lastcp);
8139             PUSH_STATE_GOTO(WHILEM_A_min,
8140                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8141                 locinput, loceol, script_run_begin);
8142             NOT_REACHED; /* NOTREACHED */
8143
8144 #undef  ST
8145 #define ST st->u.branch
8146
8147         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8148             next = scan + ARG(scan);
8149             if (next == scan)
8150                 next = NULL;
8151             scan = NEXTOPER(scan);
8152             /* FALLTHROUGH */
8153
8154         case BRANCH:        /*  /(...|A|...)/ */
8155             scan = NEXTOPER(scan); /* scan now points to inner node */
8156             ST.lastparen = rex->lastparen;
8157             ST.lastcloseparen = rex->lastcloseparen;
8158             ST.next_branch = next;
8159             REGCP_SET(ST.cp);
8160
8161             /* Now go into the branch */
8162             if (has_cutgroup) {
8163                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8164                                     script_run_begin);
8165             } else {
8166                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8167                                 script_run_begin);
8168             }
8169             NOT_REACHED; /* NOTREACHED */
8170
8171         case CUTGROUP:  /*  /(*THEN)/  */
8172             sv_yes_mark = st->u.mark.mark_name = scan->flags
8173                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8174                 : NULL;
8175             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8176                             script_run_begin);
8177             NOT_REACHED; /* NOTREACHED */
8178
8179         case CUTGROUP_next_fail:
8180             do_cutgroup = 1;
8181             no_final = 1;
8182             if (st->u.mark.mark_name)
8183                 sv_commit = st->u.mark.mark_name;
8184             sayNO;
8185             NOT_REACHED; /* NOTREACHED */
8186
8187         case BRANCH_next:
8188             sayYES;
8189             NOT_REACHED; /* NOTREACHED */
8190
8191         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8192             if (do_cutgroup) {
8193                 do_cutgroup = 0;
8194                 no_final = 0;
8195             }
8196             REGCP_UNWIND(ST.cp);
8197             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8198             scan = ST.next_branch;
8199             /* no more branches? */
8200             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8201                 DEBUG_EXECUTE_r({
8202                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8203                         depth,
8204                         PL_colors[4],
8205                         PL_colors[5] );
8206                 });
8207                 sayNO_SILENT;
8208             }
8209             continue; /* execute next BRANCH[J] op */
8210             /* NOTREACHED */
8211
8212         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8213             minmod = 1;
8214             break;
8215
8216 #undef  ST
8217 #define ST st->u.curlym
8218
8219         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8220
8221             /* This is an optimisation of CURLYX that enables us to push
8222              * only a single backtracking state, no matter how many matches
8223              * there are in {m,n}. It relies on the pattern being constant
8224              * length, with no parens to influence future backrefs
8225              */
8226
8227             ST.me = scan;
8228             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8229
8230             ST.lastparen      = rex->lastparen;
8231             ST.lastcloseparen = rex->lastcloseparen;
8232
8233             /* if paren positive, emulate an OPEN/CLOSE around A */
8234             if (ST.me->flags) {
8235                 U32 paren = ST.me->flags;
8236                 if (paren > maxopenparen)
8237                     maxopenparen = paren;
8238                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8239             }
8240             ST.A = scan;
8241             ST.B = next;
8242             ST.alen = 0;
8243             ST.count = 0;
8244             ST.minmod = minmod;
8245             minmod = 0;
8246             ST.c1 = CHRTEST_UNINIT;
8247             REGCP_SET(ST.cp);
8248
8249             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8250                 goto curlym_do_B;
8251
8252           curlym_do_A: /* execute the A in /A{m,n}B/  */
8253             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8254                                 script_run_begin);
8255             NOT_REACHED; /* NOTREACHED */
8256
8257         case CURLYM_A: /* we've just matched an A */
8258             ST.count++;
8259             /* after first match, determine A's length: u.curlym.alen */
8260             if (ST.count == 1) {
8261                 if (reginfo->is_utf8_target) {
8262                     char *s = st->locinput;
8263                     while (s < locinput) {
8264                         ST.alen++;
8265                         s += UTF8SKIP(s);
8266                     }
8267                 }
8268                 else {
8269                     ST.alen = locinput - st->locinput;
8270                 }
8271                 if (ST.alen == 0)
8272                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8273             }
8274             DEBUG_EXECUTE_r(
8275                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8276                           depth, (IV) ST.count, (IV)ST.alen)
8277             );
8278
8279             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8280                 goto fake_end;
8281
8282             {
8283                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8284                 if ( max == REG_INFTY || ST.count < max )
8285                     goto curlym_do_A; /* try to match another A */
8286             }
8287             goto curlym_do_B; /* try to match B */
8288
8289         case CURLYM_A_fail: /* just failed to match an A */
8290             REGCP_UNWIND(ST.cp);
8291
8292
8293             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8294                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8295                 sayNO;
8296
8297           curlym_do_B: /* execute the B in /A{m,n}B/  */
8298             if (ST.c1 == CHRTEST_UNINIT) {
8299                 /* calculate c1 and c2 for possible match of 1st char
8300                  * following curly */
8301                 ST.c1 = ST.c2 = CHRTEST_VOID;
8302                 assert(ST.B);
8303                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8304                     regnode *text_node = ST.B;
8305                     if (! HAS_TEXT(text_node))
8306                         FIND_NEXT_IMPT(text_node);
8307                     if (PL_regkind[OP(text_node)] == EXACT) {
8308                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8309                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8310                            reginfo))
8311                         {
8312                             sayNO;
8313                         }
8314                     }
8315                 }
8316             }
8317
8318             DEBUG_EXECUTE_r(
8319                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8320                     depth, (IV)ST.count)
8321                 );
8322             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8323                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8324
8325                            /* (We can use memEQ and memNE in this file without
8326                             * having to worry about one being shorter than the
8327                             * other, since the first byte of each gives the
8328                             * length of the character) */
8329                     if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
8330                                                               reginfo->strend))
8331                         && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
8332                                                              reginfo->strend)))
8333                     {
8334                         /* simulate B failing */
8335                         DEBUG_OPTIMISE_r(
8336                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8337                                 depth,
8338                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8339                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8340                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8341                         );
8342                         state_num = CURLYM_B_fail;
8343                         goto reenter_switch;
8344                     }
8345                 }
8346                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8347                     /* simulate B failing */
8348                     DEBUG_OPTIMISE_r(
8349                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8350                             depth,
8351                             (int) nextchr, ST.c1, ST.c2)
8352                     );
8353                     state_num = CURLYM_B_fail;
8354                     goto reenter_switch;
8355                 }
8356             }
8357
8358             if (ST.me->flags) {
8359                 /* emulate CLOSE: mark current A as captured */
8360                 U32 paren = (U32)ST.me->flags;
8361                 if (ST.count) {
8362                     CLOSE_CAPTURE(paren,
8363                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
8364                         locinput - reginfo->strbeg);
8365                 }
8366                 else
8367                     rex->offs[paren].end = -1;
8368
8369                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8370                 {
8371                     if (ST.count)
8372                         goto fake_end;
8373                     else
8374                         sayNO;
8375                 }
8376             }
8377
8378             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
8379                             script_run_begin);
8380             NOT_REACHED; /* NOTREACHED */
8381
8382         case CURLYM_B_fail: /* just failed to match a B */
8383             REGCP_UNWIND(ST.cp);
8384             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8385             if (ST.minmod) {
8386                 I32 max = ARG2(ST.me);
8387                 if (max != REG_INFTY && ST.count == max)
8388                     sayNO;
8389                 goto curlym_do_A; /* try to match a further A */
8390             }
8391             /* backtrack one A */
8392             if (ST.count == ARG1(ST.me) /* min */)
8393                 sayNO;
8394             ST.count--;
8395             SET_locinput(HOPc(locinput, -ST.alen));
8396             goto curlym_do_B; /* try to match B */
8397
8398 #undef ST
8399 #define ST st->u.curly
8400
8401 #define CURLY_SETPAREN(paren, success) \
8402     if (paren) { \
8403         if (success) { \
8404             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8405                                  locinput - reginfo->strbeg); \
8406         } \
8407         else { \
8408             rex->offs[paren].end = -1; \
8409             rex->lastparen      = ST.lastparen; \
8410             rex->lastcloseparen = ST.lastcloseparen; \
8411         } \
8412     }
8413
8414         case STAR:              /*  /A*B/ where A is width 1 char */
8415             ST.paren = 0;
8416             ST.min = 0;
8417             ST.max = REG_INFTY;
8418             scan = NEXTOPER(scan);
8419             goto repeat;
8420
8421         case PLUS:              /*  /A+B/ where A is width 1 char */
8422             ST.paren = 0;
8423             ST.min = 1;
8424             ST.max = REG_INFTY;
8425             scan = NEXTOPER(scan);
8426             goto repeat;
8427
8428         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8429             ST.paren = scan->flags;     /* Which paren to set */
8430             ST.lastparen      = rex->lastparen;
8431             ST.lastcloseparen = rex->lastcloseparen;
8432             if (ST.paren > maxopenparen)
8433                 maxopenparen = ST.paren;
8434             ST.min = ARG1(scan);  /* min to match */
8435             ST.max = ARG2(scan);  /* max to match */
8436             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8437
8438             /* handle the single-char capture called as a GOSUB etc */
8439             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8440             {
8441                 char *li = locinput;
8442                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
8443                     sayNO;
8444                 SET_locinput(li);
8445                 goto fake_end;
8446             }
8447
8448             goto repeat;
8449
8450         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8451             ST.paren = 0;
8452             ST.min = ARG1(scan);  /* min to match */
8453             ST.max = ARG2(scan);  /* max to match */
8454             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8455           repeat:
8456             /*
8457             * Lookahead to avoid useless match attempts
8458             * when we know what character comes next.
8459             *
8460             * Used to only do .*x and .*?x, but now it allows
8461             * for )'s, ('s and (?{ ... })'s to be in the way
8462             * of the quantifier and the EXACT-like node.  -- japhy
8463             */
8464
8465             assert(ST.min <= ST.max);
8466             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8467                 ST.c1 = ST.c2 = CHRTEST_VOID;
8468             }
8469             else {
8470                 regnode *text_node = next;
8471
8472                 if (! HAS_TEXT(text_node))
8473                     FIND_NEXT_IMPT(text_node);
8474
8475                 if (! HAS_TEXT(text_node))
8476                     ST.c1 = ST.c2 = CHRTEST_VOID;
8477                 else {
8478                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8479                         ST.c1 = ST.c2 = CHRTEST_VOID;
8480                     }
8481                     else {
8482                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8483                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8484                            reginfo))
8485                         {
8486                             sayNO;
8487                         }
8488                     }
8489                 }
8490             }
8491
8492             ST.A = scan;
8493             ST.B = next;
8494             if (minmod) {
8495                 char *li = locinput;
8496                 minmod = 0;
8497                 if (ST.min &&
8498                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
8499                             < ST.min)
8500                     sayNO;
8501                 SET_locinput(li);
8502                 ST.count = ST.min;
8503                 REGCP_SET(ST.cp);
8504                 if (ST.c1 == CHRTEST_VOID)
8505                     goto curly_try_B_min;
8506
8507                 ST.oldloc = locinput;
8508
8509                 /* set ST.maxpos to the furthest point along the
8510                  * string that could possibly match */
8511                 if  (ST.max == REG_INFTY) {
8512                     ST.maxpos = loceol - 1;
8513                     if (utf8_target)
8514                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8515                             ST.maxpos--;
8516                 }
8517                 else if (utf8_target) {
8518                     int m = ST.max - ST.min;
8519                     for (ST.maxpos = locinput;
8520                          m >0 && ST.maxpos <  loceol; m--)
8521                         ST.maxpos += UTF8SKIP(ST.maxpos);
8522                 }
8523                 else {
8524                     ST.maxpos = locinput + ST.max - ST.min;
8525                     if (ST.maxpos >=  loceol)
8526                         ST.maxpos =  loceol - 1;
8527                 }
8528                 goto curly_try_B_min_known;
8529
8530             }
8531             else {
8532                 /* avoid taking address of locinput, so it can remain
8533                  * a register var */
8534                 char *li = locinput;
8535                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
8536                 if (ST.count < ST.min)
8537                     sayNO;
8538                 SET_locinput(li);
8539                 if ((ST.count > ST.min)
8540                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8541                 {
8542                     /* A{m,n} must come at the end of the string, there's
8543                      * no point in backing off ... */
8544                     ST.min = ST.count;
8545                     /* ...except that $ and \Z can match before *and* after
8546                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8547                        We may back off by one in this case. */
8548                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8549                         ST.min--;
8550                 }
8551                 REGCP_SET(ST.cp);
8552                 goto curly_try_B_max;
8553             }
8554             NOT_REACHED; /* NOTREACHED */
8555
8556         case CURLY_B_min_fail:
8557             /* failed to find B in a non-greedy match.
8558              * Handles both cases where c1,c2 valid or not */
8559
8560             REGCP_UNWIND(ST.cp);
8561             if (ST.paren) {
8562                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8563             }
8564
8565             if (ST.c1 == CHRTEST_VOID) {
8566                 /* failed -- move forward one */
8567                 char *li = locinput;
8568                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
8569                     sayNO;
8570                 }
8571                 locinput = li;
8572                 ST.count++;
8573                 if (!(   ST.count <= ST.max
8574                         /* count overflow ? */
8575                      || (ST.max == REG_INFTY && ST.count > 0))
8576                 )
8577                     sayNO;
8578             }
8579             else {
8580                 int n;
8581                 /* Couldn't or didn't -- move forward. */
8582                 ST.oldloc = locinput;
8583                 if (utf8_target)
8584                     locinput += UTF8SKIP(locinput);
8585                 else
8586                     locinput++;
8587                 ST.count++;
8588
8589               curly_try_B_min_known:
8590                 /* find the next place where 'B' could work, then call B */
8591                 if (utf8_target) {
8592                     n = (ST.oldloc == locinput) ? 0 : 1;
8593                     if (ST.c1 == ST.c2) {
8594                         /* set n to utf8_distance(oldloc, locinput) */
8595                         while (    locinput <= ST.maxpos
8596                                &&  locinput < loceol
8597                                &&  memNE(locinput, ST.c1_utf8,
8598                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8599                         {
8600                             locinput += UTF8_SAFE_SKIP(locinput,
8601                                                        reginfo->strend);
8602                             n++;
8603                         }
8604                     }
8605                     else {
8606                         /* set n to utf8_distance(oldloc, locinput) */
8607                         while (   locinput <= ST.maxpos
8608                                && locinput < loceol
8609                                && memNE(locinput, ST.c1_utf8,
8610                                      UTF8_SAFE_SKIP(locinput, reginfo->strend))
8611                                && memNE(locinput, ST.c2_utf8,
8612                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8613                         {
8614                             locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
8615                             n++;
8616                         }
8617                     }
8618                 }
8619                 else {  /* Not utf8_target */
8620                     if (ST.c1 == ST.c2) {
8621                         locinput = (char *) memchr(locinput,
8622                                                    ST.c1,
8623                                                    ST.maxpos + 1 - locinput);
8624                         if (! locinput) {
8625                             locinput = ST.maxpos + 1;
8626                         }
8627                     }
8628                     else {
8629                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8630
8631                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8632                             while (   locinput <= ST.maxpos
8633                                    && UCHARAT(locinput) != ST.c1
8634                                    && UCHARAT(locinput) != ST.c2)
8635                             {
8636                                 locinput++;
8637                             }
8638                         }
8639                         else {
8640                             /* If c1 and c2 only differ by a single bit, we can
8641                              * avoid a conditional each time through the loop,
8642                              * at the expense of a little preliminary setup and
8643                              * an extra mask each iteration.  By masking out
8644                              * that bit, we match exactly two characters, c1
8645                              * and c2, and so we don't have to test for both.
8646                              * On both ASCII and EBCDIC platforms, most of the
8647                              * ASCII-range and Latin1-range folded equivalents
8648                              * differ only in a single bit, so this is actually
8649                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8650                              * 0x61). */
8651                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8652                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8653                             while (   locinput <= ST.maxpos
8654                                    && (UCHARAT(locinput) & c1_c2_mask)
8655                                                                 != c1_masked)
8656                             {
8657                                 locinput++;
8658                             }
8659                         }
8660                     }
8661                     n = locinput - ST.oldloc;
8662                 }
8663                 if (locinput > ST.maxpos)
8664                     sayNO;
8665                 if (n) {
8666                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8667                      * at b; check that everything between oldloc and
8668                      * locinput matches */
8669                     char *li = ST.oldloc;
8670                     ST.count += n;
8671                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
8672                         sayNO;
8673                     assert(n == REG_INFTY || locinput == li);
8674                 }
8675             }
8676
8677           curly_try_B_min:
8678             CURLY_SETPAREN(ST.paren, ST.count);
8679             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
8680                             script_run_begin);
8681             NOT_REACHED; /* NOTREACHED */
8682
8683
8684           curly_try_B_max:
8685             /* a successful greedy match: now try to match B */
8686             {
8687                 bool could_match = locinput <  loceol;
8688
8689                 /* If it could work, try it. */
8690                 if (ST.c1 != CHRTEST_VOID && could_match) {
8691                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8692                     {
8693                         could_match =  memEQ(locinput, ST.c1_utf8,
8694                                              UTF8_SAFE_SKIP(locinput,
8695                                                             reginfo->strend))
8696                                     || memEQ(locinput, ST.c2_utf8,
8697                                              UTF8_SAFE_SKIP(locinput,
8698                                                             reginfo->strend));
8699                     }
8700                     else {
8701                         could_match =   UCHARAT(locinput) == ST.c1
8702                                      || UCHARAT(locinput) == ST.c2;
8703                     }
8704                 }
8705                 if (ST.c1 == CHRTEST_VOID || could_match) {
8706                     CURLY_SETPAREN(ST.paren, ST.count);
8707                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
8708                                     script_run_begin);
8709                     NOT_REACHED; /* NOTREACHED */
8710                 }
8711             }
8712             /* FALLTHROUGH */
8713
8714         case CURLY_B_max_fail:
8715             /* failed to find B in a greedy match */
8716
8717             REGCP_UNWIND(ST.cp);
8718             if (ST.paren) {
8719                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8720             }
8721             /*  back up. */
8722             if (--ST.count < ST.min)
8723                 sayNO;
8724             locinput = HOPc(locinput, -1);
8725             goto curly_try_B_max;
8726
8727 #undef ST
8728
8729         case END: /*  last op of main pattern  */
8730           fake_end:
8731             if (cur_eval) {
8732                 /* we've just finished A in /(??{A})B/; now continue with B */
8733                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8734                 st->u.eval.prev_rex = rex_sv;           /* inner */
8735
8736                 /* Save *all* the positions. */
8737                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8738                 rex_sv = CUR_EVAL.prev_rex;
8739                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8740                 SET_reg_curpm(rex_sv);
8741                 rex = ReANY(rex_sv);
8742                 rexi = RXi_GET(rex);
8743
8744                 st->u.eval.prev_curlyx = cur_curlyx;
8745                 cur_curlyx = CUR_EVAL.prev_curlyx;
8746
8747                 REGCP_SET(st->u.eval.lastcp);
8748
8749                 /* Restore parens of the outer rex without popping the
8750                  * savestack */
8751                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8752
8753                 st->u.eval.prev_eval = cur_eval;
8754                 cur_eval = CUR_EVAL.prev_eval;
8755                 DEBUG_EXECUTE_r(
8756                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8757                                       depth, cur_eval););
8758                 if ( nochange_depth )
8759                     nochange_depth--;
8760
8761                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8762
8763                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
8764                                     st->u.eval.prev_eval->u.eval.B,
8765                                     locinput, loceol, script_run_begin);
8766             }
8767
8768             if (locinput < reginfo->till) {
8769                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8770                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8771                                       PL_colors[4],
8772                                       (long)(locinput - startpos),
8773                                       (long)(reginfo->till - startpos),
8774                                       PL_colors[5]));
8775
8776                 sayNO_SILENT;           /* Cannot match: too short. */
8777             }
8778             sayYES;                     /* Success! */
8779
8780         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8781             DEBUG_EXECUTE_r(
8782             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8783                 depth, PL_colors[4], PL_colors[5]));
8784             sayYES;                     /* Success! */
8785
8786 #undef  ST
8787 #define ST st->u.ifmatch
8788
8789         case SUSPEND:   /* (?>A) */
8790             ST.wanted = 1;
8791             ST.start = locinput;
8792             ST.end = loceol;
8793             ST.count = 1;
8794             goto do_ifmatch;
8795
8796         case UNLESSM:   /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
8797             ST.wanted = 0;
8798             goto ifmatch_trivial_fail_test;
8799
8800         case IFMATCH:   /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
8801             ST.wanted = 1;
8802           ifmatch_trivial_fail_test:
8803             ST.count = scan->next_off + 1; /* next_off repurposed to be
8804                                               lookbehind count, requires
8805                                               non-zero flags */
8806             if (! scan->flags) {    /* 'flags' zero means lookahed */
8807
8808                 /* Lookahead starts here and ends at the normal place */
8809                 ST.start = locinput;
8810                 ST.end = loceol;
8811             }
8812             else {
8813                 PERL_UINT_FAST8_T back_count = scan->flags;
8814                 char * s;
8815
8816                 /* Lookbehind can look beyond the current position */
8817                 ST.end = loceol;
8818
8819                 /* ... and starts at the first place in the input that is in
8820                  * the range of the possible start positions */
8821                 for (; ST.count > 0; ST.count--, back_count--) {
8822                     s = HOPBACKc(locinput, back_count);
8823                     if (s) {
8824                         ST.start = s;
8825                         goto do_ifmatch;
8826                     }
8827                 }
8828
8829                 /* If the lookbehind doesn't start in the actual string, is a
8830                  * trivial match failure */
8831                 if (logical) {
8832                     logical = 0;
8833                     sw = 1 - cBOOL(ST.wanted);
8834                 }
8835                 else if (ST.wanted)
8836                     sayNO;
8837
8838                 /* Here, we didn't want it to match, so is actually success */
8839                 next = scan + ARG(scan);
8840                 if (next == scan)
8841                     next = NULL;
8842                 break;
8843             }
8844
8845           do_ifmatch:
8846             ST.me = scan;
8847             ST.logical = logical;
8848             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8849
8850             /* execute body of (?...A) */
8851             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
8852                                 ST.end, script_run_begin);
8853             NOT_REACHED; /* NOTREACHED */
8854
8855         {
8856             bool matched;
8857
8858         case IFMATCH_A_fail: /* body of (?...A) failed */
8859             if (! ST.logical && ST.count > 1) {
8860
8861                 /* It isn't a real failure until we've tried all starting
8862                  * positions.  Move to the next starting position and retry */
8863                 ST.count--;
8864                 ST.start = HOPc(ST.start, 1);
8865                 scan = ST.me;
8866                 logical = ST.logical;
8867                 goto do_ifmatch;
8868             }
8869
8870             /* Here, all starting positions have been tried. */
8871             matched = FALSE;
8872             goto ifmatch_done;
8873
8874         case IFMATCH_A: /* body of (?...A) succeeded */
8875             matched = TRUE;
8876           ifmatch_done:
8877             sw = matched == ST.wanted;
8878             if (! ST.logical && !sw) {
8879                 sayNO;
8880             }
8881
8882             if (OP(ST.me) != SUSPEND) {
8883                 /* restore old position except for (?>...) */
8884                 locinput = st->locinput;
8885                 loceol = st->loceol;
8886                 script_run_begin = st->sr0;
8887             }
8888             scan = ST.me + ARG(ST.me);
8889             if (scan == ST.me)
8890                 scan = NULL;
8891             continue; /* execute B */
8892         }
8893
8894 #undef ST
8895
8896         case LONGJMP: /*  alternative with many branches compiles to
8897                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8898             next = scan + ARG(scan);
8899             if (next == scan)
8900                 next = NULL;
8901             break;
8902
8903         case COMMIT:  /*  (*COMMIT)  */
8904             reginfo->cutpoint = loceol;
8905             /* FALLTHROUGH */
8906
8907         case PRUNE:   /*  (*PRUNE)   */
8908             if (scan->flags)
8909                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8910             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
8911                             script_run_begin);
8912             NOT_REACHED; /* NOTREACHED */
8913
8914         case COMMIT_next_fail:
8915             no_final = 1;
8916             /* FALLTHROUGH */
8917             sayNO;
8918             NOT_REACHED; /* NOTREACHED */
8919
8920         case OPFAIL:   /* (*FAIL)  */
8921             if (scan->flags)
8922                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8923             if (logical) {
8924                 /* deal with (?(?!)X|Y) properly,
8925                  * make sure we trigger the no branch
8926                  * of the trailing IFTHEN structure*/
8927                 sw= 0;
8928                 break;
8929             } else {
8930                 sayNO;
8931             }
8932             NOT_REACHED; /* NOTREACHED */
8933
8934 #define ST st->u.mark
8935         case MARKPOINT: /*  (*MARK:foo)  */
8936             ST.prev_mark = mark_state;
8937             ST.mark_name = sv_commit = sv_yes_mark
8938                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8939             mark_state = st;
8940             ST.mark_loc = locinput;
8941             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
8942                                 script_run_begin);
8943             NOT_REACHED; /* NOTREACHED */
8944
8945         case MARKPOINT_next:
8946             mark_state = ST.prev_mark;
8947             sayYES;
8948             NOT_REACHED; /* NOTREACHED */
8949
8950         case MARKPOINT_next_fail:
8951             if (popmark && sv_eq(ST.mark_name,popmark))
8952             {
8953                 if (ST.mark_loc > startpoint)
8954                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8955                 popmark = NULL; /* we found our mark */
8956                 sv_commit = ST.mark_name;
8957
8958                 DEBUG_EXECUTE_r({
8959                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8960                             depth,
8961                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8962                 });
8963             }
8964             mark_state = ST.prev_mark;
8965             sv_yes_mark = mark_state ?
8966                 mark_state->u.mark.mark_name : NULL;
8967             sayNO;
8968             NOT_REACHED; /* NOTREACHED */
8969
8970         case SKIP:  /*  (*SKIP)  */
8971             if (!scan->flags) {
8972                 /* (*SKIP) : if we fail we cut here*/
8973                 ST.mark_name = NULL;
8974                 ST.mark_loc = locinput;
8975                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
8976                                 script_run_begin);
8977             } else {
8978                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8979                    otherwise do nothing.  Meaning we need to scan
8980                  */
8981                 regmatch_state *cur = mark_state;
8982                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8983
8984                 while (cur) {
8985                     if ( sv_eq( cur->u.mark.mark_name,
8986                                 find ) )
8987                     {
8988                         ST.mark_name = find;
8989                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
8990                                          script_run_begin);
8991                     }
8992                     cur = cur->u.mark.prev_mark;
8993                 }
8994             }
8995             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8996             break;
8997
8998         case SKIP_next_fail:
8999             if (ST.mark_name) {
9000                 /* (*CUT:NAME) - Set up to search for the name as we
9001                    collapse the stack*/
9002                 popmark = ST.mark_name;
9003             } else {
9004                 /* (*CUT) - No name, we cut here.*/
9005                 if (ST.mark_loc > startpoint)
9006                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9007                 /* but we set sv_commit to latest mark_name if there
9008                    is one so they can test to see how things lead to this
9009                    cut */
9010                 if (mark_state)
9011                     sv_commit=mark_state->u.mark.mark_name;
9012             }
9013             no_final = 1;
9014             sayNO;
9015             NOT_REACHED; /* NOTREACHED */
9016 #undef ST
9017
9018         case LNBREAK: /* \R */
9019             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9020                 locinput += n;
9021             } else
9022                 sayNO;
9023             break;
9024
9025         default:
9026             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9027                           PTR2UV(scan), OP(scan));
9028             Perl_croak(aTHX_ "regexp memory corruption");
9029
9030         /* this is a point to jump to in order to increment
9031          * locinput by one character */
9032           increment_locinput:
9033             assert(!NEXTCHR_IS_EOS);
9034             if (utf8_target) {
9035                 locinput += PL_utf8skip[nextchr];
9036                 /* locinput is allowed to go 1 char off the end (signifying
9037                  * EOS), but not 2+ */
9038                 if (locinput >  loceol)
9039                     sayNO;
9040             }
9041             else
9042                 locinput++;
9043             break;
9044
9045         } /* end switch */
9046
9047         /* switch break jumps here */
9048         scan = next; /* prepare to execute the next op and ... */
9049         continue;    /* ... jump back to the top, reusing st */
9050         /* NOTREACHED */
9051
9052       push_yes_state:
9053         /* push a state that backtracks on success */
9054         st->u.yes.prev_yes_state = yes_state;
9055         yes_state = st;
9056         /* FALLTHROUGH */
9057       push_state:
9058         /* push a new regex state, then continue at scan  */
9059         {
9060             regmatch_state *newst;
9061
9062             DEBUG_STACK_r({
9063                 regmatch_state *cur = st;
9064                 regmatch_state *curyes = yes_state;
9065                 U32 i;
9066                 regmatch_slab *slab = PL_regmatch_slab;
9067                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9068                     if (cur < SLAB_FIRST(slab)) {
9069                         slab = slab->prev;
9070                         cur = SLAB_LAST(slab);
9071                     }
9072                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9073                         depth,
9074                         i ? "    " : "push",
9075                         depth - i, PL_reg_name[cur->resume_state],
9076                         (curyes == cur) ? "yes" : ""
9077                     );
9078                     if (curyes == cur)
9079                         curyes = cur->u.yes.prev_yes_state;
9080                 }
9081             } else
9082                 DEBUG_STATE_pp("push")
9083             );
9084             depth++;
9085             st->locinput = locinput;
9086             st->loceol = loceol;
9087             st->sr0 = script_run_begin;
9088             newst = st+1;
9089             if (newst >  SLAB_LAST(PL_regmatch_slab))
9090                 newst = S_push_slab(aTHX);
9091             PL_regmatch_state = newst;
9092
9093             locinput = pushinput;
9094             loceol = pusheol;
9095             script_run_begin = pushsr0;
9096             st = newst;
9097             continue;
9098             /* NOTREACHED */
9099         }
9100     }
9101 #ifdef SOLARIS_BAD_OPTIMIZER
9102 #  undef PL_charclass
9103 #endif
9104
9105     /*
9106     * We get here only if there's trouble -- normally "case END" is
9107     * the terminating point.
9108     */
9109     Perl_croak(aTHX_ "corrupted regexp pointers");
9110     NOT_REACHED; /* NOTREACHED */
9111
9112   yes:
9113     if (yes_state) {
9114         /* we have successfully completed a subexpression, but we must now
9115          * pop to the state marked by yes_state and continue from there */
9116         assert(st != yes_state);
9117 #ifdef DEBUGGING
9118         while (st != yes_state) {
9119             st--;
9120             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9121                 PL_regmatch_slab = PL_regmatch_slab->prev;
9122                 st = SLAB_LAST(PL_regmatch_slab);
9123             }
9124             DEBUG_STATE_r({
9125                 if (no_final) {
9126                     DEBUG_STATE_pp("pop (no final)");
9127                 } else {
9128                     DEBUG_STATE_pp("pop (yes)");
9129                 }
9130             });
9131             depth--;
9132         }
9133 #else
9134         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9135             || yes_state > SLAB_LAST(PL_regmatch_slab))
9136         {
9137             /* not in this slab, pop slab */
9138             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9139             PL_regmatch_slab = PL_regmatch_slab->prev;
9140             st = SLAB_LAST(PL_regmatch_slab);
9141         }
9142         depth -= (st - yes_state);
9143 #endif
9144         st = yes_state;
9145         yes_state = st->u.yes.prev_yes_state;
9146         PL_regmatch_state = st;
9147
9148         if (no_final) {
9149             locinput= st->locinput;
9150             loceol= st->loceol;
9151             script_run_begin = st->sr0;
9152         }
9153         state_num = st->resume_state + no_final;
9154         goto reenter_switch;
9155     }
9156
9157     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9158                           PL_colors[4], PL_colors[5]));
9159
9160     if (reginfo->info_aux_eval) {
9161         /* each successfully executed (?{...}) block does the equivalent of
9162          *   local $^R = do {...}
9163          * When popping the save stack, all these locals would be undone;
9164          * bypass this by setting the outermost saved $^R to the latest
9165          * value */
9166         /* I dont know if this is needed or works properly now.
9167          * see code related to PL_replgv elsewhere in this file.
9168          * Yves
9169          */
9170         if (oreplsv != GvSV(PL_replgv)) {
9171             sv_setsv(oreplsv, GvSV(PL_replgv));
9172             SvSETMAGIC(oreplsv);
9173         }
9174     }
9175     result = 1;
9176     goto final_exit;
9177
9178   no:
9179     DEBUG_EXECUTE_r(
9180         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9181             depth,
9182             PL_colors[4], PL_colors[5])
9183         );
9184
9185   no_silent:
9186     if (no_final) {
9187         if (yes_state) {
9188             goto yes;
9189         } else {
9190             goto final_exit;
9191         }
9192     }
9193     if (depth) {
9194         /* there's a previous state to backtrack to */
9195         st--;
9196         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9197             PL_regmatch_slab = PL_regmatch_slab->prev;
9198             st = SLAB_LAST(PL_regmatch_slab);
9199         }
9200         PL_regmatch_state = st;
9201         locinput= st->locinput;
9202         loceol= st->loceol;
9203         script_run_begin = st->sr0;
9204
9205         DEBUG_STATE_pp("pop");
9206         depth--;
9207         if (yes_state == st)
9208             yes_state = st->u.yes.prev_yes_state;
9209
9210         state_num = st->resume_state + 1; /* failure = success + 1 */
9211         PERL_ASYNC_CHECK();
9212         goto reenter_switch;
9213     }
9214     result = 0;
9215
9216   final_exit:
9217     if (rex->intflags & PREGf_VERBARG_SEEN) {
9218         SV *sv_err = get_sv("REGERROR", 1);
9219         SV *sv_mrk = get_sv("REGMARK", 1);
9220         if (result) {
9221             sv_commit = &PL_sv_no;
9222             if (!sv_yes_mark)
9223                 sv_yes_mark = &PL_sv_yes;
9224         } else {
9225             if (!sv_commit)
9226                 sv_commit = &PL_sv_yes;
9227             sv_yes_mark = &PL_sv_no;
9228         }
9229         assert(sv_err);
9230         assert(sv_mrk);
9231         sv_setsv(sv_err, sv_commit);
9232         sv_setsv(sv_mrk, sv_yes_mark);
9233     }
9234
9235
9236     if (last_pushed_cv) {
9237         dSP;
9238         /* see "Some notes about MULTICALL" above */
9239         POP_MULTICALL;
9240         PERL_UNUSED_VAR(SP);
9241     }
9242     else
9243         LEAVE_SCOPE(orig_savestack_ix);
9244
9245     assert(!result ||  locinput - reginfo->strbeg >= 0);
9246     return result ?  locinput - reginfo->strbeg : -1;
9247 }
9248
9249 /*
9250  - regrepeat - repeatedly match something simple, report how many
9251  *
9252  * What 'simple' means is a node which can be the operand of a quantifier like
9253  * '+', or {1,3}
9254  *
9255  * startposp - pointer to a pointer to the start position.  This is updated
9256  *             to point to the byte following the highest successful
9257  *             match.
9258  * p         - the regnode to be repeatedly matched against.
9259  * loceol    - pointer to the end position beyond which we aren't supposed to
9260  *             look.
9261  * reginfo   - struct holding match state, such as utf8_target
9262  * max       - maximum number of things to match.
9263  * depth     - (for debugging) backtracking depth.
9264  */
9265 STATIC I32
9266 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9267             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9268 {
9269     dVAR;
9270     char *scan;     /* Pointer to current position in target string */
9271     I32 c;
9272     char *this_eol = loceol;   /* potentially adjusted version. */
9273     I32 hardcount = 0;  /* How many matches so far */
9274     bool utf8_target = reginfo->is_utf8_target;
9275     unsigned int to_complement = 0;  /* Invert the result? */
9276     UV utf8_flags = 0;
9277     _char_class_number classnum;
9278
9279     PERL_ARGS_ASSERT_REGREPEAT;
9280
9281     /* This routine is structured so that we switch on the input OP.  Each OP
9282      * case: statement contains a loop to repeatedly apply the OP, advancing
9283      * the input until it fails, or reaches the end of the input, or until it
9284      * reaches the upper limit of matches. */
9285
9286     scan = *startposp;
9287     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9288                                max */
9289         max = I32_MAX;
9290     else if (! utf8_target && this_eol - scan > max)
9291         this_eol = scan + max;
9292
9293     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9294      * to the maximum of how far we should go in it (leaving it set to the real
9295      * end, if the maximum permissible would take us beyond that).  This allows
9296      * us to make the loop exit condition that we haven't gone past <this_eol> to
9297      * also mean that we haven't exceeded the max permissible count, saving a
9298      * test each time through the loops.  But it assumes that the OP matches a
9299      * single byte, which is true for most of the OPs below when applied to a
9300      * non-UTF-8 target.  Those relatively few OPs that don't have this
9301      * characteristic will have to compensate.
9302      *
9303      * There is no adjustment for UTF-8 targets, as the number of bytes per
9304      * character varies.  OPs will have to test both that the count is less
9305      * than the max permissible (using <hardcount> to keep track), and that we
9306      * are still within the bounds of the string (using <this_eol>.  A few OPs
9307      * match a single byte no matter what the encoding.  They can omit the max
9308      * test if, for the UTF-8 case, they do the adjustment that was skipped
9309      * above.
9310      *
9311      * Thus, the code above sets things up for the common case; and exceptional
9312      * cases need extra work; the common case is to make sure <scan> doesn't
9313      * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9314      * count doesn't exceed the maximum permissible */
9315
9316     switch (OP(p)) {
9317     case REG_ANY:
9318         if (utf8_target) {
9319             while (scan < this_eol && hardcount < max && *scan != '\n') {
9320                 scan += UTF8SKIP(scan);
9321                 hardcount++;
9322             }
9323         } else {
9324             scan = (char *) memchr(scan, '\n', this_eol - scan);
9325             if (! scan) {
9326                 scan = this_eol;
9327             }
9328         }
9329         break;
9330     case SANY:
9331         if (utf8_target) {
9332             while (scan < this_eol && hardcount < max) {
9333                 scan += UTF8SKIP(scan);
9334                 hardcount++;
9335             }
9336         }
9337         else
9338             scan = this_eol;
9339         break;
9340     case EXACTL:
9341         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9342         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9343             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9344         }
9345         goto do_exact;
9346
9347     case EXACT_ONLY8:
9348         if (! utf8_target) {
9349             break;
9350         }
9351         /* FALLTHROUGH */
9352     case EXACT:
9353       do_exact:
9354         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9355
9356         c = (U8)*STRING(p);
9357
9358         /* Can use a simple find if the pattern char to match on is invariant
9359          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9360          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9361          * true iff it doesn't matter if the argument is in UTF-8 or not */
9362         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9363             if (utf8_target && this_eol - scan > max) {
9364                 /* We didn't adjust <this_eol> because is UTF-8, but ok to do so,
9365                  * since here, to match at all, 1 char == 1 byte */
9366                 this_eol = scan + max;
9367             }
9368             scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9369         }
9370         else if (reginfo->is_utf8_pat) {
9371             if (utf8_target) {
9372                 STRLEN scan_char_len;
9373
9374                 /* When both target and pattern are UTF-8, we have to do
9375                  * string EQ */
9376                 while (hardcount < max
9377                        && scan < this_eol
9378                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9379                        && memEQ(scan, STRING(p), scan_char_len))
9380                 {
9381                     scan += scan_char_len;
9382                     hardcount++;
9383                 }
9384             }
9385             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9386
9387                 /* Target isn't utf8; convert the character in the UTF-8
9388                  * pattern to non-UTF8, and do a simple find */
9389                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9390                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9391             } /* else pattern char is above Latin1, can't possibly match the
9392                  non-UTF-8 target */
9393         }
9394         else {
9395
9396             /* Here, the string must be utf8; pattern isn't, and <c> is
9397              * different in utf8 than not, so can't compare them directly.
9398              * Outside the loop, find the two utf8 bytes that represent c, and
9399              * then look for those in sequence in the utf8 string */
9400             U8 high = UTF8_TWO_BYTE_HI(c);
9401             U8 low = UTF8_TWO_BYTE_LO(c);
9402
9403             while (hardcount < max
9404                     && scan + 1 < this_eol
9405                     && UCHARAT(scan) == high
9406                     && UCHARAT(scan + 1) == low)
9407             {
9408                 scan += 2;
9409                 hardcount++;
9410             }
9411         }
9412         break;
9413
9414     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9415         assert(! reginfo->is_utf8_pat);
9416         /* FALLTHROUGH */
9417     case EXACTFAA:
9418         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9419         if (reginfo->is_utf8_pat || ! utf8_target) {
9420
9421             /* The possible presence of a MICRO SIGN in the pattern forbids us
9422              * to view a non-UTF-8 pattern as folded when there is a UTF-8
9423              * target.  */
9424             utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
9425         }
9426         goto do_exactf;
9427
9428     case EXACTFL:
9429         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9430         utf8_flags = FOLDEQ_LOCALE;
9431         goto do_exactf;
9432
9433     case EXACTF:   /* This node only generated for non-utf8 patterns */
9434         assert(! reginfo->is_utf8_pat);
9435         goto do_exactf;
9436
9437     case EXACTFLU8:
9438         if (! utf8_target) {
9439             break;
9440         }
9441         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9442                                     | FOLDEQ_S2_FOLDS_SANE;
9443         goto do_exactf;
9444
9445     case EXACTFU_ONLY8:
9446         if (! utf8_target) {
9447             break;
9448         }
9449         assert(reginfo->is_utf8_pat);
9450         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9451         goto do_exactf;
9452
9453     case EXACTFU:
9454         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9455         /* FALLTHROUGH */
9456
9457     case EXACTFUP:
9458
9459       do_exactf: {
9460         int c1, c2;
9461         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9462
9463         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9464
9465         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9466                                         reginfo))
9467         {
9468             if (c1 == CHRTEST_VOID) {
9469                 /* Use full Unicode fold matching */
9470                 char *tmpeol = loceol;
9471                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9472                 while (hardcount < max
9473                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9474                                              STRING(p), NULL, pat_len,
9475                                              reginfo->is_utf8_pat, utf8_flags))
9476                 {
9477                     scan = tmpeol;
9478                     tmpeol = loceol;
9479                     hardcount++;
9480                 }
9481             }
9482             else if (utf8_target) {
9483                 if (c1 == c2) {
9484                     while (scan < this_eol
9485                            && hardcount < max
9486                            && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9487                                                                   loceol)))
9488                     {
9489                         scan += UTF8SKIP(c1_utf8);
9490                         hardcount++;
9491                     }
9492                 }
9493                 else {
9494                     while (scan < this_eol
9495                            && hardcount < max
9496                            && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9497                                                                      loceol))
9498                                || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
9499                                                                      loceol))))
9500                     {
9501                         scan += UTF8_SAFE_SKIP(scan, loceol);
9502                         hardcount++;
9503                     }
9504                 }
9505             }
9506             else if (c1 == c2) {
9507                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1);
9508             }
9509             else {
9510                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9511                  * a conditional each time through the loop if the characters
9512                  * differ only in a single bit, as is the usual situation */
9513                 U8 c1_c2_bits_differing = c1 ^ c2;
9514
9515                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9516                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9517
9518                     scan = (char *) find_span_end_mask((U8 *) scan,
9519                                                        (U8 *) this_eol,
9520                                                        c1 & c1_c2_mask,
9521                                                        c1_c2_mask);
9522                 }
9523                 else {
9524                     while (    scan < this_eol
9525                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9526                     {
9527                         scan++;
9528                     }
9529                 }
9530             }
9531         }
9532         break;
9533     }
9534     case ANYOFPOSIXL:
9535     case ANYOFL:
9536         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9537
9538         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9539             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9540         }
9541         /* FALLTHROUGH */
9542     case ANYOFD:
9543     case ANYOF:
9544         if (utf8_target) {
9545             while (hardcount < max
9546                    && scan < this_eol
9547                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
9548             {
9549                 scan += UTF8SKIP(scan);
9550                 hardcount++;
9551             }
9552         }
9553         else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
9554             while (scan < this_eol
9555                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9556                 scan++;
9557         }
9558         else {
9559             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9560                 scan++;
9561         }
9562         break;
9563
9564     case ANYOFM:
9565         if (utf8_target && this_eol - scan > max) {
9566
9567             /* We didn't adjust <this_eol> at the beginning of this routine
9568              * because is UTF-8, but it is actually ok to do so, since here, to
9569              * match, 1 char == 1 byte. */
9570             this_eol = scan + max;
9571         }
9572
9573         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9574         break;
9575
9576     case NANYOFM:
9577         if (utf8_target) {
9578             while (     hardcount < max
9579                    &&   scan < this_eol
9580                    &&  (*scan & FLAGS(p)) != ARG(p))
9581             {
9582                 scan += UTF8SKIP(scan);
9583                 hardcount++;
9584             }
9585         }
9586         else {
9587             scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9588         }
9589         break;
9590
9591     case ANYOFH:
9592         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9593             while (  hardcount < max
9594                    && scan < this_eol
9595                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9596             {
9597                 scan += UTF8SKIP(scan);
9598                 hardcount++;
9599             }
9600         }
9601         break;
9602
9603     case ANYOFHb:
9604         if (utf8_target) {  /* ANYOFHb only can match UTF-8 targets */
9605
9606             /* we know the first byte must be the FLAGS field */
9607             while (   hardcount < max
9608                    && scan < this_eol
9609                    && (U8) *scan == ANYOF_FLAGS(p)
9610                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
9611                                                               TRUE))
9612             {
9613                 scan += UTF8SKIP(scan);
9614                 hardcount++;
9615             }
9616         }
9617         break;
9618
9619     /* The argument (FLAGS) to all the POSIX node types is the class number */
9620
9621     case NPOSIXL:
9622         to_complement = 1;
9623         /* FALLTHROUGH */
9624
9625     case POSIXL:
9626         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9627         if (! utf8_target) {
9628             while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9629                                                                    *scan)))
9630             {
9631                 scan++;
9632             }
9633         } else {
9634             while (hardcount < max && scan < this_eol
9635                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9636                                                                   (U8 *) scan,
9637                                                                   (U8 *) this_eol)))
9638             {
9639                 scan += UTF8SKIP(scan);
9640                 hardcount++;
9641             }
9642         }
9643         break;
9644
9645     case POSIXD:
9646         if (utf8_target) {
9647             goto utf8_posix;
9648         }
9649         /* FALLTHROUGH */
9650
9651     case POSIXA:
9652         if (utf8_target && this_eol - scan > max) {
9653
9654             /* We didn't adjust <this_eol> at the beginning of this routine
9655              * because is UTF-8, but it is actually ok to do so, since here, to
9656              * match, 1 char == 1 byte. */
9657             this_eol = scan + max;
9658         }
9659         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9660             scan++;
9661         }
9662         break;
9663
9664     case NPOSIXD:
9665         if (utf8_target) {
9666             to_complement = 1;
9667             goto utf8_posix;
9668         }
9669         /* FALLTHROUGH */
9670
9671     case NPOSIXA:
9672         if (! utf8_target) {
9673             while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9674                 scan++;
9675             }
9676         }
9677         else {
9678
9679             /* The complement of something that matches only ASCII matches all
9680              * non-ASCII, plus everything in ASCII that isn't in the class. */
9681             while (hardcount < max && scan < this_eol
9682                    && (   ! isASCII_utf8_safe(scan, loceol)
9683                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9684             {
9685                 scan += UTF8SKIP(scan);
9686                 hardcount++;
9687             }
9688         }
9689         break;
9690
9691     case NPOSIXU:
9692         to_complement = 1;
9693         /* FALLTHROUGH */
9694
9695     case POSIXU:
9696         if (! utf8_target) {
9697             while (scan < this_eol && to_complement
9698                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9699             {
9700                 scan++;
9701             }
9702         }
9703         else {
9704           utf8_posix:
9705             classnum = (_char_class_number) FLAGS(p);
9706             switch (classnum) {
9707                 default:
9708                     while (   hardcount < max && scan < this_eol
9709                            && to_complement ^ cBOOL(_invlist_contains_cp(
9710                                               PL_XPosix_ptrs[classnum],
9711                                               utf8_to_uvchr_buf((U8 *) scan,
9712                                                                 (U8 *) this_eol,
9713                                                                 NULL))))
9714                     {
9715                         scan += UTF8SKIP(scan);
9716                         hardcount++;
9717                     }
9718                     break;
9719
9720                     /* For the classes below, the knowledge of how to handle
9721                      * every code point is compiled in to Perl via a macro.
9722                      * This code is written for making the loops as tight as
9723                      * possible.  It could be refactored to save space instead.
9724                      * */
9725
9726                 case _CC_ENUM_SPACE:
9727                     while (hardcount < max
9728                            && scan < this_eol
9729                            && (to_complement
9730                                ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
9731                     {
9732                         scan += UTF8SKIP(scan);
9733                         hardcount++;
9734                     }
9735                     break;
9736                 case _CC_ENUM_BLANK:
9737                     while (hardcount < max
9738                            && scan < this_eol
9739                            && (to_complement
9740                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
9741                     {
9742                         scan += UTF8SKIP(scan);
9743                         hardcount++;
9744                     }
9745                     break;
9746                 case _CC_ENUM_XDIGIT:
9747                     while (hardcount < max
9748                            && scan < this_eol
9749                            && (to_complement
9750                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
9751                     {
9752                         scan += UTF8SKIP(scan);
9753                         hardcount++;
9754                     }
9755                     break;
9756                 case _CC_ENUM_VERTSPACE:
9757                     while (hardcount < max
9758                            && scan < this_eol
9759                            && (to_complement
9760                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
9761                     {
9762                         scan += UTF8SKIP(scan);
9763                         hardcount++;
9764                     }
9765                     break;
9766                 case _CC_ENUM_CNTRL:
9767                     while (hardcount < max
9768                            && scan < this_eol
9769                            && (to_complement
9770                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
9771                     {
9772                         scan += UTF8SKIP(scan);
9773                         hardcount++;
9774                     }
9775                     break;
9776             }
9777         }
9778         break;
9779
9780     case LNBREAK:
9781         if (utf8_target) {
9782             while (hardcount < max && scan < this_eol &&
9783                     (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
9784                 scan += c;
9785                 hardcount++;
9786             }
9787         } else {
9788             /* LNBREAK can match one or two latin chars, which is ok, but we
9789              * have to use hardcount in this situation, and throw away the
9790              * adjustment to <this_eol> done before the switch statement */
9791             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9792                 scan+=c;
9793                 hardcount++;
9794             }
9795         }
9796         break;
9797
9798     case BOUNDL:
9799     case NBOUNDL:
9800         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9801         /* FALLTHROUGH */
9802     case BOUND:
9803     case BOUNDA:
9804     case BOUNDU:
9805     case EOS:
9806     case GPOS:
9807     case KEEPS:
9808     case NBOUND:
9809     case NBOUNDA:
9810     case NBOUNDU:
9811     case OPFAIL:
9812     case SBOL:
9813     case SEOL:
9814         /* These are all 0 width, so match right here or not at all. */
9815         break;
9816
9817     default:
9818         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9819         NOT_REACHED; /* NOTREACHED */
9820
9821     }
9822
9823     if (hardcount)
9824         c = hardcount;
9825     else
9826         c = scan - *startposp;
9827     *startposp = scan;
9828
9829     DEBUG_r({
9830         GET_RE_DEBUG_FLAGS_DECL;
9831         DEBUG_EXECUTE_r({
9832             SV * const prop = sv_newmortal();
9833             regprop(prog, prop, p, reginfo, NULL);
9834             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9835                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9836         });
9837     });
9838
9839     return(c);
9840 }
9841
9842 /*
9843  - reginclass - determine if a character falls into a character class
9844
9845   n is the ANYOF-type regnode
9846   p is the target string
9847   p_end points to one byte beyond the end of the target string
9848   utf8_target tells whether p is in UTF-8.
9849
9850   Returns true if matched; false otherwise.
9851
9852   Note that this can be a synthetic start class, a combination of various
9853   nodes, so things you think might be mutually exclusive, such as locale,
9854   aren't.  It can match both locale and non-locale
9855
9856  */
9857
9858 STATIC bool
9859 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9860 {
9861     dVAR;
9862     const char flags = (OP(n) == ANYOFHb) ? 0 : ANYOF_FLAGS(n);
9863     bool match = FALSE;
9864     UV c = *p;
9865
9866     PERL_ARGS_ASSERT_REGINCLASS;
9867
9868     /* If c is not already the code point, get it.  Note that
9869      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9870     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9871         STRLEN c_len = 0;
9872         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9873         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9874         if (c_len == (STRLEN)-1) {
9875             _force_out_malformed_utf8_message(p, p_end,
9876                                               utf8n_flags,
9877                                               1 /* 1 means die */ );
9878             NOT_REACHED; /* NOTREACHED */
9879         }
9880         if (     c > 255
9881             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9882             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9883         {
9884             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9885         }
9886     }
9887
9888     /* If this character is potentially in the bitmap, check it */
9889     if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH && OP(n) != ANYOFHb) {
9890         if (ANYOF_BITMAP_TEST(n, c))
9891             match = TRUE;
9892         else if ((flags
9893                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9894                   && OP(n) == ANYOFD
9895                   && ! utf8_target
9896                   && ! isASCII(c))
9897         {
9898             match = TRUE;
9899         }
9900         else if (flags & ANYOF_LOCALE_FLAGS) {
9901             if (  (flags & ANYOFL_FOLD)
9902                 && c < sizeof(PL_fold_locale)
9903                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9904             {
9905                 match = TRUE;
9906             }
9907             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
9908                      && c <= U8_MAX  /* param to isFOO_lc() */
9909             ) {
9910
9911                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9912                  * if the class includes the Posix character class given by
9913                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9914                  * complemented Posix class given by int(bit/2).  So we loop
9915                  * through the bits, each time changing whether we complement
9916                  * the result or not.  Suppose for the sake of illustration
9917                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9918                  * is set, it means there is a match for this ANYOF node if the
9919                  * character is in the class given by the expression (0 / 2 = 0
9920                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9921                  * and since 'to_complement' is 0, the result will stay TRUE,
9922                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9923                  * bit 1 is 1.  That means there is a match if the character
9924                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9925                  * but will on bit 1.  On the second iteration 'to_complement'
9926                  * will be 1, so the exclusive or will reverse things, so we
9927                  * are testing for \W.  On the third iteration, 'to_complement'
9928                  * will be 0, and we would be testing for \s; the fourth
9929                  * iteration would test for \S, etc.
9930                  *
9931                  * Note that this code assumes that all the classes are closed
9932                  * under folding.  For example, if a character matches \w, then
9933                  * its fold does too; and vice versa.  This should be true for
9934                  * any well-behaved locale for all the currently defined Posix
9935                  * classes, except for :lower: and :upper:, which are handled
9936                  * by the pseudo-class :cased: which matches if either of the
9937                  * other two does.  To get rid of this assumption, an outer
9938                  * loop could be used below to iterate over both the source
9939                  * character, and its fold (if different) */
9940
9941                 int count = 0;
9942                 int to_complement = 0;
9943
9944                 while (count < ANYOF_MAX) {
9945                     if (ANYOF_POSIXL_TEST(n, count)
9946                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9947                     {
9948                         match = TRUE;
9949                         break;
9950                     }
9951                     count++;
9952                     to_complement ^= 1;
9953                 }
9954             }
9955         }
9956     }
9957
9958
9959     /* If the bitmap didn't (or couldn't) match, and something outside the
9960      * bitmap could match, try that. */
9961     if (!match) {
9962         if (c >= NUM_ANYOF_CODE_POINTS
9963             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9964         {
9965             match = TRUE;       /* Everything above the bitmap matches */
9966         }
9967             /* Here doesn't match everything above the bitmap.  If there is
9968              * some information available beyond the bitmap, we may find a
9969              * match in it.  If so, this is most likely because the code point
9970              * is outside the bitmap range.  But rarely, it could be because of
9971              * some other reason.  If so, various flags are set to indicate
9972              * this possibility.  On ANYOFD nodes, there may be matches that
9973              * happen only when the target string is UTF-8; or for other node
9974              * types, because runtime lookup is needed, regardless of the
9975              * UTF-8ness of the target string.  Finally, under /il, there may
9976              * be some matches only possible if the locale is a UTF-8 one. */
9977         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9978                  && (   c >= NUM_ANYOF_CODE_POINTS
9979                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9980                          && (   UNLIKELY(OP(n) != ANYOFD)
9981                              || (utf8_target && ! isASCII_uni(c)
9982 #                               if NUM_ANYOF_CODE_POINTS > 256
9983                                                                  && c < 256
9984 #                               endif
9985                                 )))
9986                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9987                          && IN_UTF8_CTYPE_LOCALE)))
9988         {
9989             SV* only_utf8_locale = NULL;
9990             SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE,
9991                                                    0, &only_utf8_locale, NULL);
9992             if (definition) {
9993                 U8 utf8_buffer[2];
9994                 U8 * utf8_p;
9995                 if (utf8_target) {
9996                     utf8_p = (U8 *) p;
9997                 } else { /* Convert to utf8 */
9998                     utf8_p = utf8_buffer;
9999                     append_utf8_from_native_byte(*p, &utf8_p);
10000                     utf8_p = utf8_buffer;
10001                 }
10002
10003                 /* Turkish locales have these hard-coded rules overriding
10004                  * normal ones */
10005                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
10006                     && isALPHA_FOLD_EQ(*p, 'i'))
10007                 {
10008                     if (*p == 'i') {
10009                         if (_invlist_contains_cp(definition,
10010                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10011                         {
10012                             match = TRUE;
10013                         }
10014                     }
10015                     else if (*p == 'I') {
10016                         if (_invlist_contains_cp(definition,
10017                                                 LATIN_SMALL_LETTER_DOTLESS_I))
10018                         {
10019                             match = TRUE;
10020                         }
10021                     }
10022                 }
10023                 else if (_invlist_contains_cp(definition, c)) {
10024                     match = TRUE;
10025                 }
10026             }
10027             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10028                 match = _invlist_contains_cp(only_utf8_locale, c);
10029             }
10030         }
10031
10032         /* In a Turkic locale under folding, hard-code the I i case pair
10033          * matches */
10034         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10035             && ! match
10036             &&   (flags & ANYOFL_FOLD)
10037             &&   utf8_target)
10038         {
10039             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10040                 if (ANYOF_BITMAP_TEST(n, 'i')) {
10041                     match = TRUE;
10042                 }
10043             }
10044             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10045                 if (ANYOF_BITMAP_TEST(n, 'I')) {
10046                     match = TRUE;
10047                 }
10048             }
10049         }
10050
10051         if (UNICODE_IS_SUPER(c)
10052             && (flags
10053                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10054             && OP(n) != ANYOFD
10055             && ckWARN_d(WARN_NON_UNICODE))
10056         {
10057             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10058                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10059         }
10060     }
10061
10062 #if ANYOF_INVERT != 1
10063     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10064      * use it, don't */
10065 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10066 #endif
10067
10068     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10069     return (flags & ANYOF_INVERT) ^ match;
10070 }
10071
10072 STATIC U8 *
10073 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10074 {
10075     /* return the position 'off' UTF-8 characters away from 's', forward if
10076      * 'off' >= 0, backwards if negative.  But don't go outside of position
10077      * 'lim', which better be < s  if off < 0 */
10078
10079     PERL_ARGS_ASSERT_REGHOP3;
10080
10081     if (off >= 0) {
10082         while (off-- && s < lim) {
10083             /* XXX could check well-formedness here */
10084             U8 *new_s = s + UTF8SKIP(s);
10085             if (new_s > lim) /* lim may be in the middle of a long character */
10086                 return s;
10087             s = new_s;
10088         }
10089     }
10090     else {
10091         while (off++ && s > lim) {
10092             s--;
10093             if (UTF8_IS_CONTINUED(*s)) {
10094                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10095                     s--;
10096                 if (! UTF8_IS_START(*s)) {
10097                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10098                 }
10099             }
10100             /* XXX could check well-formedness here */
10101         }
10102     }
10103     return s;
10104 }
10105
10106 STATIC U8 *
10107 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10108 {
10109     PERL_ARGS_ASSERT_REGHOP4;
10110
10111     if (off >= 0) {
10112         while (off-- && s < rlim) {
10113             /* XXX could check well-formedness here */
10114             s += UTF8SKIP(s);
10115         }
10116     }
10117     else {
10118         while (off++ && s > llim) {
10119             s--;
10120             if (UTF8_IS_CONTINUED(*s)) {
10121                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10122                     s--;
10123                 if (! UTF8_IS_START(*s)) {
10124                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10125                 }
10126             }
10127             /* XXX could check well-formedness here */
10128         }
10129     }
10130     return s;
10131 }
10132
10133 /* like reghop3, but returns NULL on overrun, rather than returning last
10134  * char pos */
10135
10136 STATIC U8 *
10137 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10138 {
10139     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10140
10141     if (off >= 0) {
10142         while (off-- && s < lim) {
10143             /* XXX could check well-formedness here */
10144             s += UTF8SKIP(s);
10145         }
10146         if (off >= 0)
10147             return NULL;
10148     }
10149     else {
10150         while (off++ && s > lim) {
10151             s--;
10152             if (UTF8_IS_CONTINUED(*s)) {
10153                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10154                     s--;
10155                 if (! UTF8_IS_START(*s)) {
10156                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10157                 }
10158             }
10159             /* XXX could check well-formedness here */
10160         }
10161         if (off <= 0)
10162             return NULL;
10163     }
10164     return s;
10165 }
10166
10167
10168 /* when executing a regex that may have (?{}), extra stuff needs setting
10169    up that will be visible to the called code, even before the current
10170    match has finished. In particular:
10171
10172    * $_ is localised to the SV currently being matched;
10173    * pos($_) is created if necessary, ready to be updated on each call-out
10174      to code;
10175    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10176      isn't set until the current pattern is successfully finished), so that
10177      $1 etc of the match-so-far can be seen;
10178    * save the old values of subbeg etc of the current regex, and  set then
10179      to the current string (again, this is normally only done at the end
10180      of execution)
10181 */
10182
10183 static void
10184 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10185 {
10186     MAGIC *mg;
10187     regexp *const rex = ReANY(reginfo->prog);
10188     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10189
10190     eval_state->rex = rex;
10191
10192     if (reginfo->sv) {
10193         /* Make $_ available to executed code. */
10194         if (reginfo->sv != DEFSV) {
10195             SAVE_DEFSV;
10196             DEFSV_set(reginfo->sv);
10197         }
10198
10199         if (!(mg = mg_find_mglob(reginfo->sv))) {
10200             /* prepare for quick setting of pos */
10201             mg = sv_magicext_mglob(reginfo->sv);
10202             mg->mg_len = -1;
10203         }
10204         eval_state->pos_magic = mg;
10205         eval_state->pos       = mg->mg_len;
10206         eval_state->pos_flags = mg->mg_flags;
10207     }
10208     else
10209         eval_state->pos_magic = NULL;
10210
10211     if (!PL_reg_curpm) {
10212         /* PL_reg_curpm is a fake PMOP that we can attach the current
10213          * regex to and point PL_curpm at, so that $1 et al are visible
10214          * within a /(?{})/. It's just allocated once per interpreter the
10215          * first time its needed */
10216         Newxz(PL_reg_curpm, 1, PMOP);
10217 #ifdef USE_ITHREADS
10218         {
10219             SV* const repointer = &PL_sv_undef;
10220             /* this regexp is also owned by the new PL_reg_curpm, which
10221                will try to free it.  */
10222             av_push(PL_regex_padav, repointer);
10223             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10224             PL_regex_pad = AvARRAY(PL_regex_padav);
10225         }
10226 #endif
10227     }
10228     SET_reg_curpm(reginfo->prog);
10229     eval_state->curpm = PL_curpm;
10230     PL_curpm_under = PL_curpm;
10231     PL_curpm = PL_reg_curpm;
10232     if (RXp_MATCH_COPIED(rex)) {
10233         /*  Here is a serious problem: we cannot rewrite subbeg,
10234             since it may be needed if this match fails.  Thus
10235             $` inside (?{}) could fail... */
10236         eval_state->subbeg     = rex->subbeg;
10237         eval_state->sublen     = rex->sublen;
10238         eval_state->suboffset  = rex->suboffset;
10239         eval_state->subcoffset = rex->subcoffset;
10240 #ifdef PERL_ANY_COW
10241         eval_state->saved_copy = rex->saved_copy;
10242 #endif
10243         RXp_MATCH_COPIED_off(rex);
10244     }
10245     else
10246         eval_state->subbeg = NULL;
10247     rex->subbeg = (char *)reginfo->strbeg;
10248     rex->suboffset = 0;
10249     rex->subcoffset = 0;
10250     rex->sublen = reginfo->strend - reginfo->strbeg;
10251 }
10252
10253
10254 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10255
10256 static void
10257 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10258 {
10259     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10260     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10261     regmatch_slab *s;
10262
10263     Safefree(aux->poscache);
10264
10265     if (eval_state) {
10266
10267         /* undo the effects of S_setup_eval_state() */
10268
10269         if (eval_state->subbeg) {
10270             regexp * const rex = eval_state->rex;
10271             rex->subbeg     = eval_state->subbeg;
10272             rex->sublen     = eval_state->sublen;
10273             rex->suboffset  = eval_state->suboffset;
10274             rex->subcoffset = eval_state->subcoffset;
10275 #ifdef PERL_ANY_COW
10276             rex->saved_copy = eval_state->saved_copy;
10277 #endif
10278             RXp_MATCH_COPIED_on(rex);
10279         }
10280         if (eval_state->pos_magic)
10281         {
10282             eval_state->pos_magic->mg_len = eval_state->pos;
10283             eval_state->pos_magic->mg_flags =
10284                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10285                | (eval_state->pos_flags & MGf_BYTES);
10286         }
10287
10288         PL_curpm = eval_state->curpm;
10289     }
10290
10291     PL_regmatch_state = aux->old_regmatch_state;
10292     PL_regmatch_slab  = aux->old_regmatch_slab;
10293
10294     /* free all slabs above current one - this must be the last action
10295      * of this function, as aux and eval_state are allocated within
10296      * slabs and may be freed here */
10297
10298     s = PL_regmatch_slab->next;
10299     if (s) {
10300         PL_regmatch_slab->next = NULL;
10301         while (s) {
10302             regmatch_slab * const osl = s;
10303             s = s->next;
10304             Safefree(osl);
10305         }
10306     }
10307 }
10308
10309
10310 STATIC void
10311 S_to_utf8_substr(pTHX_ regexp *prog)
10312 {
10313     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10314      * on the converted value */
10315
10316     int i = 1;
10317
10318     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10319
10320     do {
10321         if (prog->substrs->data[i].substr
10322             && !prog->substrs->data[i].utf8_substr) {
10323             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10324             prog->substrs->data[i].utf8_substr = sv;
10325             sv_utf8_upgrade(sv);
10326             if (SvVALID(prog->substrs->data[i].substr)) {
10327                 if (SvTAIL(prog->substrs->data[i].substr)) {
10328                     /* Trim the trailing \n that fbm_compile added last
10329                        time.  */
10330                     SvCUR_set(sv, SvCUR(sv) - 1);
10331                     /* Whilst this makes the SV technically "invalid" (as its
10332                        buffer is no longer followed by "\0") when fbm_compile()
10333                        adds the "\n" back, a "\0" is restored.  */
10334                     fbm_compile(sv, FBMcf_TAIL);
10335                 } else
10336                     fbm_compile(sv, 0);
10337             }
10338             if (prog->substrs->data[i].substr == prog->check_substr)
10339                 prog->check_utf8 = sv;
10340         }
10341     } while (i--);
10342 }
10343
10344 STATIC bool
10345 S_to_byte_substr(pTHX_ regexp *prog)
10346 {
10347     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10348      * on the converted value; returns FALSE if can't be converted. */
10349
10350     int i = 1;
10351
10352     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10353
10354     do {
10355         if (prog->substrs->data[i].utf8_substr
10356             && !prog->substrs->data[i].substr) {
10357             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10358             if (! sv_utf8_downgrade(sv, TRUE)) {
10359                 return FALSE;
10360             }
10361             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10362                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10363                     /* Trim the trailing \n that fbm_compile added last
10364                         time.  */
10365                     SvCUR_set(sv, SvCUR(sv) - 1);
10366                     fbm_compile(sv, FBMcf_TAIL);
10367                 } else
10368                     fbm_compile(sv, 0);
10369             }
10370             prog->substrs->data[i].substr = sv;
10371             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10372                 prog->check_substr = sv;
10373         }
10374     } while (i--);
10375
10376     return TRUE;
10377 }
10378
10379 #ifndef PERL_IN_XSUB_RE
10380
10381 bool
10382 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10383 {
10384     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10385      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10386      * the larger string bounded by 'strbeg' and 'strend'.
10387      *
10388      * 'cp' needs to be assigned (if not a future version of the Unicode
10389      * Standard could make it something that combines with adjacent characters,
10390      * so code using it would then break), and there has to be a GCB break
10391      * before and after the character. */
10392
10393     dVAR;
10394
10395     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10396     const U8 * prev_cp_start;
10397
10398     PERL_ARGS_ASSERT__IS_GRAPHEME;
10399
10400     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10401         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10402     {
10403         /* These are considered graphemes */
10404         return TRUE;
10405     }
10406
10407     /* Otherwise, unassigned code points are forbidden */
10408     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10409                                     _invlist_search(PL_Assigned_invlist, cp))))
10410     {
10411         return FALSE;
10412     }
10413
10414     cp_gcb_val = getGCB_VAL_CP(cp);
10415
10416     /* Find the GCB value of the previous code point in the input */
10417     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10418     if (UNLIKELY(prev_cp_start == s)) {
10419         prev_cp_gcb_val = GCB_EDGE;
10420     }
10421     else {
10422         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10423     }
10424
10425     /* And check that is a grapheme boundary */
10426     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10427                 TRUE /* is UTF-8 encoded */ ))
10428     {
10429         return FALSE;
10430     }
10431
10432     /* Similarly verify there is a break between the current character and the
10433      * following one */
10434     s += UTF8SKIP(s);
10435     if (s >= strend) {
10436         next_cp_gcb_val = GCB_EDGE;
10437     }
10438     else {
10439         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10440     }
10441
10442     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10443 }
10444
10445 /*
10446 =head1 Unicode Support
10447
10448 =for apidoc isSCRIPT_RUN
10449
10450 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10451 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10452 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10453 two degenerate cases given below, this function returns TRUE iff all code
10454 points in it come from any combination of three "scripts" given by the Unicode
10455 "Script Extensions" property: Common, Inherited, and possibly one other.
10456 Additionally all decimal digits must come from the same consecutive sequence of
10457 10.
10458
10459 For example, if all the characters in the sequence are Greek, or Common, or
10460 Inherited, this function will return TRUE, provided any decimal digits in it
10461 are from the same block of digits in Common.  (These are the ASCII digits
10462 "0".."9" and additionally a block for full width forms of these, and several
10463 others used in mathematical notation.)   For scripts (unlike Greek) that have
10464 their own digits defined this will accept either digits from that set or from
10465 one of the Common digit sets, but not a combination of the two.  Some scripts,
10466 such as Arabic, have more than one set of digits.  All digits must come from
10467 the same set for this function to return TRUE.
10468
10469 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10470 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10471 C<SCX_INVALID> if the function returns FALSE.
10472
10473 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10474 will be C<SCX_INVALID>.
10475
10476 If the sequence contains a single code point which is unassigned to a character
10477 in the version of Unicode being used, the function will return TRUE, and the
10478 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10479 in the input sequence will result in the function treating the input as not
10480 being a script run.
10481
10482 The returned script will be C<SCX_Inherited> iff all the code points in it are
10483 from the Inherited script.
10484
10485 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10486 it are from the Inherited or Common scripts.
10487
10488 =cut
10489
10490 */
10491
10492 bool
10493 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10494 {
10495     /* Basically, it looks at each character in the sequence to see if the
10496      * above conditions are met; if not it fails.  It uses an inversion map to
10497      * find the enum corresponding to the script of each character.  But this
10498      * is complicated by the fact that a few code points can be in any of
10499      * several scripts.  The data has been constructed so that there are
10500      * additional enum values (all negative) for these situations.  The
10501      * absolute value of those is an index into another table which contains
10502      * pointers to auxiliary tables for each such situation.  Each aux array
10503      * lists all the scripts for the given situation.  There is another,
10504      * parallel, table that gives the number of entries in each aux table.
10505      * These are all defined in charclass_invlists.h */
10506
10507     /* XXX Here are the additional things UTS 39 says could be done:
10508      *
10509      * Forbid sequences of the same nonspacing mark
10510      *
10511      * Check to see that all the characters are in the sets of exemplar
10512      * characters for at least one language in the Unicode Common Locale Data
10513      * Repository [CLDR]. */
10514
10515     dVAR;
10516
10517     /* Things that match /\d/u */
10518     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10519     UV * decimals_array = invlist_array(decimals_invlist);
10520
10521     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10522      * not currently known) */
10523     UV zero_of_run = 0;
10524
10525     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10526     SCX_enum script_of_char = SCX_INVALID;
10527
10528     /* If the script remains not fully determined from iteration to iteration,
10529      * this is the current intersection of the possiblities.  */
10530     SCX_enum * intersection = NULL;
10531     PERL_UINT_FAST8_T intersection_len = 0;
10532
10533     bool retval = TRUE;
10534     SCX_enum * ret_script = NULL;
10535
10536     assert(send >= s);
10537
10538     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10539
10540     /* All code points in 0..255 are either Common or Latin, so must be a
10541      * script run.  We can return immediately unless we need to know which
10542      * script it is. */
10543     if (! utf8_target && LIKELY(send > s)) {
10544         if (ret_script == NULL) {
10545             return TRUE;
10546         }
10547
10548         /* If any character is Latin, the run is Latin */
10549         while (s < send) {
10550             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10551                 *ret_script = SCX_Latin;
10552                 return TRUE;
10553             }
10554         }
10555
10556         /* Here, all are Common */
10557         *ret_script = SCX_Common;
10558         return TRUE;
10559     }
10560
10561     /* Look at each character in the sequence */
10562     while (s < send) {
10563         /* If the current character being examined is a digit, this is the code
10564          * point of the zero for its sequence of 10 */
10565         UV zero_of_char;
10566
10567         UV cp;
10568
10569         /* The code allows all scripts to use the ASCII digits.  This is
10570          * because they are in the Common script.  Hence any ASCII ones found
10571          * are ok, unless and until a digit from another set has already been
10572          * encountered.  digit ranges in Common are not similarly blessed) */
10573         if (UNLIKELY(isDIGIT(*s))) {
10574             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10575                 retval = FALSE;
10576                 break;
10577             }
10578             if (zero_of_run) {
10579                 if (zero_of_run != '0') {
10580                     retval = FALSE;
10581                     break;
10582                 }
10583             }
10584             else {
10585                 zero_of_run = '0';
10586             }
10587             s++;
10588             continue;
10589         }
10590
10591         /* Here, isn't an ASCII digit.  Find the code point of the character */
10592         if (! UTF8_IS_INVARIANT(*s)) {
10593             Size_t len;
10594             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10595             s += len;
10596         }
10597         else {
10598             cp = *(s++);
10599         }
10600
10601         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10602          * digit in that script.  We can skip the rest of this code for this
10603          * character. */
10604         if (UNLIKELY(   zero_of_run
10605                      && cp >= zero_of_run
10606                      && cp - zero_of_run <= 9))
10607         {
10608             continue;
10609         }
10610
10611         /* Find the character's script.  The correct values are hard-coded here
10612          * for small-enough code points. */
10613         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10614                                unlikely to change */
10615             if (       cp > 255
10616                 || (   isALPHA_L1(cp)
10617                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10618             {
10619                 script_of_char = SCX_Latin;
10620             }
10621             else {
10622                 script_of_char = SCX_Common;
10623             }
10624         }
10625         else {
10626             script_of_char = _Perl_SCX_invmap[
10627                                        _invlist_search(PL_SCX_invlist, cp)];
10628         }
10629
10630         /* We arbitrarily accept a single unassigned character, but not in
10631          * combination with anything else, and not a run of them. */
10632         if (   UNLIKELY(script_of_run == SCX_Unknown)
10633             || UNLIKELY(   script_of_run != SCX_INVALID
10634                         && script_of_char == SCX_Unknown))
10635         {
10636             retval = FALSE;
10637             break;
10638         }
10639
10640         /* For the first character, or the run is inherited, the run's script
10641          * is set to the char's */
10642         if (   UNLIKELY(script_of_run == SCX_INVALID)
10643             || UNLIKELY(script_of_run == SCX_Inherited))
10644         {
10645             script_of_run = script_of_char;
10646         }
10647
10648         /* For the character's script to be Unknown, it must be the first
10649          * character in the sequence (for otherwise a test above would have
10650          * prevented us from reaching here), and we have set the run's script
10651          * to it.  Nothing further to be done for this character */
10652         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10653             continue;
10654         }
10655
10656         /* We accept 'inherited' script characters currently even at the
10657          * beginning.  (We know that no characters in Inherited are digits, or
10658          * we'd have to check for that) */
10659         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10660             continue;
10661         }
10662
10663         /* If the run so far is Common, and the new character isn't, change the
10664          * run's script to that of this character */
10665         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10666             script_of_run = script_of_char;
10667         }
10668
10669         /* Now we can see if the script of the new character is the same as
10670          * that of the run */
10671         if (LIKELY(script_of_char == script_of_run)) {
10672             /* By far the most common case */
10673             goto scripts_match;
10674         }
10675
10676         /* Here, the script of the run isn't Common.  But characters in Common
10677          * match any script */
10678         if (script_of_char == SCX_Common) {
10679             goto scripts_match;
10680         }
10681
10682 #ifndef HAS_SCX_AUX_TABLES
10683
10684         /* Too early a Unicode version to have a code point belonging to more
10685          * than one script, so, if the scripts don't exactly match, fail */
10686         PERL_UNUSED_VAR(intersection_len);
10687         retval = FALSE;
10688         break;
10689
10690 #else
10691
10692         /* Here there is no exact match between the character's script and the
10693          * run's.  And we've handled the special cases of scripts Unknown,
10694          * Inherited, and Common.
10695          *
10696          * Negative script numbers signify that the value may be any of several
10697          * scripts, and we need to look at auxiliary information to make our
10698          * deterimination.  But if both are non-negative, we can fail now */
10699         if (LIKELY(script_of_char >= 0)) {
10700             const SCX_enum * search_in;
10701             PERL_UINT_FAST8_T search_in_len;
10702             PERL_UINT_FAST8_T i;
10703
10704             if (LIKELY(script_of_run >= 0)) {
10705                 retval = FALSE;
10706                 break;
10707             }
10708
10709             /* Use the previously constructed set of possible scripts, if any.
10710              * */
10711             if (intersection) {
10712                 search_in = intersection;
10713                 search_in_len = intersection_len;
10714             }
10715             else {
10716                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10717                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10718             }
10719
10720             for (i = 0; i < search_in_len; i++) {
10721                 if (search_in[i] == script_of_char) {
10722                     script_of_run = script_of_char;
10723                     goto scripts_match;
10724                 }
10725             }
10726
10727             retval = FALSE;
10728             break;
10729         }
10730         else if (LIKELY(script_of_run >= 0)) {
10731             /* script of character could be one of several, but run is a single
10732              * script */
10733             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10734             const PERL_UINT_FAST8_T search_in_len
10735                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10736             PERL_UINT_FAST8_T i;
10737
10738             for (i = 0; i < search_in_len; i++) {
10739                 if (search_in[i] == script_of_run) {
10740                     script_of_char = script_of_run;
10741                     goto scripts_match;
10742                 }
10743             }
10744
10745             retval = FALSE;
10746             break;
10747         }
10748         else {
10749             /* Both run and char could be in one of several scripts.  If the
10750              * intersection is empty, then this character isn't in this script
10751              * run.  Otherwise, we need to calculate the intersection to use
10752              * for future iterations of the loop, unless we are already at the
10753              * final character */
10754             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10755             const PERL_UINT_FAST8_T char_len
10756                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10757             const SCX_enum * search_run;
10758             PERL_UINT_FAST8_T run_len;
10759
10760             SCX_enum * new_overlap = NULL;
10761             PERL_UINT_FAST8_T i, j;
10762
10763             if (intersection) {
10764                 search_run = intersection;
10765                 run_len = intersection_len;
10766             }
10767             else {
10768                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10769                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10770             }
10771
10772             intersection_len = 0;
10773
10774             for (i = 0; i < run_len; i++) {
10775                 for (j = 0; j < char_len; j++) {
10776                     if (search_run[i] == search_char[j]) {
10777
10778                         /* Here, the script at i,j matches.  That means this
10779                          * character is in the run.  But continue on to find
10780                          * the complete intersection, for the next loop
10781                          * iteration, and for the digit check after it.
10782                          *
10783                          * On the first found common script, we malloc space
10784                          * for the intersection list for the worst case of the
10785                          * intersection, which is the minimum of the number of
10786                          * scripts remaining in each set. */
10787                         if (intersection_len == 0) {
10788                             Newx(new_overlap,
10789                                  MIN(run_len - i, char_len - j),
10790                                  SCX_enum);
10791                         }
10792                         new_overlap[intersection_len++] = search_run[i];
10793                     }
10794                 }
10795             }
10796
10797             /* Here we've looked through everything.  If they have no scripts
10798              * in common, not a run */
10799             if (intersection_len == 0) {
10800                 retval = FALSE;
10801                 break;
10802             }
10803
10804             /* If there is only a single script in common, set to that.
10805              * Otherwise, use the intersection going forward */
10806             Safefree(intersection);
10807             intersection = NULL;
10808             if (intersection_len == 1) {
10809                 script_of_run = script_of_char = new_overlap[0];
10810                 Safefree(new_overlap);
10811                 new_overlap = NULL;
10812             }
10813             else {
10814                 intersection = new_overlap;
10815             }
10816         }
10817
10818 #endif
10819
10820   scripts_match:
10821
10822         /* Here, the script of the character is compatible with that of the
10823          * run.  That means that in most cases, it continues the script run.
10824          * Either it and the run match exactly, or one or both can be in any of
10825          * several scripts, and the intersection is not empty.  However, if the
10826          * character is a decimal digit, it could still mean failure if it is
10827          * from the wrong sequence of 10.  So, we need to look at if it's a
10828          * digit.  We've already handled the 10 decimal digits, and the next
10829          * lowest one is this one: */
10830         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10831             continue;   /* Not a digit; this character is part of the run */
10832         }
10833
10834         /* If we have a definitive '0' for the script of this character, we
10835          * know that for this to be a digit, it must be in the range of +0..+9
10836          * of that zero. */
10837         if (   script_of_char >= 0
10838             && (zero_of_char = script_zeros[script_of_char]))
10839         {
10840             if (   cp < zero_of_char
10841                 || cp > zero_of_char + 9)
10842             {
10843                 continue;   /* Not a digit; this character is part of the run
10844                              */
10845             }
10846
10847         }
10848         else {  /* Need to look up if this character is a digit or not */
10849             SSize_t index_of_zero_of_char;
10850             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10851             if (     UNLIKELY(index_of_zero_of_char < 0)
10852                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10853             {
10854                 continue;   /* Not a digit; this character is part of the run.
10855                              */
10856             }
10857
10858             zero_of_char = decimals_array[index_of_zero_of_char];
10859         }
10860
10861         /* Here, the character is a decimal digit, and the zero of its sequence
10862          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
10863          * they better be the same. */
10864         if (zero_of_run) {
10865             if (zero_of_run != zero_of_char) {
10866                 retval = FALSE;
10867                 break;
10868             }
10869         }
10870         else {  /* Otherwise we now have a zero for this run */
10871             zero_of_run = zero_of_char;
10872         }
10873     } /* end of looping through CLOSESR text */
10874
10875     Safefree(intersection);
10876
10877     if (ret_script != NULL) {
10878         if (retval) {
10879             *ret_script = script_of_run;
10880         }
10881         else {
10882             *ret_script = SCX_INVALID;
10883         }
10884     }
10885
10886     return retval;
10887 }
10888
10889 #endif /* ifndef PERL_IN_XSUB_RE */
10890
10891 /*
10892  * ex: set ts=8 sts=4 sw=4 et:
10893  */