regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char non_utf8_target_but_utf8_required[]
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /*
 111  * Forwards.
 112  */
 113
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115
 116 #define HOPc(pos,off) \
 117         (char *)(reginfo->is_utf8_target \
 118             ? reghop3((U8*)pos, off, \
 119                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 120             : (U8*)(pos + off))
 121
 122 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 123 #define HOPBACK3(pos, off, lim) \
 124         (reginfo->is_utf8_target                          \
 125             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 126             : (pos - off >= lim)                                 \
 127                 ? (U8*)pos - off                                 \
 128                 : NULL)
 129
 130 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 131
 132 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 133 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 134
 135 /* lim must be +ve. Returns NULL on overshoot */
 136 #define HOPMAYBE3(pos,off,lim) \
 137         (reginfo->is_utf8_target                        \
 138             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 139             : ((U8*)pos + off <= lim)                   \
 140                 ? (U8*)pos + off                        \
 141                 : NULL)
 142
 143 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 144  * off must be >=0; args should be vars rather than expressions */
 145 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 146     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 147     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 148 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 149
 150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 152     : (U8*)(pos + off))
 153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 154
 155 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 156 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 157
 158 /* for use after a quantifier and before an EXACT-like node -- japhy */
 159 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 160  *
 161  * NOTE that *nothing* that affects backtracking should be in here, specifically
 162  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 163  * node that is in between two EXACT like nodes when ascertaining what the required
 164  * "follow" character is. This should probably be moved to regex compile time
 165  * although it may be done at run time beause of the REF possibility - more
 166  * investigation required. -- demerphq
 167 */
 168 #define JUMPABLE(rn) (                                                             \
 169     OP(rn) == OPEN ||                                                              \
 170     (OP(rn) == CLOSE &&                                                            \
 171      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 172     OP(rn) == EVAL ||                                                              \
 173     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 174     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 175     OP(rn) == KEEPS ||                                                             \
 176     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 177 )
 178 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 179
 180 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 181
 182 /*
 183   Search for mandatory following text node; for lookahead, the text must
 184   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 185 */
 186 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 187     while (JUMPABLE(rn)) { \
 188         const OPCODE type = OP(rn); \
 189         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 190             rn = NEXTOPER(NEXTOPER(rn)); \
 191         else if (type == PLUS) \
 192             rn = NEXTOPER(rn); \
 193         else if (type == IFMATCH) \
 194             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 195         else rn += NEXT_OFF(rn); \
 196     } \
 197 } STMT_END
 198
 199 #define SLAB_FIRST(s) (&(s)->states[0])
 200 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 201
 202 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 203 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 204 static regmatch_state * S_push_slab(pTHX);
 205
 206 #define REGCP_PAREN_ELEMS 3
 207 #define REGCP_OTHER_ELEMS 3
 208 #define REGCP_FRAME_ELEMS 1
 209 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 210  * are needed for the regexp context stack bookkeeping. */
 211
 212 STATIC CHECKPOINT
 213 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 214 {
 215     const int retval = PL_savestack_ix;
 216     const int paren_elems_to_push =
 217                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 218     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 219     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 220     I32 p;
 221     GET_RE_DEBUG_FLAGS_DECL;
 222
 223     PERL_ARGS_ASSERT_REGCPPUSH;
 224
 225     if (paren_elems_to_push < 0)
 226         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 227                    (int)paren_elems_to_push, (int)maxopenparen,
 228                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 229
 230     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 231         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 232                    " out of range (%lu-%ld)",
 233                    total_elems,
 234                    (unsigned long)maxopenparen,
 235                    (long)parenfloor);
 236
 237     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 238
 239     DEBUG_BUFFERS_r(
 240         if ((int)maxopenparen > (int)parenfloor)
 241             Perl_re_exec_indentf( aTHX_
 242                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 243                 depth,
 244                 PTR2UV(rex),
 245                 PTR2UV(rex->offs)
 246             );
 247     );
 248     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 249 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 250         SSPUSHIV(rex->offs[p].end);
 251         SSPUSHIV(rex->offs[p].start);
 252         SSPUSHINT(rex->offs[p].start_tmp);
 253         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 254             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 255             depth,
 256             (UV)p,
 257             (IV)rex->offs[p].start,
 258             (IV)rex->offs[p].start_tmp,
 259             (IV)rex->offs[p].end
 260         ));
 261     }
 262 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 263     SSPUSHINT(maxopenparen);
 264     SSPUSHINT(rex->lastparen);
 265     SSPUSHINT(rex->lastcloseparen);
 266     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 267
 268     return retval;
 269 }
 270
 271 /* These are needed since we do not localize EVAL nodes: */
 272 #define REGCP_SET(cp)                                           \
 273     DEBUG_STATE_r(                                              \
 274         Perl_re_exec_indentf( aTHX_                             \
 275             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 276             depth, (IV)PL_savestack_ix                          \
 277         )                                                       \
 278     );                                                          \
 279     cp = PL_savestack_ix
 280
 281 #define REGCP_UNWIND(cp)                                        \
 282     DEBUG_STATE_r(                                              \
 283         if (cp != PL_savestack_ix)                              \
 284             Perl_re_exec_indentf( aTHX_                         \
 285                 "Clearing an EVAL scope, savestack=%"           \
 286                 IVdf "..%" IVdf "\n",                           \
 287                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 288             )                                                   \
 289     );                                                          \
 290     regcpblow(cp)
 291
 292 /* set the start and end positions of capture ix */
 293 #define CLOSE_CAPTURE(ix, s, e)                                            \
 294     rex->offs[ix].start = s;                                               \
 295     rex->offs[ix].end = e;                                                 \
 296     if (ix > rex->lastparen)                                               \
 297         rex->lastparen = ix;                                               \
 298     rex->lastcloseparen = ix;                                              \
 299     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 300         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 301         depth,                                                             \
 302         PTR2UV(rex),                                                       \
 303         PTR2UV(rex->offs),                                                 \
 304         (UV)ix,                                                            \
 305         (IV)rex->offs[ix].start,                                           \
 306         (IV)rex->offs[ix].end,                                             \
 307         (UV)rex->lastparen                                                 \
 308     ))
 309
 310 #define UNWIND_PAREN(lp, lcp)               \
 311     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 312         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 313         depth,                              \
 314         PTR2UV(rex),                        \
 315         PTR2UV(rex->offs),                  \
 316         (UV)(lp),                           \
 317         (UV)(rex->lastparen),               \
 318         (UV)(lcp)                           \
 319     ));                                     \
 320     for (n = rex->lastparen; n > lp; n--)   \
 321         rex->offs[n].end = -1;              \
 322     rex->lastparen = n;                     \
 323     rex->lastcloseparen = lcp;
 324
 325
 326 STATIC void
 327 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 328 {
 329     UV i;
 330     U32 paren;
 331     GET_RE_DEBUG_FLAGS_DECL;
 332
 333     PERL_ARGS_ASSERT_REGCPPOP;
 334
 335     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 336     i = SSPOPUV;
 337     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 338     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 339     rex->lastcloseparen = SSPOPINT;
 340     rex->lastparen = SSPOPINT;
 341     *maxopenparen_p = SSPOPINT;
 342
 343     i -= REGCP_OTHER_ELEMS;
 344     /* Now restore the parentheses context. */
 345     DEBUG_BUFFERS_r(
 346         if (i || rex->lastparen + 1 <= rex->nparens)
 347             Perl_re_exec_indentf( aTHX_
 348                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 349                 depth,
 350                 PTR2UV(rex),
 351                 PTR2UV(rex->offs)
 352             );
 353     );
 354     paren = *maxopenparen_p;
 355     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 356         SSize_t tmps;
 357         rex->offs[paren].start_tmp = SSPOPINT;
 358         rex->offs[paren].start = SSPOPIV;
 359         tmps = SSPOPIV;
 360         if (paren <= rex->lastparen)
 361             rex->offs[paren].end = tmps;
 362         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 363             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 364             depth,
 365             (UV)paren,
 366             (IV)rex->offs[paren].start,
 367             (IV)rex->offs[paren].start_tmp,
 368             (IV)rex->offs[paren].end,
 369             (paren > rex->lastparen ? "(skipped)" : ""));
 370         );
 371         paren--;
 372     }
 373 #if 1
 374     /* It would seem that the similar code in regtry()
 375      * already takes care of this, and in fact it is in
 376      * a better location to since this code can #if 0-ed out
 377      * but the code in regtry() is needed or otherwise tests
 378      * requiring null fields (pat.t#187 and split.t#{13,14}
 379      * (as of patchlevel 7877)  will fail.  Then again,
 380      * this code seems to be necessary or otherwise
 381      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 382      * --jhi updated by dapm */
 383     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 384         if (i > *maxopenparen_p)
 385             rex->offs[i].start = -1;
 386         rex->offs[i].end = -1;
 387         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 388             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 389             depth,
 390             (UV)i,
 391             (i > *maxopenparen_p) ? "-1" : "  "
 392         ));
 393     }
 394 #endif
 395 }
 396
 397 /* restore the parens and associated vars at savestack position ix,
 398  * but without popping the stack */
 399
 400 STATIC void
 401 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 402 {
 403     I32 tmpix = PL_savestack_ix;
 404     PERL_ARGS_ASSERT_REGCP_RESTORE;
 405
 406     PL_savestack_ix = ix;
 407     regcppop(rex, maxopenparen_p);
 408     PL_savestack_ix = tmpix;
 409 }
 410
 411 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 412
 413 #ifndef PERL_IN_XSUB_RE
 414
 415 bool
 416 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 417 {
 418     /* Returns a boolean as to whether or not 'character' is a member of the
 419      * Posix character class given by 'classnum' that should be equivalent to a
 420      * value in the typedef '_char_class_number'.
 421      *
 422      * Ideally this could be replaced by a just an array of function pointers
 423      * to the C library functions that implement the macros this calls.
 424      * However, to compile, the precise function signatures are required, and
 425      * these may vary from platform to to platform.  To avoid having to figure
 426      * out what those all are on each platform, I (khw) am using this method,
 427      * which adds an extra layer of function call overhead (unless the C
 428      * optimizer strips it away).  But we don't particularly care about
 429      * performance with locales anyway. */
 430
 431     switch ((_char_class_number) classnum) {
 432         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 433         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 434         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 435         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 436         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 437                                         || isUPPER_LC(character);
 438         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 439         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 440         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 441         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 442         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 443         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 444         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 445         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 446         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 447         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 448         default:    /* VERTSPACE should never occur in locales */
 449             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 450     }
 451
 452     NOT_REACHED; /* NOTREACHED */
 453     return FALSE;
 454 }
 455
 456 #endif
 457
 458 PERL_STATIC_INLINE I32
 459 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
 460 {
 461     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
 462      * folded.  Works on all folds representable without UTF-8, except for
 463      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
 464      * check that the strings each have at least 'len' characters.
 465      *
 466      * There is almost an identical API function where s2 need not be folded:
 467      * Perl_foldEQ_latin1() */
 468
 469     const U8 *a = (const U8 *)s1;
 470     const U8 *b = (const U8 *)s2;
 471
 472     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
 473
 474     assert(len >= 0);
 475
 476     while (len--) {
 477         assert(! isUPPER_L1(*b));
 478         if (toLOWER_L1(*a) != *b) {
 479             return 0;
 480         }
 481         a++, b++;
 482     }
 483     return 1;
 484 }
 485
 486 STATIC bool
 487 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 488 {
 489     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 490      * 'character' is a member of the Posix character class given by 'classnum'
 491      * that should be equivalent to a value in the typedef
 492      * '_char_class_number'.
 493      *
 494      * This just calls isFOO_lc on the code point for the character if it is in
 495      * the range 0-255.  Outside that range, all characters use Unicode
 496      * rules, ignoring any locale.  So use the Unicode function if this class
 497      * requires an inversion list, and use the Unicode macro otherwise. */
 498
 499     dVAR;
 500
 501     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 502
 503     if (UTF8_IS_INVARIANT(*character)) {
 504         return isFOO_lc(classnum, *character);
 505     }
 506     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 507         return isFOO_lc(classnum,
 508                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 509     }
 510
 511     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 512
 513     switch ((_char_class_number) classnum) {
 514         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 515         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 516         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 517         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 518         default:
 519             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 520                                         utf8_to_uvchr_buf(character, e, NULL));
 521     }
 522
 523     return FALSE; /* Things like CNTRL are always below 256 */
 524 }
 525
 526 STATIC U8 *
 527 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 528 {
 529     /* Returns the position of the first byte in the sequence between 's' and
 530      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 531      * */
 532
 533     PERL_ARGS_ASSERT_FIND_SPAN_END;
 534
 535     assert(send >= s);
 536
 537     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 538                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 539                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 540     {
 541         PERL_UINTMAX_T span_word;
 542
 543         /* Process per-byte until reach word boundary.  XXX This loop could be
 544          * eliminated if we knew that this platform had fast unaligned reads */
 545         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 546             if (*s != span_byte) {
 547                 return s;
 548             }
 549             s++;
 550         }
 551
 552         /* Create a word filled with the bytes we are spanning */
 553         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 554
 555         /* Process per-word as long as we have at least a full word left */
 556         do {
 557
 558             /* Keep going if the whole word is composed of 'span_byte's */
 559             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 560                 s += PERL_WORDSIZE;
 561                 continue;
 562             }
 563
 564             /* Here, at least one byte in the word isn't 'span_byte'. */
 565
 566 #ifdef EBCDIC
 567
 568             break;
 569
 570 #else
 571
 572             /* This xor leaves 1 bits only in those non-matching bytes */
 573             span_word ^= * (PERL_UINTMAX_T *) s;
 574
 575             /* Make sure the upper bit of each non-matching byte is set.  This
 576              * makes each such byte look like an ASCII platform variant byte */
 577             span_word |= span_word << 1;
 578             span_word |= span_word << 2;
 579             span_word |= span_word << 4;
 580
 581             /* That reduces the problem to what this function solves */
 582             return s + _variant_byte_number(span_word);
 583
 584 #endif
 585
 586         } while (s + PERL_WORDSIZE <= send);
 587     }
 588
 589     /* Process the straggler bytes beyond the final word boundary */
 590     while (s < send) {
 591         if (*s != span_byte) {
 592             return s;
 593         }
 594         s++;
 595     }
 596
 597     return s;
 598 }
 599
 600 STATIC U8 *
 601 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 602 {
 603     /* Returns the position of the first byte in the sequence between 's'
 604      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 605      * returns 'send' if none found.  It uses word-level operations instead of
 606      * byte to speed up the process */
 607
 608     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 609
 610     assert(send >= s);
 611     assert((byte & mask) == byte);
 612
 613 #ifndef EBCDIC
 614
 615     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 616                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 617                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 618     {
 619         PERL_UINTMAX_T word, mask_word;
 620
 621         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 622             if (((*s) & mask) == byte) {
 623                 return s;
 624             }
 625             s++;
 626         }
 627
 628         word      = PERL_COUNT_MULTIPLIER * byte;
 629         mask_word = PERL_COUNT_MULTIPLIER * mask;
 630
 631         do {
 632             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 633
 634             /* If 'masked' contains bytes with the bit pattern of 'byte' within
 635              * it, xoring with 'word' will leave each of the 8 bits in such
 636              * bytes be 0, and no byte containing any other bit pattern will be
 637              * 0. */
 638             masked ^= word;
 639
 640             /* This causes the most significant bit to be set to 1 for any
 641              * bytes in the word that aren't completely 0 */
 642             masked |= masked << 1;
 643             masked |= masked << 2;
 644             masked |= masked << 4;
 645
 646             /* The msbits are the same as what marks a byte as variant, so we
 647              * can use this mask.  If all msbits are 1, the word doesn't
 648              * contain 'byte' */
 649             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 650                 s += PERL_WORDSIZE;
 651                 continue;
 652             }
 653
 654             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 655              * and any that are, are 0.  Complement and re-AND to swap that */
 656             masked = ~ masked;
 657             masked &= PERL_VARIANTS_WORD_MASK;
 658
 659             /* This reduces the problem to that solved by this function */
 660             s += _variant_byte_number(masked);
 661             return s;
 662
 663         } while (s + PERL_WORDSIZE <= send);
 664     }
 665
 666 #endif
 667
 668     while (s < send) {
 669         if (((*s) & mask) == byte) {
 670             return s;
 671         }
 672         s++;
 673     }
 674
 675     return s;
 676 }
 677
 678 STATIC U8 *
 679 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 680 {
 681     /* Returns the position of the first byte in the sequence between 's' and
 682      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 683      * 'span_byte' should have been ANDed with 'mask' in the call of this
 684      * function.  Returns 'send' if none found.  Works like find_span_end(),
 685      * except for the AND */
 686
 687     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 688
 689     assert(send >= s);
 690     assert((span_byte & mask) == span_byte);
 691
 692     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 693                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 694                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 695     {
 696         PERL_UINTMAX_T span_word, mask_word;
 697
 698         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 699             if (((*s) & mask) != span_byte) {
 700                 return s;
 701             }
 702             s++;
 703         }
 704
 705         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 706         mask_word = PERL_COUNT_MULTIPLIER * mask;
 707
 708         do {
 709             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 710
 711             if (masked == span_word) {
 712                 s += PERL_WORDSIZE;
 713                 continue;
 714             }
 715
 716 #ifdef EBCDIC
 717
 718             break;
 719
 720 #else
 721
 722             masked ^= span_word;
 723             masked |= masked << 1;
 724             masked |= masked << 2;
 725             masked |= masked << 4;
 726             return s + _variant_byte_number(masked);
 727
 728 #endif
 729
 730         } while (s + PERL_WORDSIZE <= send);
 731     }
 732
 733     while (s < send) {
 734         if (((*s) & mask) != span_byte) {
 735             return s;
 736         }
 737         s++;
 738     }
 739
 740     return s;
 741 }
 742
 743 /*
 744  * pregexec and friends
 745  */
 746
 747 #ifndef PERL_IN_XSUB_RE
 748 /*
 749  - pregexec - match a regexp against a string
 750  */
 751 I32
 752 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 753          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 754 /* stringarg: the point in the string at which to begin matching */
 755 /* strend:    pointer to null at end of string */
 756 /* strbeg:    real beginning of string */
 757 /* minend:    end of match must be >= minend bytes after stringarg. */
 758 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 759  *            itself is accessed via the pointers above */
 760 /* nosave:    For optimizations. */
 761 {
 762     PERL_ARGS_ASSERT_PREGEXEC;
 763
 764     return
 765         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 766                       nosave ? 0 : REXEC_COPY_STR);
 767 }
 768 #endif
 769
 770
 771
 772 /* re_intuit_start():
 773  *
 774  * Based on some optimiser hints, try to find the earliest position in the
 775  * string where the regex could match.
 776  *
 777  *   rx:     the regex to match against
 778  *   sv:     the SV being matched: only used for utf8 flag; the string
 779  *           itself is accessed via the pointers below. Note that on
 780  *           something like an overloaded SV, SvPOK(sv) may be false
 781  *           and the string pointers may point to something unrelated to
 782  *           the SV itself.
 783  *   strbeg: real beginning of string
 784  *   strpos: the point in the string at which to begin matching
 785  *   strend: pointer to the byte following the last char of the string
 786  *   flags   currently unused; set to 0
 787  *   data:   currently unused; set to NULL
 788  *
 789  * The basic idea of re_intuit_start() is to use some known information
 790  * about the pattern, namely:
 791  *
 792  *   a) the longest known anchored substring (i.e. one that's at a
 793  *      constant offset from the beginning of the pattern; but not
 794  *      necessarily at a fixed offset from the beginning of the
 795  *      string);
 796  *   b) the longest floating substring (i.e. one that's not at a constant
 797  *      offset from the beginning of the pattern);
 798  *   c) Whether the pattern is anchored to the string; either
 799  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 800  *      or anchored to pos(): /\G/;
 801  *   d) A start class: a real or synthetic character class which
 802  *      represents which characters are legal at the start of the pattern;
 803  *
 804  * to either quickly reject the match, or to find the earliest position
 805  * within the string at which the pattern might match, thus avoiding
 806  * running the full NFA engine at those earlier locations, only to
 807  * eventually fail and retry further along.
 808  *
 809  * Returns NULL if the pattern can't match, or returns the address within
 810  * the string which is the earliest place the match could occur.
 811  *
 812  * The longest of the anchored and floating substrings is called 'check'
 813  * and is checked first. The other is called 'other' and is checked
 814  * second. The 'other' substring may not be present.  For example,
 815  *
 816  *    /(abc|xyz)ABC\d{0,3}DEFG/
 817  *
 818  * will have
 819  *
 820  *   check substr (float)    = "DEFG", offset 6..9 chars
 821  *   other substr (anchored) = "ABC",  offset 3..3 chars
 822  *   stclass = [ax]
 823  *
 824  * Be aware that during the course of this function, sometimes 'anchored'
 825  * refers to a substring being anchored relative to the start of the
 826  * pattern, and sometimes to the pattern itself being anchored relative to
 827  * the string. For example:
 828  *
 829  *   /\dabc/:   "abc" is anchored to the pattern;
 830  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 831  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 832  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 833  *                    but the pattern is anchored to the string.
 834  */
 835
 836 char *
 837 Perl_re_intuit_start(pTHX_
 838                     REGEXP * const rx,
 839                     SV *sv,
 840                     const char * const strbeg,
 841                     char *strpos,
 842                     char *strend,
 843                     const U32 flags,
 844                     re_scream_pos_data *data)
 845 {
 846     struct regexp *const prog = ReANY(rx);
 847     SSize_t start_shift = prog->check_offset_min;
 848     /* Should be nonnegative! */
 849     SSize_t end_shift   = 0;
 850     /* current lowest pos in string where the regex can start matching */
 851     char *rx_origin = strpos;
 852     SV *check;
 853     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 854     U8   other_ix = 1 - prog->substrs->check_ix;
 855     bool ml_anch = 0;
 856     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 857     char *check_at = NULL;              /* check substr found at this pos */
 858     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 859     RXi_GET_DECL(prog,progi);
 860     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 861     regmatch_info *const reginfo = &reginfo_buf;
 862     GET_RE_DEBUG_FLAGS_DECL;
 863
 864     PERL_ARGS_ASSERT_RE_INTUIT_START;
 865     PERL_UNUSED_ARG(flags);
 866     PERL_UNUSED_ARG(data);
 867
 868     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 869                 "Intuit: trying to determine minimum start position...\n"));
 870
 871     /* for now, assume that all substr offsets are positive. If at some point
 872      * in the future someone wants to do clever things with lookbehind and
 873      * -ve offsets, they'll need to fix up any code in this function
 874      * which uses these offsets. See the thread beginning
 875      * <20140113145929.GF27210@iabyn.com>
 876      */
 877     assert(prog->substrs->data[0].min_offset >= 0);
 878     assert(prog->substrs->data[0].max_offset >= 0);
 879     assert(prog->substrs->data[1].min_offset >= 0);
 880     assert(prog->substrs->data[1].max_offset >= 0);
 881     assert(prog->substrs->data[2].min_offset >= 0);
 882     assert(prog->substrs->data[2].max_offset >= 0);
 883
 884     /* for now, assume that if both present, that the floating substring
 885      * doesn't start before the anchored substring.
 886      * If you break this assumption (e.g. doing better optimisations
 887      * with lookahead/behind), then you'll need to audit the code in this
 888      * function carefully first
 889      */
 890     assert(
 891             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 892               && (prog->float_utf8    || prog->float_substr))
 893            || (prog->float_min_offset >= prog->anchored_offset));
 894
 895     /* byte rather than char calculation for efficiency. It fails
 896      * to quickly reject some cases that can't match, but will reject
 897      * them later after doing full char arithmetic */
 898     if (prog->minlen > strend - strpos) {
 899         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 900                               "  String too short...\n"));
 901         goto fail;
 902     }
 903
 904     RXp_MATCH_UTF8_set(prog, utf8_target);
 905     reginfo->is_utf8_target = cBOOL(utf8_target);
 906     reginfo->info_aux = NULL;
 907     reginfo->strbeg = strbeg;
 908     reginfo->strend = strend;
 909     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 910     reginfo->intuit = 1;
 911     /* not actually used within intuit, but zero for safety anyway */
 912     reginfo->poscache_maxiter = 0;
 913
 914     if (utf8_target) {
 915         if ((!prog->anchored_utf8 && prog->anchored_substr)
 916                 || (!prog->float_utf8 && prog->float_substr))
 917             to_utf8_substr(prog);
 918         check = prog->check_utf8;
 919     } else {
 920         if (!prog->check_substr && prog->check_utf8) {
 921             if (! to_byte_substr(prog)) {
 922                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 923             }
 924         }
 925         check = prog->check_substr;
 926     }
 927
 928     /* dump the various substring data */
 929     DEBUG_OPTIMISE_MORE_r({
 930         int i;
 931         for (i=0; i<=2; i++) {
 932             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 933                                   : prog->substrs->data[i].substr);
 934             if (!sv)
 935                 continue;
 936
 937             Perl_re_printf( aTHX_
 938                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 939                 " useful=%" IVdf " utf8=%d [%s]\n",
 940                 i,
 941                 (IV)prog->substrs->data[i].min_offset,
 942                 (IV)prog->substrs->data[i].max_offset,
 943                 (IV)prog->substrs->data[i].end_shift,
 944                 BmUSEFUL(sv),
 945                 utf8_target ? 1 : 0,
 946                 SvPEEK(sv));
 947         }
 948     });
 949
 950     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 951
 952         /* ml_anch: check after \n?
 953          *
 954          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 955          * with /.*.../, these flags will have been added by the
 956          * compiler:
 957          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 958          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 959          */
 960         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 961                    && !(prog->intflags & PREGf_IMPLICIT);
 962
 963         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 964             /* we are only allowed to match at BOS or \G */
 965
 966             /* trivially reject if there's a BOS anchor and we're not at BOS.
 967              *
 968              * Note that we don't try to do a similar quick reject for
 969              * \G, since generally the caller will have calculated strpos
 970              * based on pos() and gofs, so the string is already correctly
 971              * anchored by definition; and handling the exceptions would
 972              * be too fiddly (e.g. REXEC_IGNOREPOS).
 973              */
 974             if (   strpos != strbeg
 975                 && (prog->intflags & PREGf_ANCH_SBOL))
 976             {
 977                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 978                                 "  Not at start...\n"));
 979                 goto fail;
 980             }
 981
 982             /* in the presence of an anchor, the anchored (relative to the
 983              * start of the regex) substr must also be anchored relative
 984              * to strpos. So quickly reject if substr isn't found there.
 985              * This works for \G too, because the caller will already have
 986              * subtracted gofs from pos, and gofs is the offset from the
 987              * \G to the start of the regex. For example, in /.abc\Gdef/,
 988              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 989              * caller will have set strpos=pos()-4; we look for the substr
 990              * at position pos()-4+1, which lines up with the "a" */
 991
 992             if (prog->check_offset_min == prog->check_offset_max) {
 993                 /* Substring at constant offset from beg-of-str... */
 994                 SSize_t slen = SvCUR(check);
 995                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 996
 997                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 998                     "  Looking for check substr at fixed offset %" IVdf "...\n",
 999                     (IV)prog->check_offset_min));
1000
1001                 if (SvTAIL(check)) {
1002                     /* In this case, the regex is anchored at the end too.
1003                      * Unless it's a multiline match, the lengths must match
1004                      * exactly, give or take a \n.  NB: slen >= 1 since
1005                      * the last char of check is \n */
1006                     if (!multiline
1007                         && (   strend - s > slen
1008                             || strend - s < slen - 1
1009                             || (strend - s == slen && strend[-1] != '\n')))
1010                     {
1011                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1012                                             "  String too long...\n"));
1013                         goto fail_finish;
1014                     }
1015                     /* Now should match s[0..slen-2] */
1016                     slen--;
1017                 }
1018                 if (slen && (strend - s < slen
1019                     || *SvPVX_const(check) != *s
1020                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1021                 {
1022                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1023                                     "  String not equal...\n"));
1024                     goto fail_finish;
1025                 }
1026
1027                 check_at = s;
1028                 goto success_at_start;
1029             }
1030         }
1031     }
1032
1033     end_shift = prog->check_end_shift;
1034
1035 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1036     if (end_shift < 0)
1037         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1038                    (IV)end_shift, RX_PRECOMP(rx));
1039 #endif
1040
1041   restart:
1042
1043     /* This is the (re)entry point of the main loop in this function.
1044      * The goal of this loop is to:
1045      * 1) find the "check" substring in the region rx_origin..strend
1046      *    (adjusted by start_shift / end_shift). If not found, reject
1047      *    immediately.
1048      * 2) If it exists, look for the "other" substr too if defined; for
1049      *    example, if the check substr maps to the anchored substr, then
1050      *    check the floating substr, and vice-versa. If not found, go
1051      *    back to (1) with rx_origin suitably incremented.
1052      * 3) If we find an rx_origin position that doesn't contradict
1053      *    either of the substrings, then check the possible additional
1054      *    constraints on rx_origin of /^.../m or a known start class.
1055      *    If these fail, then depending on which constraints fail, jump
1056      *    back to here, or to various other re-entry points further along
1057      *    that skip some of the first steps.
1058      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1059      *    substring. If the start position was determined to be at the
1060      *    beginning of the string  - so, not rejected, but not optimised,
1061      *    since we have to run regmatch from position 0 - decrement the
1062      *    BmUSEFUL() count. Otherwise increment it.
1063      */
1064
1065
1066     /* first, look for the 'check' substring */
1067
1068     {
1069         U8* start_point;
1070         U8* end_point;
1071
1072         DEBUG_OPTIMISE_MORE_r({
1073             Perl_re_printf( aTHX_
1074                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1075                 " Start shift: %" IVdf " End shift %" IVdf
1076                 " Real end Shift: %" IVdf "\n",
1077                 (IV)(rx_origin - strbeg),
1078                 (IV)prog->check_offset_min,
1079                 (IV)start_shift,
1080                 (IV)end_shift,
1081                 (IV)prog->check_end_shift);
1082         });
1083
1084         end_point = HOPBACK3(strend, end_shift, rx_origin);
1085         if (!end_point)
1086             goto fail_finish;
1087         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1088         if (!start_point)
1089             goto fail_finish;
1090
1091
1092         /* If the regex is absolutely anchored to either the start of the
1093          * string (SBOL) or to pos() (ANCH_GPOS), then
1094          * check_offset_max represents an upper bound on the string where
1095          * the substr could start. For the ANCH_GPOS case, we assume that
1096          * the caller of intuit will have already set strpos to
1097          * pos()-gofs, so in this case strpos + offset_max will still be
1098          * an upper bound on the substr.
1099          */
1100         if (!ml_anch
1101             && prog->intflags & PREGf_ANCH
1102             && prog->check_offset_max != SSize_t_MAX)
1103         {
1104             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1105             const char * const anchor =
1106                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1107             SSize_t targ_len = (char*)end_point - anchor;
1108
1109             if (check_len > targ_len) {
1110                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1111                               "Target string too short to match required substring...\n"));
1112                 goto fail_finish;
1113             }
1114
1115             /* do a bytes rather than chars comparison. It's conservative;
1116              * so it skips doing the HOP if the result can't possibly end
1117              * up earlier than the old value of end_point.
1118              */
1119             assert(anchor + check_len <= (char *)end_point);
1120             if (prog->check_offset_max + check_len < targ_len) {
1121                 end_point = HOP3lim((U8*)anchor,
1122                                 prog->check_offset_max,
1123                                 end_point - check_len
1124                             )
1125                             + check_len;
1126                 if (end_point < start_point)
1127                     goto fail_finish;
1128             }
1129         }
1130
1131         check_at = fbm_instr( start_point, end_point,
1132                       check, multiline ? FBMrf_MULTILINE : 0);
1133
1134         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1135             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1136             (IV)((char*)start_point - strbeg),
1137             (IV)((char*)end_point   - strbeg),
1138             (IV)(check_at ? check_at - strbeg : -1)
1139         ));
1140
1141         /* Update the count-of-usability, remove useless subpatterns,
1142             unshift s.  */
1143
1144         DEBUG_EXECUTE_r({
1145             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1146                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1147             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1148                               (check_at ? "Found" : "Did not find"),
1149                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1150                     ? "anchored" : "floating"),
1151                 quoted,
1152                 RE_SV_TAIL(check),
1153                 (check_at ? " at offset " : "...\n") );
1154         });
1155
1156         if (!check_at)
1157             goto fail_finish;
1158         /* set rx_origin to the minimum position where the regex could start
1159          * matching, given the constraint of the just-matched check substring.
1160          * But don't set it lower than previously.
1161          */
1162
1163         if (check_at - rx_origin > prog->check_offset_max)
1164             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1165         /* Finish the diagnostic message */
1166         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1167             "%ld (rx_origin now %" IVdf ")...\n",
1168             (long)(check_at - strbeg),
1169             (IV)(rx_origin - strbeg)
1170         ));
1171     }
1172
1173
1174     /* now look for the 'other' substring if defined */
1175
1176     if (prog->substrs->data[other_ix].utf8_substr
1177         || prog->substrs->data[other_ix].substr)
1178     {
1179         /* Take into account the "other" substring. */
1180         char *last, *last1;
1181         char *s;
1182         SV* must;
1183         struct reg_substr_datum *other;
1184
1185       do_other_substr:
1186         other = &prog->substrs->data[other_ix];
1187         if (!utf8_target && !other->substr) {
1188             if (!to_byte_substr(prog)) {
1189                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1190             }
1191         }
1192
1193         /* if "other" is anchored:
1194          * we've previously found a floating substr starting at check_at.
1195          * This means that the regex origin must lie somewhere
1196          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1197          * and max:                 HOP3(check_at, -check_offset_min)
1198          * (except that min will be >= strpos)
1199          * So the fixed  substr must lie somewhere between
1200          *  HOP3(min, anchored_offset)
1201          *  HOP3(max, anchored_offset) + SvCUR(substr)
1202          */
1203
1204         /* if "other" is floating
1205          * Calculate last1, the absolute latest point where the
1206          * floating substr could start in the string, ignoring any
1207          * constraints from the earlier fixed match. It is calculated
1208          * as follows:
1209          *
1210          * strend - prog->minlen (in chars) is the absolute latest
1211          * position within the string where the origin of the regex
1212          * could appear. The latest start point for the floating
1213          * substr is float_min_offset(*) on from the start of the
1214          * regex.  last1 simply combines thee two offsets.
1215          *
1216          * (*) You might think the latest start point should be
1217          * float_max_offset from the regex origin, and technically
1218          * you'd be correct. However, consider
1219          *    /a\d{2,4}bcd\w/
1220          * Here, float min, max are 3,5 and minlen is 7.
1221          * This can match either
1222          *    /a\d\dbcd\w/
1223          *    /a\d\d\dbcd\w/
1224          *    /a\d\d\d\dbcd\w/
1225          * In the first case, the regex matches minlen chars; in the
1226          * second, minlen+1, in the third, minlen+2.
1227          * In the first case, the floating offset is 3 (which equals
1228          * float_min), in the second, 4, and in the third, 5 (which
1229          * equals float_max). In all cases, the floating string bcd
1230          * can never start more than 4 chars from the end of the
1231          * string, which equals minlen - float_min. As the substring
1232          * starts to match more than float_min from the start of the
1233          * regex, it makes the regex match more than minlen chars,
1234          * and the two cancel each other out. So we can always use
1235          * float_min - minlen, rather than float_max - minlen for the
1236          * latest position in the string.
1237          *
1238          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1239          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1240          */
1241
1242         assert(prog->minlen >= other->min_offset);
1243         last1 = HOP3c(strend,
1244                         other->min_offset - prog->minlen, strbeg);
1245
1246         if (other_ix) {/* i.e. if (other-is-float) */
1247             /* last is the latest point where the floating substr could
1248              * start, *given* any constraints from the earlier fixed
1249              * match. This constraint is that the floating string starts
1250              * <= float_max_offset chars from the regex origin (rx_origin).
1251              * If this value is less than last1, use it instead.
1252              */
1253             assert(rx_origin <= last1);
1254             last =
1255                 /* this condition handles the offset==infinity case, and
1256                  * is a short-cut otherwise. Although it's comparing a
1257                  * byte offset to a char length, it does so in a safe way,
1258                  * since 1 char always occupies 1 or more bytes,
1259                  * so if a string range is  (last1 - rx_origin) bytes,
1260                  * it will be less than or equal to  (last1 - rx_origin)
1261                  * chars; meaning it errs towards doing the accurate HOP3
1262                  * rather than just using last1 as a short-cut */
1263                 (last1 - rx_origin) < other->max_offset
1264                     ? last1
1265                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1266         }
1267         else {
1268             assert(strpos + start_shift <= check_at);
1269             last = HOP4c(check_at, other->min_offset - start_shift,
1270                         strbeg, strend);
1271         }
1272
1273         s = HOP3c(rx_origin, other->min_offset, strend);
1274         if (s < other_last)     /* These positions already checked */
1275             s = other_last;
1276
1277         must = utf8_target ? other->utf8_substr : other->substr;
1278         assert(SvPOK(must));
1279         {
1280             char *from = s;
1281             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1282
1283             if (to > strend)
1284                 to = strend;
1285             if (from > to) {
1286                 s = NULL;
1287                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1288                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1289                     (IV)(from - strbeg),
1290                     (IV)(to   - strbeg)
1291                 ));
1292             }
1293             else {
1294                 s = fbm_instr(
1295                     (unsigned char*)from,
1296                     (unsigned char*)to,
1297                     must,
1298                     multiline ? FBMrf_MULTILINE : 0
1299                 );
1300                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1301                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1302                     (IV)(from - strbeg),
1303                     (IV)(to   - strbeg),
1304                     (IV)(s ? s - strbeg : -1)
1305                 ));
1306             }
1307         }
1308
1309         DEBUG_EXECUTE_r({
1310             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1311                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1312             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1313                 s ? "Found" : "Contradicts",
1314                 other_ix ? "floating" : "anchored",
1315                 quoted, RE_SV_TAIL(must));
1316         });
1317
1318
1319         if (!s) {
1320             /* last1 is latest possible substr location. If we didn't
1321              * find it before there, we never will */
1322             if (last >= last1) {
1323                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1324                                         "; giving up...\n"));
1325                 goto fail_finish;
1326             }
1327
1328             /* try to find the check substr again at a later
1329              * position. Maybe next time we'll find the "other" substr
1330              * in range too */
1331             other_last = HOP3c(last, 1, strend) /* highest failure */;
1332             rx_origin =
1333                 other_ix /* i.e. if other-is-float */
1334                     ? HOP3c(rx_origin, 1, strend)
1335                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1336             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1337                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1338                 (other_ix ? "floating" : "anchored"),
1339                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1340                 (IV)(rx_origin - strbeg)
1341             ));
1342             goto restart;
1343         }
1344         else {
1345             if (other_ix) { /* if (other-is-float) */
1346                 /* other_last is set to s, not s+1, since its possible for
1347                  * a floating substr to fail first time, then succeed
1348                  * second time at the same floating position; e.g.:
1349                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1350                  * The first time round, anchored and float match at
1351                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1352                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1353                  */
1354                 other_last = s;
1355             }
1356             else {
1357                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1358                 other_last = HOP3c(s, 1, strend);
1359             }
1360             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1361                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1362                   (long)(s - strbeg),
1363                 (IV)(rx_origin - strbeg)
1364               ));
1365
1366         }
1367     }
1368     else {
1369         DEBUG_OPTIMISE_MORE_r(
1370             Perl_re_printf( aTHX_
1371                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1372                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1373                 " strend:%" IVdf "\n",
1374                 (IV)prog->check_offset_min,
1375                 (IV)prog->check_offset_max,
1376                 (IV)(check_at-strbeg),
1377                 (IV)(rx_origin-strbeg),
1378                 (IV)(rx_origin-check_at),
1379                 (IV)(strend-strbeg)
1380             )
1381         );
1382     }
1383
1384   postprocess_substr_matches:
1385
1386     /* handle the extra constraint of /^.../m if present */
1387
1388     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1389         char *s;
1390
1391         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1392                         "  looking for /^/m anchor"));
1393
1394         /* we have failed the constraint of a \n before rx_origin.
1395          * Find the next \n, if any, even if it's beyond the current
1396          * anchored and/or floating substrings. Whether we should be
1397          * scanning ahead for the next \n or the next substr is debatable.
1398          * On the one hand you'd expect rare substrings to appear less
1399          * often than \n's. On the other hand, searching for \n means
1400          * we're effectively flipping between check_substr and "\n" on each
1401          * iteration as the current "rarest" string candidate, which
1402          * means for example that we'll quickly reject the whole string if
1403          * hasn't got a \n, rather than trying every substr position
1404          * first
1405          */
1406
1407         s = HOP3c(strend, - prog->minlen, strpos);
1408         if (s <= rx_origin ||
1409             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1410         {
1411             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1412                             "  Did not find /%s^%s/m...\n",
1413                             PL_colors[0], PL_colors[1]));
1414             goto fail_finish;
1415         }
1416
1417         /* earliest possible origin is 1 char after the \n.
1418          * (since *rx_origin == '\n', it's safe to ++ here rather than
1419          * HOP(rx_origin, 1)) */
1420         rx_origin++;
1421
1422         if (prog->substrs->check_ix == 0  /* check is anchored */
1423             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1424         {
1425             /* Position contradicts check-string; either because
1426              * check was anchored (and thus has no wiggle room),
1427              * or check was float and rx_origin is above the float range */
1428             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1429                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1430                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1431             goto restart;
1432         }
1433
1434         /* if we get here, the check substr must have been float,
1435          * is in range, and we may or may not have had an anchored
1436          * "other" substr which still contradicts */
1437         assert(prog->substrs->check_ix); /* check is float */
1438
1439         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1440             /* whoops, the anchored "other" substr exists, so we still
1441              * contradict. On the other hand, the float "check" substr
1442              * didn't contradict, so just retry the anchored "other"
1443              * substr */
1444             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1445                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1446                 PL_colors[0], PL_colors[1],
1447                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1448                 (IV)(rx_origin - strbeg)
1449             ));
1450             goto do_other_substr;
1451         }
1452
1453         /* success: we don't contradict the found floating substring
1454          * (and there's no anchored substr). */
1455         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1456             "  Found /%s^%s/m with rx_origin %ld...\n",
1457             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1458     }
1459     else {
1460         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1461             "  (multiline anchor test skipped)\n"));
1462     }
1463
1464   success_at_start:
1465
1466
1467     /* if we have a starting character class, then test that extra constraint.
1468      * (trie stclasses are too expensive to use here, we are better off to
1469      * leave it to regmatch itself) */
1470
1471     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1472         const U8* const str = (U8*)STRING(progi->regstclass);
1473
1474         /* XXX this value could be pre-computed */
1475         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1476                     ?  (reginfo->is_utf8_pat
1477                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1478                         : STR_LEN(progi->regstclass))
1479                     : 1);
1480         char * endpos;
1481         char *s;
1482         /* latest pos that a matching float substr constrains rx start to */
1483         char *rx_max_float = NULL;
1484
1485         /* if the current rx_origin is anchored, either by satisfying an
1486          * anchored substring constraint, or a /^.../m constraint, then we
1487          * can reject the current origin if the start class isn't found
1488          * at the current position. If we have a float-only match, then
1489          * rx_origin is constrained to a range; so look for the start class
1490          * in that range. if neither, then look for the start class in the
1491          * whole rest of the string */
1492
1493         /* XXX DAPM it's not clear what the minlen test is for, and why
1494          * it's not used in the floating case. Nothing in the test suite
1495          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1496          * Here are some old comments, which may or may not be correct:
1497          *
1498          *   minlen == 0 is possible if regstclass is \b or \B,
1499          *   and the fixed substr is ''$.
1500          *   Since minlen is already taken into account, rx_origin+1 is
1501          *   before strend; accidentally, minlen >= 1 guaranties no false
1502          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1503          *   0) below assumes that regstclass does not come from lookahead...
1504          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1505          *   This leaves EXACTF-ish only, which are dealt with in
1506          *   find_byclass().
1507          */
1508
1509         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1510             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1511         else if (prog->float_substr || prog->float_utf8) {
1512             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1513             endpos = HOP3clim(rx_max_float, cl_l, strend);
1514         }
1515         else
1516             endpos= strend;
1517
1518         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1519             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1520             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1521               (IV)start_shift, (IV)(check_at - strbeg),
1522               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1523
1524         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1525                             reginfo);
1526         if (!s) {
1527             if (endpos == strend) {
1528                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1529                                 "  Could not match STCLASS...\n") );
1530                 goto fail;
1531             }
1532             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1533                                "  This position contradicts STCLASS...\n") );
1534             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1535                         && !(prog->intflags & PREGf_IMPLICIT))
1536                 goto fail;
1537
1538             /* Contradict one of substrings */
1539             if (prog->anchored_substr || prog->anchored_utf8) {
1540                 if (prog->substrs->check_ix == 1) { /* check is float */
1541                     /* Have both, check_string is floating */
1542                     assert(rx_origin + start_shift <= check_at);
1543                     if (rx_origin + start_shift != check_at) {
1544                         /* not at latest position float substr could match:
1545                          * Recheck anchored substring, but not floating.
1546                          * The condition above is in bytes rather than
1547                          * chars for efficiency. It's conservative, in
1548                          * that it errs on the side of doing 'goto
1549                          * do_other_substr'. In this case, at worst,
1550                          * an extra anchored search may get done, but in
1551                          * practice the extra fbm_instr() is likely to
1552                          * get skipped anyway. */
1553                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1554                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1555                             (long)(other_last - strbeg),
1556                             (IV)(rx_origin - strbeg)
1557                         ));
1558                         goto do_other_substr;
1559                     }
1560                 }
1561             }
1562             else {
1563                 /* float-only */
1564
1565                 if (ml_anch) {
1566                     /* In the presence of ml_anch, we might be able to
1567                      * find another \n without breaking the current float
1568                      * constraint. */
1569
1570                     /* strictly speaking this should be HOP3c(..., 1, ...),
1571                      * but since we goto a block of code that's going to
1572                      * search for the next \n if any, its safe here */
1573                     rx_origin++;
1574                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1575                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1576                               PL_colors[0], PL_colors[1],
1577                               (long)(rx_origin - strbeg)) );
1578                     goto postprocess_substr_matches;
1579                 }
1580
1581                 /* strictly speaking this can never be true; but might
1582                  * be if we ever allow intuit without substrings */
1583                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1584                     goto fail;
1585
1586                 rx_origin = rx_max_float;
1587             }
1588
1589             /* at this point, any matching substrings have been
1590              * contradicted. Start again... */
1591
1592             rx_origin = HOP3c(rx_origin, 1, strend);
1593
1594             /* uses bytes rather than char calculations for efficiency.
1595              * It's conservative: it errs on the side of doing 'goto restart',
1596              * where there is code that does a proper char-based test */
1597             if (rx_origin + start_shift + end_shift > strend) {
1598                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1599                                        "  Could not match STCLASS...\n") );
1600                 goto fail;
1601             }
1602             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1603                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1604                 (prog->substrs->check_ix ? "floating" : "anchored"),
1605                 (long)(rx_origin + start_shift - strbeg),
1606                 (IV)(rx_origin - strbeg)
1607             ));
1608             goto restart;
1609         }
1610
1611         /* Success !!! */
1612
1613         if (rx_origin != s) {
1614             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1615                         "  By STCLASS: moving %ld --> %ld\n",
1616                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1617                    );
1618         }
1619         else {
1620             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1621                                   "  Does not contradict STCLASS...\n");
1622                    );
1623         }
1624     }
1625
1626     /* Decide whether using the substrings helped */
1627
1628     if (rx_origin != strpos) {
1629         /* Fixed substring is found far enough so that the match
1630            cannot start at strpos. */
1631
1632         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1633         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1634     }
1635     else {
1636         /* The found rx_origin position does not prohibit matching at
1637          * strpos, so calling intuit didn't gain us anything. Decrement
1638          * the BmUSEFUL() count on the check substring, and if we reach
1639          * zero, free it.  */
1640         if (!(prog->intflags & PREGf_NAUGHTY)
1641             && (utf8_target ? (
1642                 prog->check_utf8                /* Could be deleted already */
1643                 && --BmUSEFUL(prog->check_utf8) < 0
1644                 && (prog->check_utf8 == prog->float_utf8)
1645             ) : (
1646                 prog->check_substr              /* Could be deleted already */
1647                 && --BmUSEFUL(prog->check_substr) < 0
1648                 && (prog->check_substr == prog->float_substr)
1649             )))
1650         {
1651             /* If flags & SOMETHING - do not do it many times on the same match */
1652             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1653             /* XXX Does the destruction order has to change with utf8_target? */
1654             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1655             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1656             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1657             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1658             check = NULL;                       /* abort */
1659             /* XXXX This is a remnant of the old implementation.  It
1660                     looks wasteful, since now INTUIT can use many
1661                     other heuristics. */
1662             prog->extflags &= ~RXf_USE_INTUIT;
1663         }
1664     }
1665
1666     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1667             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1668              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1669
1670     return rx_origin;
1671
1672   fail_finish:                          /* Substring not found */
1673     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1674         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1675   fail:
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1677                           PL_colors[4], PL_colors[5]));
1678     return NULL;
1679 }
1680
1681
1682 #define DECL_TRIE_TYPE(scan) \
1683     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1684                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1685                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1686                     trie_type = ((scan->flags == EXACT)                             \
1687                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1688                                  : (scan->flags == EXACTL)                          \
1689                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1690                                     : (scan->flags == EXACTFAA)                     \
1691                                       ? (utf8_target                                \
1692                                          ? trie_utf8_exactfa_fold                   \
1693                                          : trie_latin_utf8_exactfa_fold)            \
1694                                       : (scan->flags == EXACTFLU8                   \
1695                                          ? (utf8_target                             \
1696                                            ? trie_flu8                              \
1697                                            : trie_flu8_latin)                       \
1698                                          : (utf8_target                             \
1699                                            ? trie_utf8_fold                         \
1700                                            : trie_latin_utf8_fold)))
1701
1702 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1703  * 'foldbuf+sizeof(foldbuf)' */
1704 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1705 STMT_START {                                                                        \
1706     STRLEN skiplen;                                                                 \
1707     U8 flags = FOLD_FLAGS_FULL;                                                     \
1708     switch (trie_type) {                                                            \
1709     case trie_flu8:                                                                 \
1710         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1711         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1712             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1713         }                                                                           \
1714         goto do_trie_utf8_fold;                                                     \
1715     case trie_utf8_exactfa_fold:                                                    \
1716         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1717         /* FALLTHROUGH */                                                           \
1718     case trie_utf8_fold:                                                            \
1719       do_trie_utf8_fold:                                                            \
1720         if ( foldlen>0 ) {                                                          \
1721             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1722             foldlen -= len;                                                         \
1723             uscan += len;                                                           \
1724             len=0;                                                                  \
1725         } else {                                                                    \
1726             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1727                                                                             flags); \
1728             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1729             skiplen = UVCHR_SKIP( uvc );                                            \
1730             foldlen -= skiplen;                                                     \
1731             uscan = foldbuf + skiplen;                                              \
1732         }                                                                           \
1733         break;                                                                      \
1734     case trie_flu8_latin:                                                           \
1735         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1736         goto do_trie_latin_utf8_fold;                                               \
1737     case trie_latin_utf8_exactfa_fold:                                              \
1738         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1739         /* FALLTHROUGH */                                                           \
1740     case trie_latin_utf8_fold:                                                      \
1741       do_trie_latin_utf8_fold:                                                      \
1742         if ( foldlen>0 ) {                                                          \
1743             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1744             foldlen -= len;                                                         \
1745             uscan += len;                                                           \
1746             len=0;                                                                  \
1747         } else {                                                                    \
1748             len = 1;                                                                \
1749             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1750             skiplen = UVCHR_SKIP( uvc );                                            \
1751             foldlen -= skiplen;                                                     \
1752             uscan = foldbuf + skiplen;                                              \
1753         }                                                                           \
1754         break;                                                                      \
1755     case trie_utf8l:                                                                \
1756         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1757         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1758             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1759         }                                                                           \
1760         /* FALLTHROUGH */                                                           \
1761     case trie_utf8:                                                                 \
1762         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1763         break;                                                                      \
1764     case trie_plain:                                                                \
1765         uvc = (UV)*uc;                                                              \
1766         len = 1;                                                                    \
1767     }                                                                               \
1768     if (uvc < 256) {                                                                \
1769         charid = trie->charmap[ uvc ];                                              \
1770     }                                                                               \
1771     else {                                                                          \
1772         charid = 0;                                                                 \
1773         if (widecharmap) {                                                          \
1774             SV** const svpp = hv_fetch(widecharmap,                                 \
1775                         (char*)&uvc, sizeof(UV), 0);                                \
1776             if (svpp)                                                               \
1777                 charid = (U16)SvIV(*svpp);                                          \
1778         }                                                                           \
1779     }                                                                               \
1780 } STMT_END
1781
1782 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1783     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1784                 startpos, doutf8, depth)
1785
1786 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1787     STMT_START {                                            \
1788         while (s < strend) {                                \
1789             CODE                                            \
1790             s += ((UTF8) ? UTF8SKIP(s) : 1);                \
1791         }                                                   \
1792     } STMT_END
1793
1794 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1795     STMT_START {                                            \
1796         while (s < strend) {                                \
1797             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1798         }                                                   \
1799     } STMT_END
1800
1801 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1802     if (COND) {                                                \
1803         FBC_CHECK_AND_TRY                                      \
1804         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1805         previous_occurrence_end = s;                           \
1806     }                                                          \
1807     else {                                                     \
1808         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1809     }
1810
1811 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1812     if (utf8_target) {                                         \
1813         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1814     }                                                          \
1815     else {                                                     \
1816         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1817     }
1818
1819 /* We keep track of where the next character should start after an occurrence
1820  * of the one we're looking for.  Knowing that, we can see right away if the
1821  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1822  * don't accept the 2nd and succeeding adjacent occurrences */
1823 #define FBC_CHECK_AND_TRY                                      \
1824         if (   (   doevery                                     \
1825                 || s != previous_occurrence_end)               \
1826             && (reginfo->intuit || regtry(reginfo, &s)))       \
1827         {                                                      \
1828             goto got_it;                                       \
1829         }
1830
1831
1832 /* This differs from the above macros in that it calls a function which returns
1833  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1834  * there is no such occurrence. */
1835 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1836     while (s < strend) {                                    \
1837         s = (f);                                            \
1838         if (s >= strend) {                                  \
1839             break;                                          \
1840         }                                                   \
1841                                                             \
1842         FBC_CHECK_AND_TRY                                   \
1843         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1844         previous_occurrence_end = s;                        \
1845     }
1846
1847 /* This differs from the above macros in that it is passed a single byte that
1848  * is known to begin the next occurrence of the thing being looked for in 's'.
1849  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1850  * at that position. */
1851 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
1852     while (s < strend) {                                    \
1853         s = (char *) memchr(s, byte, strend -s);            \
1854         if (s == NULL) {                                    \
1855             s = (char *) strend;                            \
1856             break;                                          \
1857         }                                                   \
1858                                                             \
1859         if (COND) {                                         \
1860             FBC_CHECK_AND_TRY                               \
1861             s += UTF8SKIP(s);                               \
1862             previous_occurrence_end = s;                    \
1863         }                                                   \
1864         else {                                              \
1865             s += UTF8SKIP(s);                               \
1866         }                                                   \
1867     }
1868
1869 /* The three macros below are slightly different versions of the same logic.
1870  *
1871  * The first is for /a and /aa when the target string is UTF-8.  This can only
1872  * match ascii, but it must advance based on UTF-8.   The other two handle the
1873  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1874  * for the boundary (or non-boundary) between a word and non-word character.
1875  * The utf8 and non-utf8 cases have the same logic, but the details must be
1876  * different.  Find the "wordness" of the character just prior to this one, and
1877  * compare it with the wordness of this one.  If they differ, we have a
1878  * boundary.  At the beginning of the string, pretend that the previous
1879  * character was a new-line.
1880  *
1881  * All these macros uncleanly have side-effects with each other and outside
1882  * variables.  So far it's been too much trouble to clean-up
1883  *
1884  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1885  *               a word character or not.
1886  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1887  *               word/non-word
1888  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1889  *
1890  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1891  * are looking for a boundary or for a non-boundary.  If we are looking for a
1892  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1893  * see if this tentative match actually works, and if so, to quit the loop
1894  * here.  And vice-versa if we are looking for a non-boundary.
1895  *
1896  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1897  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1898  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1899  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1900  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1901  * complement.  But in that branch we complement tmp, meaning that at the
1902  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1903  * which means at the top of the loop in the next iteration, it is
1904  * TEST_NON_UTF8(s-1) */
1905 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1906     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1907     tmp = TEST_NON_UTF8(tmp);                                                  \
1908     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1909         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1910             tmp = !tmp;                                                        \
1911             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1912         }                                                                      \
1913         else {                                                                 \
1914             IF_FAIL;                                                           \
1915         }                                                                      \
1916     );                                                                         \
1917
1918 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1919  * TEST_UTF8 is a macro that for the same input code points returns identically
1920  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1921 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1922     if (s == reginfo->strbeg) {                                                \
1923         tmp = '\n';                                                            \
1924     }                                                                          \
1925     else { /* Back-up to the start of the previous character */                \
1926         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1927         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1928                                                        0, UTF8_ALLOW_DEFAULT); \
1929     }                                                                          \
1930     tmp = TEST_UV(tmp);                                                        \
1931     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1932         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1933             tmp = !tmp;                                                        \
1934             IF_SUCCESS;                                                        \
1935         }                                                                      \
1936         else {                                                                 \
1937             IF_FAIL;                                                           \
1938         }                                                                      \
1939     );
1940
1941 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1942  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1943  * macros below */
1944 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1945     if (utf8_target) {                                                         \
1946         UTF8_CODE                                                              \
1947     }                                                                          \
1948     else {  /* Not utf8 */                                                     \
1949         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1950         tmp = TEST_NON_UTF8(tmp);                                              \
1951         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
1952             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1953                 IF_SUCCESS;                                                    \
1954                 tmp = !tmp;                                                    \
1955             }                                                                  \
1956             else {                                                             \
1957                 IF_FAIL;                                                       \
1958             }                                                                  \
1959         );                                                                     \
1960     }                                                                          \
1961     /* Here, things have been set up by the previous code so that tmp is the   \
1962      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1963      * utf8ness of the target).  We also have to check if this matches against \
1964      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1965      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1966      * string */                                                               \
1967     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1968         IF_SUCCESS;                                                            \
1969     }                                                                          \
1970     else {                                                                     \
1971         IF_FAIL;                                                               \
1972     }
1973
1974 /* This is the macro to use when we want to see if something that looks like it
1975  * could match, actually does, and if so exits the loop.  It needs to be used
1976  * only for bounds checking macros, as it allows for matching beyond the end of
1977  * string (which should be zero length without having to look at the string
1978  * contents) */
1979 #define REXEC_FBC_TRYIT                                                     \
1980     if ((reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1981         goto got_it
1982
1983 /* The only difference between the BOUND and NBOUND cases is that
1984  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1985  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1986  * with the other one being empty (PLACEHOLDER is defined as empty).
1987  *
1988  * The TEST_FOO parameters are for operating on different forms of input, but
1989  * all should be ones that return identically for the same underlying code
1990  * points */
1991 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1992     FBC_BOUND_COMMON(                                                          \
1993           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1994           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1995
1996 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1997     FBC_BOUND_COMMON(                                                          \
1998             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1999             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2000
2001 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2002     FBC_BOUND_COMMON(                                                          \
2003           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2004           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2005
2006 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2007     FBC_BOUND_COMMON(                                                          \
2008             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2009             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2010
2011 #ifdef DEBUGGING
2012 static IV
2013 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2014   IV cp_out = _invlist_search(invlist, cp_in);
2015   assert(cp_out >= 0);
2016   return cp_out;
2017 }
2018 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2019         invmap[S_get_break_val_cp_checked(invlist, cp)]
2020 #else
2021 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2022         invmap[_invlist_search(invlist, cp)]
2023 #endif
2024
2025 /* Takes a pointer to an inversion list, a pointer to its corresponding
2026  * inversion map, and a code point, and returns the code point's value
2027  * according to the two arrays.  It assumes that all code points have a value.
2028  * This is used as the base macro for macros for particular properties */
2029 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2030         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2031
2032 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2033  * of a code point, returning the value for the first code point in the string.
2034  * And it takes the particular macro name that finds the desired value given a
2035  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2036 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2037              (__ASSERT_(pos < strend)                                          \
2038                  /* Note assumes is valid UTF-8 */                             \
2039              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2040
2041 /* Returns the GCB value for the input code point */
2042 #define getGCB_VAL_CP(cp)                                                      \
2043           _generic_GET_BREAK_VAL_CP(                                           \
2044                                     PL_GCB_invlist,                            \
2045                                     _Perl_GCB_invmap,                          \
2046                                     (cp))
2047
2048 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2049  * bounded by pos and strend */
2050 #define getGCB_VAL_UTF8(pos, strend)                                           \
2051     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2052
2053 /* Returns the LB value for the input code point */
2054 #define getLB_VAL_CP(cp)                                                       \
2055           _generic_GET_BREAK_VAL_CP(                                           \
2056                                     PL_LB_invlist,                             \
2057                                     _Perl_LB_invmap,                           \
2058                                     (cp))
2059
2060 /* Returns the LB value for the first code point in the UTF-8 encoded string
2061  * bounded by pos and strend */
2062 #define getLB_VAL_UTF8(pos, strend)                                            \
2063     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2064
2065
2066 /* Returns the SB value for the input code point */
2067 #define getSB_VAL_CP(cp)                                                       \
2068           _generic_GET_BREAK_VAL_CP(                                           \
2069                                     PL_SB_invlist,                             \
2070                                     _Perl_SB_invmap,                     \
2071                                     (cp))
2072
2073 /* Returns the SB value for the first code point in the UTF-8 encoded string
2074  * bounded by pos and strend */
2075 #define getSB_VAL_UTF8(pos, strend)                                            \
2076     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2077
2078 /* Returns the WB value for the input code point */
2079 #define getWB_VAL_CP(cp)                                                       \
2080           _generic_GET_BREAK_VAL_CP(                                           \
2081                                     PL_WB_invlist,                             \
2082                                     _Perl_WB_invmap,                         \
2083                                     (cp))
2084
2085 /* Returns the WB value for the first code point in the UTF-8 encoded string
2086  * bounded by pos and strend */
2087 #define getWB_VAL_UTF8(pos, strend)                                            \
2088     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2089
2090 /* We know what class REx starts with.  Try to find this position... */
2091 /* if reginfo->intuit, its a dryrun */
2092 /* annoyingly all the vars in this routine have different names from their counterparts
2093    in regmatch. /grrr */
2094 STATIC char *
2095 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2096     const char *strend, regmatch_info *reginfo)
2097 {
2098     dVAR;
2099
2100     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2101     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2102
2103     char *pat_string;   /* The pattern's exactish string */
2104     char *pat_end;          /* ptr to end char of pat_string */
2105     re_fold_t folder;   /* Function for computing non-utf8 folds */
2106     const U8 *fold_array;   /* array for folding ords < 256 */
2107     STRLEN ln;
2108     STRLEN lnc;
2109     U8 c1;
2110     U8 c2;
2111     char *e = NULL;
2112
2113     /* In some cases we accept only the first occurence of 'x' in a sequence of
2114      * them.  This variable points to just beyond the end of the previous
2115      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2116      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2117      * hop back.) */
2118     char * previous_occurrence_end = 0;
2119
2120     I32 tmp;            /* Scratch variable */
2121     const bool utf8_target = reginfo->is_utf8_target;
2122     UV utf8_fold_flags = 0;
2123     const bool is_utf8_pat = reginfo->is_utf8_pat;
2124     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2125                                    with a result inverts that result, as 0^1 =
2126                                    1 and 1^1 = 0 */
2127     _char_class_number classnum;
2128
2129     RXi_GET_DECL(prog,progi);
2130
2131     PERL_ARGS_ASSERT_FIND_BYCLASS;
2132
2133     /* We know what class it must start with. */
2134     switch (OP(c)) {
2135     case ANYOFPOSIXL:
2136     case ANYOFL:
2137         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2138
2139         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2140             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2141         }
2142
2143         /* FALLTHROUGH */
2144     case ANYOFD:
2145     case ANYOF:
2146         if (utf8_target) {
2147             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2148                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2149         }
2150         else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2151             /* We know that s is in the bitmap range since the target isn't
2152              * UTF-8, so what happens for out-of-range values is not relevant,
2153              * so exclude that from the flags */
2154             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2155         }
2156         else {
2157             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2158         }
2159         break;
2160
2161     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2162         /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
2163          * use 0 */
2164         REXEC_FBC_FIND_NEXT_SCAN(0,
2165          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2166                                    (U8) ARG(c), FLAGS(c)));
2167         break;
2168
2169     case NANYOFM:   /* UTF-8ness does matter because can match UTF-8 variants.
2170                      */
2171         REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
2172          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2173                                    (U8) ARG(c), FLAGS(c)));
2174         break;
2175
2176     case ANYOFH:
2177         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2178             U8 first_byte = FLAGS(c);
2179
2180             if (first_byte) {   /* We know what the first byte of any matched
2181                                    string should be */
2182                 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2183                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2184             }
2185             else {
2186                 REXEC_FBC_CLASS_SCAN(TRUE,
2187                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2188             }
2189         }
2190         break;
2191
2192     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2193         assert(! is_utf8_pat);
2194         /* FALLTHROUGH */
2195     case EXACTFAA:
2196         if (is_utf8_pat) {
2197             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2198                              |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
2199             goto do_exactf_utf8;
2200         }
2201         else if (utf8_target) {
2202
2203             /* Here, and elsewhere in this file, the reason we can't consider a
2204              * non-UTF-8 pattern already folded in the presence of a UTF-8
2205              * target is because any MICRO SIGN in the pattern won't be folded.
2206              * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
2207              * can consider a non-UTF-8 pattern folded when matching a
2208              * non-UTF-8 target */
2209             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2210             goto do_exactf_utf8;
2211         }
2212
2213         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2214          * which these functions don't handle anyway */
2215         fold_array = PL_fold_latin1;
2216         folder = foldEQ_latin1_s2_folded;
2217         goto do_exactf_non_utf8;
2218
2219     case EXACTF:   /* This node only generated for non-utf8 patterns */
2220         assert(! is_utf8_pat);
2221         if (utf8_target) {
2222             goto do_exactf_utf8;
2223         }
2224         fold_array = PL_fold;
2225         folder = foldEQ;
2226         goto do_exactf_non_utf8;
2227
2228     case EXACTFL:
2229         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2230         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2231             utf8_fold_flags = FOLDEQ_LOCALE;
2232             goto do_exactf_utf8;
2233         }
2234         fold_array = PL_fold_locale;
2235         folder = foldEQ_locale;
2236         goto do_exactf_non_utf8;
2237
2238     case EXACTFUP:      /* Problematic even though pattern isn't UTF-8.  Use
2239                            full functionality normally not done except for
2240                            UTF-8 */
2241         assert(! is_utf8_pat);
2242         goto do_exactf_utf8;
2243
2244     case EXACTFLU8:
2245             if (! utf8_target) {    /* All code points in this node require
2246                                        UTF-8 to express.  */
2247                 break;
2248             }
2249             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2250                                              | FOLDEQ_S2_FOLDS_SANE;
2251             goto do_exactf_utf8;
2252
2253     case EXACTFU_ONLY8:
2254         if (! utf8_target) {
2255             break;
2256         }
2257         assert(is_utf8_pat);
2258         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2259         goto do_exactf_utf8;
2260
2261     case EXACTFU:
2262         if (is_utf8_pat || utf8_target) {
2263             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2264             goto do_exactf_utf8;
2265         }
2266
2267         /* Any 'ss' in the pattern should have been replaced by regcomp,
2268          * so we don't have to worry here about this single special case
2269          * in the Latin1 range */
2270         fold_array = PL_fold_latin1;
2271         folder = foldEQ_latin1_s2_folded;
2272
2273         /* FALLTHROUGH */
2274
2275       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2276                            are no glitches with fold-length differences
2277                            between the target string and pattern */
2278
2279         /* The idea in the non-utf8 EXACTF* cases is to first find the
2280          * first character of the EXACTF* node and then, if necessary,
2281          * case-insensitively compare the full text of the node.  c1 is the
2282          * first character.  c2 is its fold.  This logic will not work for
2283          * Unicode semantics and the german sharp ss, which hence should
2284          * not be compiled into a node that gets here. */
2285         pat_string = STRING(c);
2286         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2287
2288         /* We know that we have to match at least 'ln' bytes (which is the
2289          * same as characters, since not utf8).  If we have to match 3
2290          * characters, and there are only 2 availabe, we know without
2291          * trying that it will fail; so don't start a match past the
2292          * required minimum number from the far end */
2293         e = HOP3c(strend, -((SSize_t)ln), s);
2294         if (e < s)
2295             break;
2296
2297         c1 = *pat_string;
2298         c2 = fold_array[c1];
2299         if (c1 == c2) { /* If char and fold are the same */
2300             while (s <= e) {
2301                 s = (char *) memchr(s, c1, e + 1 - s);
2302                 if (s == NULL) {
2303                     break;
2304                 }
2305
2306                 /* Check that the rest of the node matches */
2307                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2308                     && (reginfo->intuit || regtry(reginfo, &s)) )
2309                 {
2310                     goto got_it;
2311                 }
2312                 s++;
2313             }
2314         }
2315         else {
2316             U8 bits_differing = c1 ^ c2;
2317
2318             /* If the folds differ in one bit position only, we can mask to
2319              * match either of them, and can use this faster find method.  Both
2320              * ASCII and EBCDIC tend to have their case folds differ in only
2321              * one position, so this is very likely */
2322             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2323                 bits_differing = ~ bits_differing;
2324                 while (s <= e) {
2325                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2326                                         (c1 & bits_differing), bits_differing);
2327                     if (s > e) {
2328                         break;
2329                     }
2330
2331                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2332                         && (reginfo->intuit || regtry(reginfo, &s)) )
2333                     {
2334                         goto got_it;
2335                     }
2336                     s++;
2337                 }
2338             }
2339             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2340                        should actually happen only in EXACTFL nodes */
2341                 while (s <= e) {
2342                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2343                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2344                         && (reginfo->intuit || regtry(reginfo, &s)) )
2345                     {
2346                         goto got_it;
2347                     }
2348                     s++;
2349                 }
2350             }
2351         }
2352         break;
2353
2354       do_exactf_utf8:
2355       {
2356         unsigned expansion;
2357
2358         /* If one of the operands is in utf8, we can't use the simpler folding
2359          * above, due to the fact that many different characters can have the
2360          * same fold, or portion of a fold, or different- length fold */
2361         pat_string = STRING(c);
2362         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2363         pat_end = pat_string + ln;
2364         lnc = is_utf8_pat       /* length to match in characters */
2365                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2366                 : ln;
2367
2368         /* We have 'lnc' characters to match in the pattern, but because of
2369          * multi-character folding, each character in the target can match
2370          * up to 3 characters (Unicode guarantees it will never exceed
2371          * this) if it is utf8-encoded; and up to 2 if not (based on the
2372          * fact that the Latin 1 folds are already determined, and the
2373          * only multi-char fold in that range is the sharp-s folding to
2374          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2375          * string character.  Adjust lnc accordingly, rounding up, so that
2376          * if we need to match at least 4+1/3 chars, that really is 5. */
2377         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2378         lnc = (lnc + expansion - 1) / expansion;
2379
2380         /* As in the non-UTF8 case, if we have to match 3 characters, and
2381          * only 2 are left, it's guaranteed to fail, so don't start a
2382          * match that would require us to go beyond the end of the string
2383          */
2384         e = HOP3c(strend, -((SSize_t)lnc), s);
2385
2386         /* XXX Note that we could recalculate e to stop the loop earlier,
2387          * as the worst case expansion above will rarely be met, and as we
2388          * go along we would usually find that e moves further to the left.
2389          * This would happen only after we reached the point in the loop
2390          * where if there were no expansion we should fail.  Unclear if
2391          * worth the expense */
2392
2393         while (s <= e) {
2394             char *my_strend= (char *)strend;
2395             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2396                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2397                 && (reginfo->intuit || regtry(reginfo, &s)) )
2398             {
2399                 goto got_it;
2400             }
2401             s += (utf8_target) ? UTF8SKIP(s) : 1;
2402         }
2403         break;
2404     }
2405
2406     case BOUNDL:
2407         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2408         if (FLAGS(c) != TRADITIONAL_BOUND) {
2409             if (! IN_UTF8_CTYPE_LOCALE) {
2410                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2411                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2412             }
2413             goto do_boundu;
2414         }
2415
2416         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2417         break;
2418
2419     case NBOUNDL:
2420         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2421         if (FLAGS(c) != TRADITIONAL_BOUND) {
2422             if (! IN_UTF8_CTYPE_LOCALE) {
2423                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2424                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2425             }
2426             goto do_nboundu;
2427         }
2428
2429         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2430         break;
2431
2432     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2433                    meaning */
2434         assert(FLAGS(c) == TRADITIONAL_BOUND);
2435
2436         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2437         break;
2438
2439     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2440                    meaning */
2441         assert(FLAGS(c) == TRADITIONAL_BOUND);
2442
2443         FBC_BOUND_A(isWORDCHAR_A);
2444         break;
2445
2446     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2447                    meaning */
2448         assert(FLAGS(c) == TRADITIONAL_BOUND);
2449
2450         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2451         break;
2452
2453     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2454                    meaning */
2455         assert(FLAGS(c) == TRADITIONAL_BOUND);
2456
2457         FBC_NBOUND_A(isWORDCHAR_A);
2458         break;
2459
2460     case NBOUNDU:
2461         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2462             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2463             break;
2464         }
2465
2466       do_nboundu:
2467
2468         to_complement = 1;
2469         /* FALLTHROUGH */
2470
2471     case BOUNDU:
2472       do_boundu:
2473         switch((bound_type) FLAGS(c)) {
2474             case TRADITIONAL_BOUND:
2475                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2476                 break;
2477             case GCB_BOUND:
2478                 if (s == reginfo->strbeg) {
2479                     if (reginfo->intuit || regtry(reginfo, &s))
2480                     {
2481                         goto got_it;
2482                     }
2483
2484                     /* Didn't match.  Try at the next position (if there is one) */
2485                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2486                     if (UNLIKELY(s >= reginfo->strend)) {
2487                         break;
2488                     }
2489                 }
2490
2491                 if (utf8_target) {
2492                     GCB_enum before = getGCB_VAL_UTF8(
2493                                                reghop3((U8*)s, -1,
2494                                                        (U8*)(reginfo->strbeg)),
2495                                                (U8*) reginfo->strend);
2496                     while (s < strend) {
2497                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2498                                                         (U8*) reginfo->strend);
2499                         if (   (to_complement ^ isGCB(before,
2500                                                       after,
2501                                                       (U8*) reginfo->strbeg,
2502                                                       (U8*) s,
2503                                                       utf8_target))
2504                             && (reginfo->intuit || regtry(reginfo, &s)))
2505                         {
2506                             goto got_it;
2507                         }
2508                         before = after;
2509                         s += UTF8SKIP(s);
2510                     }
2511                 }
2512                 else {  /* Not utf8.  Everything is a GCB except between CR and
2513                            LF */
2514                     while (s < strend) {
2515                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2516                                               || UCHARAT(s) != '\n'))
2517                             && (reginfo->intuit || regtry(reginfo, &s)))
2518                         {
2519                             goto got_it;
2520                         }
2521                         s++;
2522                     }
2523                 }
2524
2525                 /* And, since this is a bound, it can match after the final
2526                  * character in the string */
2527                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2528                     goto got_it;
2529                 }
2530                 break;
2531
2532             case LB_BOUND:
2533                 if (s == reginfo->strbeg) {
2534                     if (reginfo->intuit || regtry(reginfo, &s)) {
2535                         goto got_it;
2536                     }
2537                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2538                     if (UNLIKELY(s >= reginfo->strend)) {
2539                         break;
2540                     }
2541                 }
2542
2543                 if (utf8_target) {
2544                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2545                                                                -1,
2546                                                                (U8*)(reginfo->strbeg)),
2547                                                        (U8*) reginfo->strend);
2548                     while (s < strend) {
2549                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2550                         if (to_complement ^ isLB(before,
2551                                                  after,
2552                                                  (U8*) reginfo->strbeg,
2553                                                  (U8*) s,
2554                                                  (U8*) reginfo->strend,
2555                                                  utf8_target)
2556                             && (reginfo->intuit || regtry(reginfo, &s)))
2557                         {
2558                             goto got_it;
2559                         }
2560                         before = after;
2561                         s += UTF8SKIP(s);
2562                     }
2563                 }
2564                 else {  /* Not utf8. */
2565                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2566                     while (s < strend) {
2567                         LB_enum after = getLB_VAL_CP((U8) *s);
2568                         if (to_complement ^ isLB(before,
2569                                                  after,
2570                                                  (U8*) reginfo->strbeg,
2571                                                  (U8*) s,
2572                                                  (U8*) reginfo->strend,
2573                                                  utf8_target)
2574                             && (reginfo->intuit || regtry(reginfo, &s)))
2575                         {
2576                             goto got_it;
2577                         }
2578                         before = after;
2579                         s++;
2580                     }
2581                 }
2582
2583                 if (reginfo->intuit || regtry(reginfo, &s)) {
2584                     goto got_it;
2585                 }
2586
2587                 break;
2588
2589             case SB_BOUND:
2590                 if (s == reginfo->strbeg) {
2591                     if (reginfo->intuit || regtry(reginfo, &s)) {
2592                         goto got_it;
2593                     }
2594                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2595                     if (UNLIKELY(s >= reginfo->strend)) {
2596                         break;
2597                     }
2598                 }
2599
2600                 if (utf8_target) {
2601                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2602                                                         -1,
2603                                                         (U8*)(reginfo->strbeg)),
2604                                                       (U8*) reginfo->strend);
2605                     while (s < strend) {
2606                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2607                                                          (U8*) reginfo->strend);
2608                         if ((to_complement ^ isSB(before,
2609                                                   after,
2610                                                   (U8*) reginfo->strbeg,
2611                                                   (U8*) s,
2612                                                   (U8*) reginfo->strend,
2613                                                   utf8_target))
2614                             && (reginfo->intuit || regtry(reginfo, &s)))
2615                         {
2616                             goto got_it;
2617                         }
2618                         before = after;
2619                         s += UTF8SKIP(s);
2620                     }
2621                 }
2622                 else {  /* Not utf8. */
2623                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2624                     while (s < strend) {
2625                         SB_enum after = getSB_VAL_CP((U8) *s);
2626                         if ((to_complement ^ isSB(before,
2627                                                   after,
2628                                                   (U8*) reginfo->strbeg,
2629                                                   (U8*) s,
2630                                                   (U8*) reginfo->strend,
2631                                                   utf8_target))
2632                             && (reginfo->intuit || regtry(reginfo, &s)))
2633                         {
2634                             goto got_it;
2635                         }
2636                         before = after;
2637                         s++;
2638                     }
2639                 }
2640
2641                 /* Here are at the final position in the target string.  The SB
2642                  * value is always true here, so matches, depending on other
2643                  * constraints */
2644                 if (reginfo->intuit || regtry(reginfo, &s)) {
2645                     goto got_it;
2646                 }
2647
2648                 break;
2649
2650             case WB_BOUND:
2651                 if (s == reginfo->strbeg) {
2652                     if (reginfo->intuit || regtry(reginfo, &s)) {
2653                         goto got_it;
2654                     }
2655                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2656                     if (UNLIKELY(s >= reginfo->strend)) {
2657                         break;
2658                     }
2659                 }
2660
2661                 if (utf8_target) {
2662                     /* We are at a boundary between char_sub_0 and char_sub_1.
2663                      * We also keep track of the value for char_sub_-1 as we
2664                      * loop through the line.   Context may be needed to make a
2665                      * determination, and if so, this can save having to
2666                      * recalculate it */
2667                     WB_enum previous = WB_UNKNOWN;
2668                     WB_enum before = getWB_VAL_UTF8(
2669                                               reghop3((U8*)s,
2670                                                       -1,
2671                                                       (U8*)(reginfo->strbeg)),
2672                                               (U8*) reginfo->strend);
2673                     while (s < strend) {
2674                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2675                                                         (U8*) reginfo->strend);
2676                         if ((to_complement ^ isWB(previous,
2677                                                   before,
2678                                                   after,
2679                                                   (U8*) reginfo->strbeg,
2680                                                   (U8*) s,
2681                                                   (U8*) reginfo->strend,
2682                                                   utf8_target))
2683                             && (reginfo->intuit || regtry(reginfo, &s)))
2684                         {
2685                             goto got_it;
2686                         }
2687                         previous = before;
2688                         before = after;
2689                         s += UTF8SKIP(s);
2690                     }
2691                 }
2692                 else {  /* Not utf8. */
2693                     WB_enum previous = WB_UNKNOWN;
2694                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2695                     while (s < strend) {
2696                         WB_enum after = getWB_VAL_CP((U8) *s);
2697                         if ((to_complement ^ isWB(previous,
2698                                                   before,
2699                                                   after,
2700                                                   (U8*) reginfo->strbeg,
2701                                                   (U8*) s,
2702                                                   (U8*) reginfo->strend,
2703                                                   utf8_target))
2704                             && (reginfo->intuit || regtry(reginfo, &s)))
2705                         {
2706                             goto got_it;
2707                         }
2708                         previous = before;
2709                         before = after;
2710                         s++;
2711                     }
2712                 }
2713
2714                 if (reginfo->intuit || regtry(reginfo, &s)) {
2715                     goto got_it;
2716                 }
2717         }
2718         break;
2719
2720     case LNBREAK:
2721         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2722                         is_LNBREAK_latin1_safe(s, strend)
2723         );
2724         break;
2725
2726     /* The argument to all the POSIX node types is the class number to pass to
2727      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2728
2729     case NPOSIXL:
2730         to_complement = 1;
2731         /* FALLTHROUGH */
2732
2733     case POSIXL:
2734         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2735         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2736                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2737         break;
2738
2739     case NPOSIXD:
2740         to_complement = 1;
2741         /* FALLTHROUGH */
2742
2743     case POSIXD:
2744         if (utf8_target) {
2745             goto posix_utf8;
2746         }
2747         goto posixa;
2748
2749     case NPOSIXA:
2750         if (utf8_target) {
2751             /* The complement of something that matches only ASCII matches all
2752              * non-ASCII, plus everything in ASCII that isn't in the class. */
2753             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2754                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2755             break;
2756         }
2757
2758         to_complement = 1;
2759         goto posixa;
2760
2761     case POSIXA:
2762         /* Don't need to worry about utf8, as it can match only a single
2763          * byte invariant character.  But we do anyway for performance reasons,
2764          * as otherwise we would have to examine all the continuation
2765          * characters */
2766         if (utf8_target) {
2767             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2768             break;
2769         }
2770
2771       posixa:
2772         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2773                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2774         break;
2775
2776     case NPOSIXU:
2777         to_complement = 1;
2778         /* FALLTHROUGH */
2779
2780     case POSIXU:
2781         if (! utf8_target) {
2782             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2783                                  to_complement ^ cBOOL(_generic_isCC(*s,
2784                                                                     FLAGS(c))));
2785         }
2786         else {
2787
2788           posix_utf8:
2789             classnum = (_char_class_number) FLAGS(c);
2790             switch (classnum) {
2791                 default:
2792                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2793                         to_complement ^ cBOOL(_invlist_contains_cp(
2794                                               PL_XPosix_ptrs[classnum],
2795                                               utf8_to_uvchr_buf((U8 *) s,
2796                                                                 (U8 *) strend,
2797                                                                 NULL))));
2798                     break;
2799                 case _CC_ENUM_SPACE:
2800                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2801                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2802                     break;
2803
2804                 case _CC_ENUM_BLANK:
2805                     REXEC_FBC_CLASS_SCAN(1,
2806                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2807                     break;
2808
2809                 case _CC_ENUM_XDIGIT:
2810                     REXEC_FBC_CLASS_SCAN(1,
2811                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2812                     break;
2813
2814                 case _CC_ENUM_VERTSPACE:
2815                     REXEC_FBC_CLASS_SCAN(1,
2816                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2817                     break;
2818
2819                 case _CC_ENUM_CNTRL:
2820                     REXEC_FBC_CLASS_SCAN(1,
2821                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2822                     break;
2823             }
2824         }
2825         break;
2826
2827     case AHOCORASICKC:
2828     case AHOCORASICK:
2829         {
2830             DECL_TRIE_TYPE(c);
2831             /* what trie are we using right now */
2832             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2833             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2834             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2835
2836             const char *last_start = strend - trie->minlen;
2837 #ifdef DEBUGGING
2838             const char *real_start = s;
2839 #endif
2840             STRLEN maxlen = trie->maxlen;
2841             SV *sv_points;
2842             U8 **points; /* map of where we were in the input string
2843                             when reading a given char. For ASCII this
2844                             is unnecessary overhead as the relationship
2845                             is always 1:1, but for Unicode, especially
2846                             case folded Unicode this is not true. */
2847             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2848             U8 *bitmap=NULL;
2849
2850
2851             GET_RE_DEBUG_FLAGS_DECL;
2852
2853             /* We can't just allocate points here. We need to wrap it in
2854              * an SV so it gets freed properly if there is a croak while
2855              * running the match */
2856             ENTER;
2857             SAVETMPS;
2858             sv_points=newSV(maxlen * sizeof(U8 *));
2859             SvCUR_set(sv_points,
2860                 maxlen * sizeof(U8 *));
2861             SvPOK_on(sv_points);
2862             sv_2mortal(sv_points);
2863             points=(U8**)SvPV_nolen(sv_points );
2864             if ( trie_type != trie_utf8_fold
2865                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2866             {
2867                 if (trie->bitmap)
2868                     bitmap=(U8*)trie->bitmap;
2869                 else
2870                     bitmap=(U8*)ANYOF_BITMAP(c);
2871             }
2872             /* this is the Aho-Corasick algorithm modified a touch
2873                to include special handling for long "unknown char" sequences.
2874                The basic idea being that we use AC as long as we are dealing
2875                with a possible matching char, when we encounter an unknown char
2876                (and we have not encountered an accepting state) we scan forward
2877                until we find a legal starting char.
2878                AC matching is basically that of trie matching, except that when
2879                we encounter a failing transition, we fall back to the current
2880                states "fail state", and try the current char again, a process
2881                we repeat until we reach the root state, state 1, or a legal
2882                transition. If we fail on the root state then we can either
2883                terminate if we have reached an accepting state previously, or
2884                restart the entire process from the beginning if we have not.
2885
2886              */
2887             while (s <= last_start) {
2888                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2889                 U8 *uc = (U8*)s;
2890                 U16 charid = 0;
2891                 U32 base = 1;
2892                 U32 state = 1;
2893                 UV uvc = 0;
2894                 STRLEN len = 0;
2895                 STRLEN foldlen = 0;
2896                 U8 *uscan = (U8*)NULL;
2897                 U8 *leftmost = NULL;
2898 #ifdef DEBUGGING
2899                 U32 accepted_word= 0;
2900 #endif
2901                 U32 pointpos = 0;
2902
2903                 while ( state && uc <= (U8*)strend ) {
2904                     int failed=0;
2905                     U32 word = aho->states[ state ].wordnum;
2906
2907                     if( state==1 ) {
2908                         if ( bitmap ) {
2909                             DEBUG_TRIE_EXECUTE_r(
2910                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2911                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2912                                         (char *)uc, utf8_target, 0 );
2913                                     Perl_re_printf( aTHX_
2914                                         " Scanning for legal start char...\n");
2915                                 }
2916                             );
2917                             if (utf8_target) {
2918                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2919                                     uc += UTF8SKIP(uc);
2920                                 }
2921                             } else {
2922                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2923                                     uc++;
2924                                 }
2925                             }
2926                             s= (char *)uc;
2927                         }
2928                         if (uc >(U8*)last_start) break;
2929                     }
2930
2931                     if ( word ) {
2932                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2933                         if (!leftmost || lpos < leftmost) {
2934                             DEBUG_r(accepted_word=word);
2935                             leftmost= lpos;
2936                         }
2937                         if (base==0) break;
2938
2939                     }
2940                     points[pointpos++ % maxlen]= uc;
2941                     if (foldlen || uc < (U8*)strend) {
2942                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2943                                              (U8 *) strend, uscan, len, uvc,
2944                                              charid, foldlen, foldbuf,
2945                                              uniflags);
2946                         DEBUG_TRIE_EXECUTE_r({
2947                             dump_exec_pos( (char *)uc, c, strend,
2948                                         real_start, s, utf8_target, 0);
2949                             Perl_re_printf( aTHX_
2950                                 " Charid:%3u CP:%4" UVxf " ",
2951                                  charid, uvc);
2952                         });
2953                     }
2954                     else {
2955                         len = 0;
2956                         charid = 0;
2957                     }
2958
2959
2960                     do {
2961 #ifdef DEBUGGING
2962                         word = aho->states[ state ].wordnum;
2963 #endif
2964                         base = aho->states[ state ].trans.base;
2965
2966                         DEBUG_TRIE_EXECUTE_r({
2967                             if (failed)
2968                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2969                                     s,   utf8_target, 0 );
2970                             Perl_re_printf( aTHX_
2971                                 "%sState: %4" UVxf ", word=%" UVxf,
2972                                 failed ? " Fail transition to " : "",
2973                                 (UV)state, (UV)word);
2974                         });
2975                         if ( base ) {
2976                             U32 tmp;
2977                             I32 offset;
2978                             if (charid &&
2979                                  ( ((offset = base + charid
2980                                     - 1 - trie->uniquecharcount)) >= 0)
2981                                  && ((U32)offset < trie->lasttrans)
2982                                  && trie->trans[offset].check == state
2983                                  && (tmp=trie->trans[offset].next))
2984                             {
2985                                 DEBUG_TRIE_EXECUTE_r(
2986                                     Perl_re_printf( aTHX_ " - legal\n"));
2987                                 state = tmp;
2988                                 break;
2989                             }
2990                             else {
2991                                 DEBUG_TRIE_EXECUTE_r(
2992                                     Perl_re_printf( aTHX_ " - fail\n"));
2993                                 failed = 1;
2994                                 state = aho->fail[state];
2995                             }
2996                         }
2997                         else {
2998                             /* we must be accepting here */
2999                             DEBUG_TRIE_EXECUTE_r(
3000                                     Perl_re_printf( aTHX_ " - accepting\n"));
3001                             failed = 1;
3002                             break;
3003                         }
3004                     } while(state);
3005                     uc += len;
3006                     if (failed) {
3007                         if (leftmost)
3008                             break;
3009                         if (!state) state = 1;
3010                     }
3011                 }
3012                 if ( aho->states[ state ].wordnum ) {
3013                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3014                     if (!leftmost || lpos < leftmost) {
3015                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3016                         leftmost = lpos;
3017                     }
3018                 }
3019                 if (leftmost) {
3020                     s = (char*)leftmost;
3021                     DEBUG_TRIE_EXECUTE_r({
3022                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3023                             (UV)accepted_word, (IV)(s - real_start)
3024                         );
3025                     });
3026                     if (reginfo->intuit || regtry(reginfo, &s)) {
3027                         FREETMPS;
3028                         LEAVE;
3029                         goto got_it;
3030                     }
3031                     s = HOPc(s,1);
3032                     DEBUG_TRIE_EXECUTE_r({
3033                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3034                     });
3035                 } else {
3036                     DEBUG_TRIE_EXECUTE_r(
3037                         Perl_re_printf( aTHX_ "No match.\n"));
3038                     break;
3039                 }
3040             }
3041             FREETMPS;
3042             LEAVE;
3043         }
3044         break;
3045     default:
3046         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3047     }
3048     return 0;
3049   got_it:
3050     return s;
3051 }
3052
3053 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3054  * flags have same meanings as with regexec_flags() */
3055
3056 static void
3057 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3058                             char *strbeg,
3059                             char *strend,
3060                             SV *sv,
3061                             U32 flags,
3062                             bool utf8_target)
3063 {
3064     struct regexp *const prog = ReANY(rx);
3065
3066     if (flags & REXEC_COPY_STR) {
3067 #ifdef PERL_ANY_COW
3068         if (SvCANCOW(sv)) {
3069             DEBUG_C(Perl_re_printf( aTHX_
3070                               "Copy on write: regexp capture, type %d\n",
3071                                     (int) SvTYPE(sv)));
3072             /* Create a new COW SV to share the match string and store
3073              * in saved_copy, unless the current COW SV in saved_copy
3074              * is valid and suitable for our purpose */
3075             if ((   prog->saved_copy
3076                  && SvIsCOW(prog->saved_copy)
3077                  && SvPOKp(prog->saved_copy)
3078                  && SvIsCOW(sv)
3079                  && SvPOKp(sv)
3080                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3081             {
3082                 /* just reuse saved_copy SV */
3083                 if (RXp_MATCH_COPIED(prog)) {
3084                     Safefree(prog->subbeg);
3085                     RXp_MATCH_COPIED_off(prog);
3086                 }
3087             }
3088             else {
3089                 /* create new COW SV to share string */
3090                 RXp_MATCH_COPY_FREE(prog);
3091                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3092             }
3093             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3094             assert (SvPOKp(prog->saved_copy));
3095             prog->sublen  = strend - strbeg;
3096             prog->suboffset = 0;
3097             prog->subcoffset = 0;
3098         } else
3099 #endif
3100         {
3101             SSize_t min = 0;
3102             SSize_t max = strend - strbeg;
3103             SSize_t sublen;
3104
3105             if (    (flags & REXEC_COPY_SKIP_POST)
3106                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3107                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3108             ) { /* don't copy $' part of string */
3109                 U32 n = 0;
3110                 max = -1;
3111                 /* calculate the right-most part of the string covered
3112                  * by a capture. Due to lookahead, this may be to
3113                  * the right of $&, so we have to scan all captures */
3114                 while (n <= prog->lastparen) {
3115                     if (prog->offs[n].end > max)
3116                         max = prog->offs[n].end;
3117                     n++;
3118                 }
3119                 if (max == -1)
3120                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3121                             ? prog->offs[0].start
3122                             : 0;
3123                 assert(max >= 0 && max <= strend - strbeg);
3124             }
3125
3126             if (    (flags & REXEC_COPY_SKIP_PRE)
3127                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3128                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3129             ) { /* don't copy $` part of string */
3130                 U32 n = 0;
3131                 min = max;
3132                 /* calculate the left-most part of the string covered
3133                  * by a capture. Due to lookbehind, this may be to
3134                  * the left of $&, so we have to scan all captures */
3135                 while (min && n <= prog->lastparen) {
3136                     if (   prog->offs[n].start != -1
3137                         && prog->offs[n].start < min)
3138                     {
3139                         min = prog->offs[n].start;
3140                     }
3141                     n++;
3142                 }
3143                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3144                     && min >  prog->offs[0].end
3145                 )
3146                     min = prog->offs[0].end;
3147
3148             }
3149
3150             assert(min >= 0 && min <= max && min <= strend - strbeg);
3151             sublen = max - min;
3152
3153             if (RXp_MATCH_COPIED(prog)) {
3154                 if (sublen > prog->sublen)
3155                     prog->subbeg =
3156                             (char*)saferealloc(prog->subbeg, sublen+1);
3157             }
3158             else
3159                 prog->subbeg = (char*)safemalloc(sublen+1);
3160             Copy(strbeg + min, prog->subbeg, sublen, char);
3161             prog->subbeg[sublen] = '\0';
3162             prog->suboffset = min;
3163             prog->sublen = sublen;
3164             RXp_MATCH_COPIED_on(prog);
3165         }
3166         prog->subcoffset = prog->suboffset;
3167         if (prog->suboffset && utf8_target) {
3168             /* Convert byte offset to chars.
3169              * XXX ideally should only compute this if @-/@+
3170              * has been seen, a la PL_sawampersand ??? */
3171
3172             /* If there's a direct correspondence between the
3173              * string which we're matching and the original SV,
3174              * then we can use the utf8 len cache associated with
3175              * the SV. In particular, it means that under //g,
3176              * sv_pos_b2u() will use the previously cached
3177              * position to speed up working out the new length of
3178              * subcoffset, rather than counting from the start of
3179              * the string each time. This stops
3180              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3181              * from going quadratic */
3182             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3183                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3184                                                 SV_GMAGIC|SV_CONST_RETURN);
3185             else
3186                 prog->subcoffset = utf8_length((U8*)strbeg,
3187                                     (U8*)(strbeg+prog->suboffset));
3188         }
3189     }
3190     else {
3191         RXp_MATCH_COPY_FREE(prog);
3192         prog->subbeg = strbeg;
3193         prog->suboffset = 0;
3194         prog->subcoffset = 0;
3195         prog->sublen = strend - strbeg;
3196     }
3197 }
3198
3199
3200
3201
3202 /*
3203  - regexec_flags - match a regexp against a string
3204  */
3205 I32
3206 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3207               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3208 /* stringarg: the point in the string at which to begin matching */
3209 /* strend:    pointer to null at end of string */
3210 /* strbeg:    real beginning of string */
3211 /* minend:    end of match must be >= minend bytes after stringarg. */
3212 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3213  *            itself is accessed via the pointers above */
3214 /* data:      May be used for some additional optimizations.
3215               Currently unused. */
3216 /* flags:     For optimizations. See REXEC_* in regexp.h */
3217
3218 {
3219     struct regexp *const prog = ReANY(rx);
3220     char *s;
3221     regnode *c;
3222     char *startpos;
3223     SSize_t minlen;             /* must match at least this many chars */
3224     SSize_t dontbother = 0;     /* how many characters not to try at end */
3225     const bool utf8_target = cBOOL(DO_UTF8(sv));
3226     I32 multiline;
3227     RXi_GET_DECL(prog,progi);
3228     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3229     regmatch_info *const reginfo = &reginfo_buf;
3230     regexp_paren_pair *swap = NULL;
3231     I32 oldsave;
3232     GET_RE_DEBUG_FLAGS_DECL;
3233
3234     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3235     PERL_UNUSED_ARG(data);
3236
3237     /* Be paranoid... */
3238     if (prog == NULL) {
3239         Perl_croak(aTHX_ "NULL regexp parameter");
3240     }
3241
3242     DEBUG_EXECUTE_r(
3243         debug_start_match(rx, utf8_target, stringarg, strend,
3244         "Matching");
3245     );
3246
3247     startpos = stringarg;
3248
3249     /* set these early as they may be used by the HOP macros below */
3250     reginfo->strbeg = strbeg;
3251     reginfo->strend = strend;
3252     reginfo->is_utf8_target = cBOOL(utf8_target);
3253
3254     if (prog->intflags & PREGf_GPOS_SEEN) {
3255         MAGIC *mg;
3256
3257         /* set reginfo->ganch, the position where \G can match */
3258
3259         reginfo->ganch =
3260             (flags & REXEC_IGNOREPOS)
3261             ? stringarg /* use start pos rather than pos() */
3262             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3263               /* Defined pos(): */
3264             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3265             : strbeg; /* pos() not defined; use start of string */
3266
3267         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3268             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3269
3270         /* in the presence of \G, we may need to start looking earlier in
3271          * the string than the suggested start point of stringarg:
3272          * if prog->gofs is set, then that's a known, fixed minimum
3273          * offset, such as
3274          * /..\G/:   gofs = 2
3275          * /ab|c\G/: gofs = 1
3276          * or if the minimum offset isn't known, then we have to go back
3277          * to the start of the string, e.g. /w+\G/
3278          */
3279
3280         if (prog->intflags & PREGf_ANCH_GPOS) {
3281             if (prog->gofs) {
3282                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3283                 if (!startpos ||
3284                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3285                 {
3286                     DEBUG_r(Perl_re_printf( aTHX_
3287                             "fail: ganch-gofs before earliest possible start\n"));
3288                     return 0;
3289                 }
3290             }
3291             else
3292                 startpos = reginfo->ganch;
3293         }
3294         else if (prog->gofs) {
3295             startpos = HOPBACKc(startpos, prog->gofs);
3296             if (!startpos)
3297                 startpos = strbeg;
3298         }
3299         else if (prog->intflags & PREGf_GPOS_FLOAT)
3300             startpos = strbeg;
3301     }
3302
3303     minlen = prog->minlen;
3304     if ((startpos + minlen) > strend || startpos < strbeg) {
3305         DEBUG_r(Perl_re_printf( aTHX_
3306                     "Regex match can't succeed, so not even tried\n"));
3307         return 0;
3308     }
3309
3310     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3311      * which will call destuctors to reset PL_regmatch_state, free higher
3312      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3313      * regmatch_info_aux_eval */
3314
3315     oldsave = PL_savestack_ix;
3316
3317     s = startpos;
3318
3319     if ((prog->extflags & RXf_USE_INTUIT)
3320         && !(flags & REXEC_CHECKED))
3321     {
3322         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3323                                     flags, NULL);
3324         if (!s)
3325             return 0;
3326
3327         if (prog->extflags & RXf_CHECK_ALL) {
3328             /* we can match based purely on the result of INTUIT.
3329              * Set up captures etc just for $& and $-[0]
3330              * (an intuit-only match wont have $1,$2,..) */
3331             assert(!prog->nparens);
3332
3333             /* s/// doesn't like it if $& is earlier than where we asked it to
3334              * start searching (which can happen on something like /.\G/) */
3335             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3336                     && (s < stringarg))
3337             {
3338                 /* this should only be possible under \G */
3339                 assert(prog->intflags & PREGf_GPOS_SEEN);
3340                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3341                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3342                 goto phooey;
3343             }
3344
3345             /* match via INTUIT shouldn't have any captures.
3346              * Let @-, @+, $^N know */
3347             prog->lastparen = prog->lastcloseparen = 0;
3348             RXp_MATCH_UTF8_set(prog, utf8_target);
3349             prog->offs[0].start = s - strbeg;
3350             prog->offs[0].end = utf8_target
3351                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3352                 : s - strbeg + prog->minlenret;
3353             if ( !(flags & REXEC_NOT_FIRST) )
3354                 S_reg_set_capture_string(aTHX_ rx,
3355                                         strbeg, strend,
3356                                         sv, flags, utf8_target);
3357
3358             return 1;
3359         }
3360     }
3361
3362     multiline = prog->extflags & RXf_PMf_MULTILINE;
3363
3364     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3365         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3366                               "String too short [regexec_flags]...\n"));
3367         goto phooey;
3368     }
3369
3370     /* Check validity of program. */
3371     if (UCHARAT(progi->program) != REG_MAGIC) {
3372         Perl_croak(aTHX_ "corrupted regexp program");
3373     }
3374
3375     RXp_MATCH_TAINTED_off(prog);
3376     RXp_MATCH_UTF8_set(prog, utf8_target);
3377
3378     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3379     reginfo->intuit = 0;
3380     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3381     reginfo->warned = FALSE;
3382     reginfo->sv = sv;
3383     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3384     /* see how far we have to get to not match where we matched before */
3385     reginfo->till = stringarg + minend;
3386
3387     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3388         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3389            S_cleanup_regmatch_info_aux has executed (registered by
3390            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3391            magic belonging to this SV.
3392            Not newSVsv, either, as it does not COW.
3393         */
3394         reginfo->sv = newSV(0);
3395         SvSetSV_nosteal(reginfo->sv, sv);
3396         SAVEFREESV(reginfo->sv);
3397     }
3398
3399     /* reserve next 2 or 3 slots in PL_regmatch_state:
3400      * slot N+0: may currently be in use: skip it
3401      * slot N+1: use for regmatch_info_aux struct
3402      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3403      * slot N+3: ready for use by regmatch()
3404      */
3405
3406     {
3407         regmatch_state *old_regmatch_state;
3408         regmatch_slab  *old_regmatch_slab;
3409         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3410
3411         /* on first ever match, allocate first slab */
3412         if (!PL_regmatch_slab) {
3413             Newx(PL_regmatch_slab, 1, regmatch_slab);
3414             PL_regmatch_slab->prev = NULL;
3415             PL_regmatch_slab->next = NULL;
3416             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3417         }
3418
3419         old_regmatch_state = PL_regmatch_state;
3420         old_regmatch_slab  = PL_regmatch_slab;
3421
3422         for (i=0; i <= max; i++) {
3423             if (i == 1)
3424                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3425             else if (i ==2)
3426                 reginfo->info_aux_eval =
3427                 reginfo->info_aux->info_aux_eval =
3428                             &(PL_regmatch_state->u.info_aux_eval);
3429
3430             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3431                 PL_regmatch_state = S_push_slab(aTHX);
3432         }
3433
3434         /* note initial PL_regmatch_state position; at end of match we'll
3435          * pop back to there and free any higher slabs */
3436
3437         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3438         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3439         reginfo->info_aux->poscache = NULL;
3440
3441         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3442
3443         if ((prog->extflags & RXf_EVAL_SEEN))
3444             S_setup_eval_state(aTHX_ reginfo);
3445         else
3446             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3447     }
3448
3449     /* If there is a "must appear" string, look for it. */
3450
3451     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3452         /* We have to be careful. If the previous successful match
3453            was from this regex we don't want a subsequent partially
3454            successful match to clobber the old results.
3455            So when we detect this possibility we add a swap buffer
3456            to the re, and switch the buffer each match. If we fail,
3457            we switch it back; otherwise we leave it swapped.
3458         */
3459         swap = prog->offs;
3460         /* avoid leak if we die, or clean up anyway if match completes */
3461         SAVEFREEPV(swap);
3462         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3463         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3464             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3465             0,
3466             PTR2UV(prog),
3467             PTR2UV(swap),
3468             PTR2UV(prog->offs)
3469         ));
3470     }
3471
3472     if (prog->recurse_locinput)
3473         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3474
3475     /* Simplest case: anchored match need be tried only once, or with
3476      * MBOL, only at the beginning of each line.
3477      *
3478      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3479      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3480      * match at the start of the string then it won't match anywhere else
3481      * either; while with /.*.../, if it doesn't match at the beginning,
3482      * the earliest it could match is at the start of the next line */
3483
3484     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3485         char *end;
3486
3487         if (regtry(reginfo, &s))
3488             goto got_it;
3489
3490         if (!(prog->intflags & PREGf_ANCH_MBOL))
3491             goto phooey;
3492
3493         /* didn't match at start, try at other newline positions */
3494
3495         if (minlen)
3496             dontbother = minlen - 1;
3497         end = HOP3c(strend, -dontbother, strbeg) - 1;
3498
3499         /* skip to next newline */
3500
3501         while (s <= end) { /* note it could be possible to match at the end of the string */
3502             /* NB: newlines are the same in unicode as they are in latin */
3503             if (*s++ != '\n')
3504                 continue;
3505             if (prog->check_substr || prog->check_utf8) {
3506             /* note that with PREGf_IMPLICIT, intuit can only fail
3507              * or return the start position, so it's of limited utility.
3508              * Nevertheless, I made the decision that the potential for
3509              * quick fail was still worth it - DAPM */
3510                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3511                 if (!s)
3512                     goto phooey;
3513             }
3514             if (regtry(reginfo, &s))
3515                 goto got_it;
3516         }
3517         goto phooey;
3518     } /* end anchored search */
3519
3520     if (prog->intflags & PREGf_ANCH_GPOS)
3521     {
3522         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3523         assert(prog->intflags & PREGf_GPOS_SEEN);
3524         /* For anchored \G, the only position it can match from is
3525          * (ganch-gofs); we already set startpos to this above; if intuit
3526          * moved us on from there, we can't possibly succeed */
3527         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3528         if (s == startpos && regtry(reginfo, &s))
3529             goto got_it;
3530         goto phooey;
3531     }
3532
3533     /* Messy cases:  unanchored match. */
3534     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3535         /* we have /x+whatever/ */
3536         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3537         char ch;
3538 #ifdef DEBUGGING
3539         int did_match = 0;
3540 #endif
3541         if (utf8_target) {
3542             if (! prog->anchored_utf8) {
3543                 to_utf8_substr(prog);
3544             }
3545             ch = SvPVX_const(prog->anchored_utf8)[0];
3546             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3547                 if (*s == ch) {
3548                     DEBUG_EXECUTE_r( did_match = 1 );
3549                     if (regtry(reginfo, &s)) goto got_it;
3550                     s += UTF8SKIP(s);
3551                     while (s < strend && *s == ch)
3552                         s += UTF8SKIP(s);
3553                 }
3554             );
3555
3556         }
3557         else {
3558             if (! prog->anchored_substr) {
3559                 if (! to_byte_substr(prog)) {
3560                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3561                 }
3562             }
3563             ch = SvPVX_const(prog->anchored_substr)[0];
3564             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3565                 if (*s == ch) {
3566                     DEBUG_EXECUTE_r( did_match = 1 );
3567                     if (regtry(reginfo, &s)) goto got_it;
3568                     s++;
3569                     while (s < strend && *s == ch)
3570                         s++;
3571                 }
3572             );
3573         }
3574         DEBUG_EXECUTE_r(if (!did_match)
3575                 Perl_re_printf( aTHX_
3576                                   "Did not find anchored character...\n")
3577                );
3578     }
3579     else if (prog->anchored_substr != NULL
3580               || prog->anchored_utf8 != NULL
3581               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3582                   && prog->float_max_offset < strend - s)) {
3583         SV *must;
3584         SSize_t back_max;
3585         SSize_t back_min;
3586         char *last;
3587         char *last1;            /* Last position checked before */
3588 #ifdef DEBUGGING
3589         int did_match = 0;
3590 #endif
3591         if (prog->anchored_substr || prog->anchored_utf8) {
3592             if (utf8_target) {
3593                 if (! prog->anchored_utf8) {
3594                     to_utf8_substr(prog);
3595                 }
3596                 must = prog->anchored_utf8;
3597             }
3598             else {
3599                 if (! prog->anchored_substr) {
3600                     if (! to_byte_substr(prog)) {
3601                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3602                     }
3603                 }
3604                 must = prog->anchored_substr;
3605             }
3606             back_max = back_min = prog->anchored_offset;
3607         } else {
3608             if (utf8_target) {
3609                 if (! prog->float_utf8) {
3610                     to_utf8_substr(prog);
3611                 }
3612                 must = prog->float_utf8;
3613             }
3614             else {
3615                 if (! prog->float_substr) {
3616                     if (! to_byte_substr(prog)) {
3617                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3618                     }
3619                 }
3620                 must = prog->float_substr;
3621             }
3622             back_max = prog->float_max_offset;
3623             back_min = prog->float_min_offset;
3624         }
3625
3626         if (back_min<0) {
3627             last = strend;
3628         } else {
3629             last = HOP3c(strend,        /* Cannot start after this */
3630                   -(SSize_t)(CHR_SVLEN(must)
3631                          - (SvTAIL(must) != 0) + back_min), strbeg);
3632         }
3633         if (s > reginfo->strbeg)
3634             last1 = HOPc(s, -1);
3635         else
3636             last1 = s - 1;      /* bogus */
3637
3638         /* XXXX check_substr already used to find "s", can optimize if
3639            check_substr==must. */
3640         dontbother = 0;
3641         strend = HOPc(strend, -dontbother);
3642         while ( (s <= last) &&
3643                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3644                                   (unsigned char*)strend, must,
3645                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3646             DEBUG_EXECUTE_r( did_match = 1 );
3647             if (HOPc(s, -back_max) > last1) {
3648                 last1 = HOPc(s, -back_min);
3649                 s = HOPc(s, -back_max);
3650             }
3651             else {
3652                 char * const t = (last1 >= reginfo->strbeg)
3653                                     ? HOPc(last1, 1) : last1 + 1;
3654
3655                 last1 = HOPc(s, -back_min);
3656                 s = t;
3657             }
3658             if (utf8_target) {
3659                 while (s <= last1) {
3660                     if (regtry(reginfo, &s))
3661                         goto got_it;
3662                     if (s >= last1) {
3663                         s++; /* to break out of outer loop */
3664                         break;
3665                     }
3666                     s += UTF8SKIP(s);
3667                 }
3668             }
3669             else {
3670                 while (s <= last1) {
3671                     if (regtry(reginfo, &s))
3672                         goto got_it;
3673                     s++;
3674                 }
3675             }
3676         }
3677         DEBUG_EXECUTE_r(if (!did_match) {
3678             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3679                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3680             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3681                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3682                                ? "anchored" : "floating"),
3683                 quoted, RE_SV_TAIL(must));
3684         });
3685         goto phooey;
3686     }
3687     else if ( (c = progi->regstclass) ) {
3688         if (minlen) {
3689             const OPCODE op = OP(progi->regstclass);
3690             /* don't bother with what can't match */
3691             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3692                 strend = HOPc(strend, -(minlen - 1));
3693         }
3694         DEBUG_EXECUTE_r({
3695             SV * const prop = sv_newmortal();
3696             regprop(prog, prop, c, reginfo, NULL);
3697             {
3698                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3699                     s,strend-s,PL_dump_re_max_len);
3700                 Perl_re_printf( aTHX_
3701                     "Matching stclass %.*s against %s (%d bytes)\n",
3702                     (int)SvCUR(prop), SvPVX_const(prop),
3703                      quoted, (int)(strend - s));
3704             }
3705         });
3706         if (find_byclass(prog, c, s, strend, reginfo))
3707             goto got_it;
3708         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3709     }
3710     else {
3711         dontbother = 0;
3712         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3713             /* Trim the end. */
3714             char *last= NULL;
3715             SV* float_real;
3716             STRLEN len;
3717             const char *little;
3718
3719             if (utf8_target) {
3720                 if (! prog->float_utf8) {
3721                     to_utf8_substr(prog);
3722                 }
3723                 float_real = prog->float_utf8;
3724             }
3725             else {
3726                 if (! prog->float_substr) {
3727                     if (! to_byte_substr(prog)) {
3728                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3729                     }
3730                 }
3731                 float_real = prog->float_substr;
3732             }
3733
3734             little = SvPV_const(float_real, len);
3735             if (SvTAIL(float_real)) {
3736                     /* This means that float_real contains an artificial \n on
3737                      * the end due to the presence of something like this:
3738                      * /foo$/ where we can match both "foo" and "foo\n" at the
3739                      * end of the string.  So we have to compare the end of the
3740                      * string first against the float_real without the \n and
3741                      * then against the full float_real with the string.  We
3742                      * have to watch out for cases where the string might be
3743                      * smaller than the float_real or the float_real without
3744                      * the \n. */
3745                     char *checkpos= strend - len;
3746                     DEBUG_OPTIMISE_r(
3747                         Perl_re_printf( aTHX_
3748                             "%sChecking for float_real.%s\n",
3749                             PL_colors[4], PL_colors[5]));
3750                     if (checkpos + 1 < strbeg) {
3751                         /* can't match, even if we remove the trailing \n
3752                          * string is too short to match */
3753                         DEBUG_EXECUTE_r(
3754                             Perl_re_printf( aTHX_
3755                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3756                                 PL_colors[4], PL_colors[5]));
3757                         goto phooey;
3758                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3759                         /* can match, the end of the string matches without the
3760                          * "\n" */
3761                         last = checkpos + 1;
3762                     } else if (checkpos < strbeg) {
3763                         /* cant match, string is too short when the "\n" is
3764                          * included */
3765                         DEBUG_EXECUTE_r(
3766                             Perl_re_printf( aTHX_
3767                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3768                                 PL_colors[4], PL_colors[5]));
3769                         goto phooey;
3770                     } else if (!multiline) {
3771                         /* non multiline match, so compare with the "\n" at the
3772                          * end of the string */
3773                         if (memEQ(checkpos, little, len)) {
3774                             last= checkpos;
3775                         } else {
3776                             DEBUG_EXECUTE_r(
3777                                 Perl_re_printf( aTHX_
3778                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3779                                     PL_colors[4], PL_colors[5]));
3780                             goto phooey;
3781                         }
3782                     } else {
3783                         /* multiline match, so we have to search for a place
3784                          * where the full string is located */
3785                         goto find_last;
3786                     }
3787             } else {
3788                   find_last:
3789                     if (len)
3790                         last = rninstr(s, strend, little, little + len);
3791                     else
3792                         last = strend;  /* matching "$" */
3793             }
3794             if (!last) {
3795                 /* at one point this block contained a comment which was
3796                  * probably incorrect, which said that this was a "should not
3797                  * happen" case.  Even if it was true when it was written I am
3798                  * pretty sure it is not anymore, so I have removed the comment
3799                  * and replaced it with this one. Yves */
3800                 DEBUG_EXECUTE_r(
3801                     Perl_re_printf( aTHX_
3802                         "%sString does not contain required substring, cannot match.%s\n",
3803                         PL_colors[4], PL_colors[5]
3804                     ));
3805                 goto phooey;
3806             }
3807             dontbother = strend - last + prog->float_min_offset;
3808         }
3809         if (minlen && (dontbother < minlen))
3810             dontbother = minlen - 1;
3811         strend -= dontbother;              /* this one's always in bytes! */
3812         /* We don't know much -- general case. */
3813         if (utf8_target) {
3814             for (;;) {
3815                 if (regtry(reginfo, &s))
3816                     goto got_it;
3817                 if (s >= strend)
3818                     break;
3819                 s += UTF8SKIP(s);
3820             };
3821         }
3822         else {
3823             do {
3824                 if (regtry(reginfo, &s))
3825                     goto got_it;
3826             } while (s++ < strend);
3827         }
3828     }
3829
3830     /* Failure. */
3831     goto phooey;
3832
3833   got_it:
3834     /* s/// doesn't like it if $& is earlier than where we asked it to
3835      * start searching (which can happen on something like /.\G/) */
3836     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3837             && (prog->offs[0].start < stringarg - strbeg))
3838     {
3839         /* this should only be possible under \G */
3840         assert(prog->intflags & PREGf_GPOS_SEEN);
3841         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3842             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3843         goto phooey;
3844     }
3845
3846     /* clean up; this will trigger destructors that will free all slabs
3847      * above the current one, and cleanup the regmatch_info_aux
3848      * and regmatch_info_aux_eval sructs */
3849
3850     LEAVE_SCOPE(oldsave);
3851
3852     if (RXp_PAREN_NAMES(prog))
3853         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3854
3855     /* make sure $`, $&, $', and $digit will work later */
3856     if ( !(flags & REXEC_NOT_FIRST) )
3857         S_reg_set_capture_string(aTHX_ rx,
3858                                     strbeg, reginfo->strend,
3859                                     sv, flags, utf8_target);
3860
3861     return 1;
3862
3863   phooey:
3864     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3865                           PL_colors[4], PL_colors[5]));
3866
3867     if (swap) {
3868         /* we failed :-( roll it back.
3869          * Since the swap buffer will be freed on scope exit which follows
3870          * shortly, restore the old captures by copying 'swap's original
3871          * data to the new offs buffer
3872          */
3873         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3874             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
3875             0,
3876             PTR2UV(prog),
3877             PTR2UV(prog->offs),
3878             PTR2UV(swap)
3879         ));
3880
3881         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
3882     }
3883
3884     /* clean up; this will trigger destructors that will free all slabs
3885      * above the current one, and cleanup the regmatch_info_aux
3886      * and regmatch_info_aux_eval sructs */
3887
3888     LEAVE_SCOPE(oldsave);
3889
3890     return 0;
3891 }
3892
3893
3894 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3895  * Do inc before dec, in case old and new rex are the same */
3896 #define SET_reg_curpm(Re2)                          \
3897     if (reginfo->info_aux_eval) {                   \
3898         (void)ReREFCNT_inc(Re2);                    \
3899         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3900         PM_SETRE((PL_reg_curpm), (Re2));            \
3901     }
3902
3903
3904 /*
3905  - regtry - try match at specific point
3906  */
3907 STATIC bool                     /* 0 failure, 1 success */
3908 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3909 {
3910     CHECKPOINT lastcp;
3911     REGEXP *const rx = reginfo->prog;
3912     regexp *const prog = ReANY(rx);
3913     SSize_t result;
3914 #ifdef DEBUGGING
3915     U32 depth = 0; /* used by REGCP_SET */
3916 #endif
3917     RXi_GET_DECL(prog,progi);
3918     GET_RE_DEBUG_FLAGS_DECL;
3919
3920     PERL_ARGS_ASSERT_REGTRY;
3921
3922     reginfo->cutpoint=NULL;
3923
3924     prog->offs[0].start = *startposp - reginfo->strbeg;
3925     prog->lastparen = 0;
3926     prog->lastcloseparen = 0;
3927
3928     /* XXXX What this code is doing here?!!!  There should be no need
3929        to do this again and again, prog->lastparen should take care of
3930        this!  --ilya*/
3931
3932     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3933      * Actually, the code in regcppop() (which Ilya may be meaning by
3934      * prog->lastparen), is not needed at all by the test suite
3935      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3936      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3937      * Meanwhile, this code *is* needed for the
3938      * above-mentioned test suite tests to succeed.  The common theme
3939      * on those tests seems to be returning null fields from matches.
3940      * --jhi updated by dapm */
3941
3942     /* After encountering a variant of the issue mentioned above I think
3943      * the point Ilya was making is that if we properly unwind whenever
3944      * we set lastparen to a smaller value then we should not need to do
3945      * this every time, only when needed. So if we have tests that fail if
3946      * we remove this, then it suggests somewhere else we are improperly
3947      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3948      * places it is called, and related regcp() routines. - Yves */
3949 #if 1
3950     if (prog->nparens) {
3951         regexp_paren_pair *pp = prog->offs;
3952         I32 i;
3953         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3954             ++pp;
3955             pp->start = -1;
3956             pp->end = -1;
3957         }
3958     }
3959 #endif
3960     REGCP_SET(lastcp);
3961     result = regmatch(reginfo, *startposp, progi->program + 1);
3962     if (result != -1) {
3963         prog->offs[0].end = result;
3964         return 1;
3965     }
3966     if (reginfo->cutpoint)
3967         *startposp= reginfo->cutpoint;
3968     REGCP_UNWIND(lastcp);
3969     return 0;
3970 }
3971
3972 /* this is used to determine how far from the left messages like
3973    'failed...' are printed in regexec.c. It should be set such that
3974    messages are inline with the regop output that created them.
3975 */
3976 #define REPORT_CODE_OFF 29
3977 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3978 #ifdef DEBUGGING
3979 int
3980 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3981 {
3982     va_list ap;
3983     int result;
3984     PerlIO *f= Perl_debug_log;
3985     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3986     va_start(ap, depth);
3987     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3988     result = PerlIO_vprintf(f, fmt, ap);
3989     va_end(ap);
3990     return result;
3991 }
3992 #endif /* DEBUGGING */
3993
3994 /* grab a new slab and return the first slot in it */
3995
3996 STATIC regmatch_state *
3997 S_push_slab(pTHX)
3998 {
3999     regmatch_slab *s = PL_regmatch_slab->next;
4000     if (!s) {
4001         Newx(s, 1, regmatch_slab);
4002         s->prev = PL_regmatch_slab;
4003         s->next = NULL;
4004         PL_regmatch_slab->next = s;
4005     }
4006     PL_regmatch_slab = s;
4007     return SLAB_FIRST(s);
4008 }
4009
4010 #ifdef DEBUGGING
4011
4012 STATIC void
4013 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4014     const char *start, const char *end, const char *blurb)
4015 {
4016     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4017
4018     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4019
4020     if (!PL_colorset)
4021             reginitcolors();
4022     {
4023         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4024             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4025
4026         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4027             start, end - start, PL_dump_re_max_len);
4028
4029         Perl_re_printf( aTHX_
4030             "%s%s REx%s %s against %s\n",
4031                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4032
4033         if (utf8_target||utf8_pat)
4034             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4035                 utf8_pat ? "pattern" : "",
4036                 utf8_pat && utf8_target ? " and " : "",
4037                 utf8_target ? "string" : ""
4038             );
4039     }
4040 }
4041
4042 STATIC void
4043 S_dump_exec_pos(pTHX_ const char *locinput,
4044                       const regnode *scan,
4045                       const char *loc_regeol,
4046                       const char *loc_bostr,
4047                       const char *loc_reg_starttry,
4048                       const bool utf8_target,
4049                       const U32 depth
4050                 )
4051 {
4052     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4053     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4054     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4055     /* The part of the string before starttry has one color
4056        (pref0_len chars), between starttry and current
4057        position another one (pref_len - pref0_len chars),
4058        after the current position the third one.
4059        We assume that pref0_len <= pref_len, otherwise we
4060        decrease pref0_len.  */
4061     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4062         ? (5 + taill) - l : locinput - loc_bostr;
4063     int pref0_len;
4064
4065     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4066
4067     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4068         pref_len++;
4069     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4070     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4071         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4072               ? (5 + taill) - pref_len : loc_regeol - locinput);
4073     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4074         l--;
4075     if (pref0_len < 0)
4076         pref0_len = 0;
4077     if (pref0_len > pref_len)
4078         pref0_len = pref_len;
4079     {
4080         const int is_uni = utf8_target ? 1 : 0;
4081
4082         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4083             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4084
4085         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4086                     (locinput - pref_len + pref0_len),
4087                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4088
4089         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4090                     locinput, loc_regeol - locinput, 10, 0, 1);
4091
4092         const STRLEN tlen=len0+len1+len2;
4093         Perl_re_printf( aTHX_
4094                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4095                     (IV)(locinput - loc_bostr),
4096                     len0, s0,
4097                     len1, s1,
4098                     (docolor ? "" : "> <"),
4099                     len2, s2,
4100                     (int)(tlen > 19 ? 0 :  19 - tlen),
4101                     "",
4102                     depth);
4103     }
4104 }
4105
4106 #endif
4107
4108 /* reg_check_named_buff_matched()
4109  * Checks to see if a named buffer has matched. The data array of
4110  * buffer numbers corresponding to the buffer is expected to reside
4111  * in the regexp->data->data array in the slot stored in the ARG() of
4112  * node involved. Note that this routine doesn't actually care about the
4113  * name, that information is not preserved from compilation to execution.
4114  * Returns the index of the leftmost defined buffer with the given name
4115  * or 0 if non of the buffers matched.
4116  */
4117 STATIC I32
4118 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4119 {
4120     I32 n;
4121     RXi_GET_DECL(rex,rexi);
4122     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4123     I32 *nums=(I32*)SvPVX(sv_dat);
4124
4125     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4126
4127     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4128         if ((I32)rex->lastparen >= nums[n] &&
4129             rex->offs[nums[n]].end != -1)
4130         {
4131             return nums[n];
4132         }
4133     }
4134     return 0;
4135 }
4136
4137 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4138 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4139 #define CHRTEST_NOT_A_CP_1 -999
4140 #define CHRTEST_NOT_A_CP_2 -998
4141
4142 static bool
4143 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4144         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4145 {
4146     /* This function determines if there are zero, one, two, or more characters
4147      * that match the first character of the passed-in EXACTish node
4148      * <text_node>, and if there are one or two, it returns them in the
4149      * passed-in pointers.
4150      *
4151      * If it determines that no possible character in the target string can
4152      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4153      * the first character in <text_node> requires UTF-8 to represent, and the
4154      * target string isn't in UTF-8.)
4155      *
4156      * If there are more than two characters that could match the beginning of
4157      * <text_node>, or if more context is required to determine a match or not,
4158      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4159      *
4160      * The motiviation behind this function is to allow the caller to set up
4161      * tight loops for matching.  If <text_node> is of type EXACT, there is
4162      * only one possible character that can match its first character, and so
4163      * the situation is quite simple.  But things get much more complicated if
4164      * folding is involved.  It may be that the first character of an EXACTFish
4165      * node doesn't participate in any possible fold, e.g., punctuation, so it
4166      * can be matched only by itself.  The vast majority of characters that are
4167      * in folds match just two things, their lower and upper-case equivalents.
4168      * But not all are like that; some have multiple possible matches, or match
4169      * sequences of more than one character.  This function sorts all that out.
4170      *
4171      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4172      * loop of trying to match A*, we know we can't exit where the thing
4173      * following it isn't a B.  And something can't be a B unless it is the
4174      * beginning of B.  By putting a quick test for that beginning in a tight
4175      * loop, we can rule out things that can't possibly be B without having to
4176      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4177      * character, we can make a tight loop matching A*, using the outputs of
4178      * this function.
4179      *
4180      * If the target string to match isn't in UTF-8, and there aren't
4181      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4182      * the one or two possible octets (which are characters in this situation)
4183      * that can match.  In all cases, if there is only one character that can
4184      * match, *<c1p> and *<c2p> will be identical.
4185      *
4186      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4187      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4188      * can match the beginning of <text_node>.  They should be declared with at
4189      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4190      * undefined what these contain.)  If one or both of the buffers are
4191      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4192      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4193      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4194      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4195      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4196
4197     const bool utf8_target = reginfo->is_utf8_target;
4198
4199     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4200     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4201     bool use_chrtest_void = FALSE;
4202     const bool is_utf8_pat = reginfo->is_utf8_pat;
4203
4204     /* Used when we have both utf8 input and utf8 output, to avoid converting
4205      * to/from code points */
4206     bool utf8_has_been_setup = FALSE;
4207
4208     dVAR;
4209
4210     U8 *pat = (U8*)STRING(text_node);
4211     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4212
4213     if (   OP(text_node) == EXACT
4214         || OP(text_node) == EXACT_ONLY8
4215         || OP(text_node) == EXACTL)
4216     {
4217
4218         /* In an exact node, only one thing can be matched, that first
4219          * character.  If both the pat and the target are UTF-8, we can just
4220          * copy the input to the output, avoiding finding the code point of
4221          * that character */
4222         if (!is_utf8_pat) {
4223             assert(OP(text_node) != EXACT_ONLY8);
4224             c2 = c1 = *pat;
4225         }
4226         else if (utf8_target) {
4227             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4228             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4229             utf8_has_been_setup = TRUE;
4230         }
4231         else if (OP(text_node) == EXACT_ONLY8) {
4232             return FALSE;   /* Can only match UTF-8 target */
4233         }
4234         else {
4235             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4236         }
4237     }
4238     else { /* an EXACTFish node */
4239         U8 *pat_end = pat + STR_LEN(text_node);
4240
4241         /* An EXACTFL node has at least some characters unfolded, because what
4242          * they match is not known until now.  So, now is the time to fold
4243          * the first few of them, as many as are needed to determine 'c1' and
4244          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4245          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4246          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4247          * need to fold as many characters as a single character can fold to,
4248          * so that later we can check if the first ones are such a multi-char
4249          * fold.  But, in such a pattern only locale-problematic characters
4250          * aren't folded, so we can skip this completely if the first character
4251          * in the node isn't one of the tricky ones */
4252         if (OP(text_node) == EXACTFL) {
4253
4254             if (! is_utf8_pat) {
4255                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4256                 {
4257                     folded[0] = folded[1] = 's';
4258                     pat = folded;
4259                     pat_end = folded + 2;
4260                 }
4261             }
4262             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4263                 U8 *s = pat;
4264                 U8 *d = folded;
4265                 int i;
4266
4267                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4268                     if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
4269                         *(d++) = (U8) toFOLD_LC(*s);
4270                         s++;
4271                     }
4272                     else {
4273                         STRLEN len;
4274                         _toFOLD_utf8_flags(s,
4275                                            pat_end,
4276                                            d,
4277                                            &len,
4278                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4279                         d += len;
4280                         s += UTF8SKIP(s);
4281                     }
4282                 }
4283
4284                 pat = folded;
4285                 pat_end = d;
4286             }
4287         }
4288
4289         if (    ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4290              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4291         {
4292             /* Multi-character folds require more context to sort out.  Also
4293              * PL_utf8_foldclosures used below doesn't handle them, so have to
4294              * be handled outside this routine */
4295             use_chrtest_void = TRUE;
4296         }
4297         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4298             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4299
4300             if (   UNLIKELY(PL_in_utf8_turkic_locale)
4301                 && OP(text_node) == EXACTFL
4302                 && UNLIKELY(   c1 == 'i' || c1 == 'I'
4303                             || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
4304                             || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
4305             {   /* Hard-coded Turkish locale rules for these 4 characters
4306                    override normal rules */
4307                 if (c1 == 'i') {
4308                     c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4309                 }
4310                 else if (c1 == 'I') {
4311                     c2 = LATIN_SMALL_LETTER_DOTLESS_I;
4312                 }
4313                 else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
4314                     c2 = 'i';
4315                 }
4316                 else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
4317                     c2 = 'I';
4318                 }
4319             }
4320             else if (c1 > 255) {
4321                 const unsigned int * remaining_folds;
4322                 unsigned int first_fold;
4323
4324                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4325                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4326                 Size_t folds_count = _inverse_folds(c1, &first_fold,
4327                                                        &remaining_folds);
4328                 if (folds_count == 0) {
4329                     c2 = c1;    /* there is only a single character that could
4330                                    match */
4331                 }
4332                 else if (folds_count != 1) {
4333                     /* If there aren't exactly two folds to this (itself and
4334                      * another), it is outside the scope of this function */
4335                     use_chrtest_void = TRUE;
4336                 }
4337                 else {  /* There are two.  We already have one, get the other */
4338                     c2 = first_fold;
4339
4340                     /* Folds that cross the 255/256 boundary are forbidden if
4341                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4342                      * ASCIII.  The only other match to c1 is c2, and since c1
4343                      * is above 255, c2 better be as well under these
4344                      * circumstances.  If it isn't, it means the only legal
4345                      * match of c1 is itself. */
4346                     if (    c2 < 256
4347                         && (   (   OP(text_node) == EXACTFL
4348                                 && ! IN_UTF8_CTYPE_LOCALE)
4349                             || ((     OP(text_node) == EXACTFAA
4350                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4351                                 && (isASCII(c1) || isASCII(c2)))))
4352                     {
4353                         c2 = c1;
4354                     }
4355                 }
4356             }
4357             else /* Here, c1 is <= 255 */
4358                 if (   utf8_target
4359                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4360                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4361                     && (   (   OP(text_node) != EXACTFAA
4362                             && OP(text_node) != EXACTFAA_NO_TRIE)
4363                         ||   ! isASCII(c1)))
4364             {
4365                 /* Here, there could be something above Latin1 in the target
4366                  * which folds to this character in the pattern.  All such
4367                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4368                  * than two characters involved in their folds, so are outside
4369                  * the scope of this function */
4370                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4371                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4372                 }
4373                 else {
4374                     use_chrtest_void = TRUE;
4375                 }
4376             }
4377             else { /* Here nothing above Latin1 can fold to the pattern
4378                       character */
4379                 switch (OP(text_node)) {
4380
4381                     case EXACTFL:   /* /l rules */
4382                         c2 = PL_fold_locale[c1];
4383                         break;
4384
4385                     case EXACTF:   /* This node only generated for non-utf8
4386                                     patterns */
4387                         assert(! is_utf8_pat);
4388                         if (! utf8_target) {    /* /d rules */
4389                             c2 = PL_fold[c1];
4390                             break;
4391                         }
4392                         /* FALLTHROUGH */
4393                         /* /u rules for all these.  This happens to work for
4394                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4395                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4396                                                 non-utf8 patterns */
4397                         assert(! is_utf8_pat);
4398                         /* FALLTHROUGH */
4399                     case EXACTFAA:
4400                     case EXACTFUP:
4401                     case EXACTFU:
4402                         c2 = PL_fold_latin1[c1];
4403                         break;
4404                     case EXACTFU_ONLY8:
4405                         return FALSE;
4406                         NOT_REACHED; /* NOTREACHED */
4407
4408                     default:
4409                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4410                         NOT_REACHED; /* NOTREACHED */
4411                 }
4412             }
4413         }
4414     }
4415
4416     /* Here have figured things out.  Set up the returns */
4417     if (use_chrtest_void) {
4418         *c2p = *c1p = CHRTEST_VOID;
4419     }
4420     else if (utf8_target) {
4421         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4422             uvchr_to_utf8(c1_utf8, c1);
4423             uvchr_to_utf8(c2_utf8, c2);
4424         }
4425
4426         /* Invariants are stored in both the utf8 and byte outputs; Use
4427          * negative numbers otherwise for the byte ones.  Make sure that the
4428          * byte ones are the same iff the utf8 ones are the same */
4429         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4430         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4431                 ? *c2_utf8
4432                 : (c1 == c2)
4433                   ? CHRTEST_NOT_A_CP_1
4434                   : CHRTEST_NOT_A_CP_2;
4435     }
4436     else if (c1 > 255) {
4437        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4438                            can represent */
4439            return FALSE;
4440        }
4441
4442        *c1p = *c2p = c2;    /* c2 is the only representable value */
4443     }
4444     else {  /* c1 is representable; see about c2 */
4445        *c1p = c1;
4446        *c2p = (c2 < 256) ? c2 : c1;
4447     }
4448
4449     return TRUE;
4450 }
4451
4452 STATIC bool
4453 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4454 {
4455     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4456      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4457
4458     PERL_ARGS_ASSERT_ISGCB;
4459
4460     switch (GCB_table[before][after]) {
4461         case GCB_BREAKABLE:
4462             return TRUE;
4463
4464         case GCB_NOBREAK:
4465             return FALSE;
4466
4467         case GCB_RI_then_RI:
4468             {
4469                 int RI_count = 1;
4470                 U8 * temp_pos = (U8 *) curpos;
4471
4472                 /* Do not break within emoji flag sequences. That is, do not
4473                  * break between regional indicator (RI) symbols if there is an
4474                  * odd number of RI characters before the break point.
4475                  *  GB12   sot (RI RI)* RI × RI
4476                  *  GB13 [^RI] (RI RI)* RI × RI */
4477
4478                 while (backup_one_GCB(strbeg,
4479                                     &temp_pos,
4480                                     utf8_target) == GCB_Regional_Indicator)
4481                 {
4482                     RI_count++;
4483                 }
4484
4485                 return RI_count % 2 != 1;
4486             }
4487
4488         case GCB_EX_then_EM:
4489
4490             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4491             {
4492                 U8 * temp_pos = (U8 *) curpos;
4493                 GCB_enum prev;
4494
4495                 do {
4496                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4497                 }
4498                 while (prev == GCB_Extend);
4499
4500                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4501             }
4502
4503         case GCB_Maybe_Emoji_NonBreak:
4504
4505             {
4506
4507             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4508               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4509               */
4510                 U8 * temp_pos = (U8 *) curpos;
4511                 GCB_enum prev;
4512
4513                 do {
4514                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4515                 }
4516                 while (prev == GCB_Extend);
4517
4518                 return prev != GCB_XPG_XX;
4519             }
4520
4521         default:
4522             break;
4523     }
4524
4525 #ifdef DEBUGGING
4526     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4527                                   before, after, GCB_table[before][after]);
4528     assert(0);
4529 #endif
4530     return TRUE;
4531 }
4532
4533 STATIC GCB_enum
4534 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4535 {
4536     dVAR;
4537     GCB_enum gcb;
4538
4539     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4540
4541     if (*curpos < strbeg) {
4542         return GCB_EDGE;
4543     }
4544
4545     if (utf8_target) {
4546         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4547         U8 * prev_prev_char_pos;
4548
4549         if (! prev_char_pos) {
4550             return GCB_EDGE;
4551         }
4552
4553         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4554             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4555             *curpos = prev_char_pos;
4556             prev_char_pos = prev_prev_char_pos;
4557         }
4558         else {
4559             *curpos = (U8 *) strbeg;
4560             return GCB_EDGE;
4561         }
4562     }
4563     else {
4564         if (*curpos - 2 < strbeg) {
4565             *curpos = (U8 *) strbeg;
4566             return GCB_EDGE;
4567         }
4568         (*curpos)--;
4569         gcb = getGCB_VAL_CP(*(*curpos - 1));
4570     }
4571
4572     return gcb;
4573 }
4574
4575 /* Combining marks attach to most classes that precede them, but this defines
4576  * the exceptions (from TR14) */
4577 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4578                                      || prev == LB_Mandatory_Break      \
4579                                      || prev == LB_Carriage_Return      \
4580                                      || prev == LB_Line_Feed            \
4581                                      || prev == LB_Next_Line            \
4582                                      || prev == LB_Space                \
4583                                      || prev == LB_ZWSpace))
4584
4585 STATIC bool
4586 S_isLB(pTHX_ LB_enum before,
4587              LB_enum after,
4588              const U8 * const strbeg,
4589              const U8 * const curpos,
4590              const U8 * const strend,
4591              const bool utf8_target)
4592 {
4593     U8 * temp_pos = (U8 *) curpos;
4594     LB_enum prev = before;
4595
4596     /* Is the boundary between 'before' and 'after' line-breakable?
4597      * Most of this is just a table lookup of a generated table from Unicode
4598      * rules.  But some rules require context to decide, and so have to be
4599      * implemented in code */
4600
4601     PERL_ARGS_ASSERT_ISLB;
4602
4603     /* Rule numbers in the comments below are as of Unicode 9.0 */
4604
4605   redo:
4606     before = prev;
4607     switch (LB_table[before][after]) {
4608         case LB_BREAKABLE:
4609             return TRUE;
4610
4611         case LB_NOBREAK:
4612         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4613             return FALSE;
4614
4615         case LB_SP_foo + LB_BREAKABLE:
4616         case LB_SP_foo + LB_NOBREAK:
4617         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4618
4619             /* When we have something following a SP, we have to look at the
4620              * context in order to know what to do.
4621              *
4622              * SP SP should not reach here because LB7: Do not break before
4623              * spaces.  (For two spaces in a row there is nothing that
4624              * overrides that) */
4625             assert(after != LB_Space);
4626
4627             /* Here we have a space followed by a non-space.  Mostly this is a
4628              * case of LB18: "Break after spaces".  But there are complications
4629              * as the handling of spaces is somewhat tricky.  They are in a
4630              * number of rules, which have to be applied in priority order, but
4631              * something earlier in the string can cause a rule to be skipped
4632              * and a lower priority rule invoked.  A prime example is LB7 which
4633              * says don't break before a space.  But rule LB8 (lower priority)
4634              * says that the first break opportunity after a ZW is after any
4635              * span of spaces immediately after it.  If a ZW comes before a SP
4636              * in the input, rule LB8 applies, and not LB7.  Other such rules
4637              * involve combining marks which are rules 9 and 10, but they may
4638              * override higher priority rules if they come earlier in the
4639              * string.  Since we're doing random access into the middle of the
4640              * string, we have to look for rules that should get applied based
4641              * on both string position and priority.  Combining marks do not
4642              * attach to either ZW nor SP, so we don't have to consider them
4643              * until later.
4644              *
4645              * To check for LB8, we have to find the first non-space character
4646              * before this span of spaces */
4647             do {
4648                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4649             }
4650             while (prev == LB_Space);
4651
4652             /* LB8 Break before any character following a zero-width space,
4653              * even if one or more spaces intervene.
4654              *      ZW SP* ÷
4655              * So if we have a ZW just before this span, and to get here this
4656              * is the final space in the span. */
4657             if (prev == LB_ZWSpace) {
4658                 return TRUE;
4659             }
4660
4661             /* Here, not ZW SP+.  There are several rules that have higher
4662              * priority than LB18 and can be resolved now, as they don't depend
4663              * on anything earlier in the string (except ZW, which we have
4664              * already handled).  One of these rules is LB11 Do not break
4665              * before Word joiner, but we have specially encoded that in the
4666              * lookup table so it is caught by the single test below which
4667              * catches the other ones. */
4668             if (LB_table[LB_Space][after] - LB_SP_foo
4669                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4670             {
4671                 return FALSE;
4672             }
4673
4674             /* If we get here, we have to XXX consider combining marks. */
4675             if (prev == LB_Combining_Mark) {
4676
4677                 /* What happens with these depends on the character they
4678                  * follow.  */
4679                 do {
4680                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4681                 }
4682                 while (prev == LB_Combining_Mark);
4683
4684                 /* Most times these attach to and inherit the characteristics
4685                  * of that character, but not always, and when not, they are to
4686                  * be treated as AL by rule LB10. */
4687                 if (! LB_CM_ATTACHES_TO(prev)) {
4688                     prev = LB_Alphabetic;
4689                 }
4690             }
4691
4692             /* Here, we have the character preceding the span of spaces all set
4693              * up.  We follow LB18: "Break after spaces" unless the table shows
4694              * that is overriden */
4695             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4696
4697         case LB_CM_ZWJ_foo:
4698
4699             /* We don't know how to treat the CM except by looking at the first
4700              * non-CM character preceding it.  ZWJ is treated as CM */
4701             do {
4702                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4703             }
4704             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4705
4706             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4707              * attatches to it, then it inherits the behavior of 'prev'.  If it
4708              * doesn't attach, it is to be treated as an AL */
4709             if (! LB_CM_ATTACHES_TO(prev)) {
4710                 prev = LB_Alphabetic;
4711             }
4712
4713             goto redo;
4714
4715         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4716         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4717
4718             /* LB21a Don't break after Hebrew + Hyphen.
4719              * HL (HY | BA) × */
4720
4721             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4722                                                           == LB_Hebrew_Letter)
4723             {
4724                 return FALSE;
4725             }
4726
4727             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4728
4729         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4730         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4731
4732             /* LB25a (PR | PO) × ( OP | HY )? NU */
4733             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4734                 return FALSE;
4735             }
4736
4737             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4738                                                                 == LB_BREAKABLE;
4739
4740         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4741         case LB_SY_or_IS_then_various + LB_NOBREAK:
4742         {
4743             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4744
4745             LB_enum temp = prev;
4746             do {
4747                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4748             }
4749             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4750             if (temp == LB_Numeric) {
4751                 return FALSE;
4752             }
4753
4754             return LB_table[prev][after] - LB_SY_or_IS_then_various
4755                                                                == LB_BREAKABLE;
4756         }
4757
4758         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4759         case LB_various_then_PO_or_PR + LB_NOBREAK:
4760         {
4761             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4762
4763             LB_enum temp = prev;
4764             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4765             {
4766                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4767             }
4768             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4769                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4770             }
4771             if (temp == LB_Numeric) {
4772                 return FALSE;
4773             }
4774             return LB_various_then_PO_or_PR;
4775         }
4776
4777         case LB_RI_then_RI + LB_NOBREAK:
4778         case LB_RI_then_RI + LB_BREAKABLE:
4779             {
4780                 int RI_count = 1;
4781
4782                 /* LB30a Break between two regional indicator symbols if and
4783                  * only if there are an even number of regional indicators
4784                  * preceding the position of the break.
4785                  *
4786                  *    sot (RI RI)* RI × RI
4787                  *  [^RI] (RI RI)* RI × RI */
4788
4789                 while (backup_one_LB(strbeg,
4790                                      &temp_pos,
4791                                      utf8_target) == LB_Regional_Indicator)
4792                 {
4793                     RI_count++;
4794                 }
4795
4796                 return RI_count % 2 == 0;
4797             }
4798
4799         default:
4800             break;
4801     }
4802
4803 #ifdef DEBUGGING
4804     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4805                                   before, after, LB_table[before][after]);
4806     assert(0);
4807 #endif
4808     return TRUE;
4809 }
4810
4811 STATIC LB_enum
4812 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4813 {
4814     dVAR;
4815
4816     LB_enum lb;
4817
4818     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4819
4820     if (*curpos >= strend) {
4821         return LB_EDGE;
4822     }
4823
4824     if (utf8_target) {
4825         *curpos += UTF8SKIP(*curpos);
4826         if (*curpos >= strend) {
4827             return LB_EDGE;
4828         }
4829         lb = getLB_VAL_UTF8(*curpos, strend);
4830     }
4831     else {
4832         (*curpos)++;
4833         if (*curpos >= strend) {
4834             return LB_EDGE;
4835         }
4836         lb = getLB_VAL_CP(**curpos);
4837     }
4838
4839     return lb;
4840 }
4841
4842 STATIC LB_enum
4843 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4844 {
4845     dVAR;
4846     LB_enum lb;
4847
4848     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4849
4850     if (*curpos < strbeg) {
4851         return LB_EDGE;
4852     }
4853
4854     if (utf8_target) {
4855         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4856         U8 * prev_prev_char_pos;
4857
4858         if (! prev_char_pos) {
4859             return LB_EDGE;
4860         }
4861
4862         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4863             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4864             *curpos = prev_char_pos;
4865             prev_char_pos = prev_prev_char_pos;
4866         }
4867         else {
4868             *curpos = (U8 *) strbeg;
4869             return LB_EDGE;
4870         }
4871     }
4872     else {
4873         if (*curpos - 2 < strbeg) {
4874             *curpos = (U8 *) strbeg;
4875             return LB_EDGE;
4876         }
4877         (*curpos)--;
4878         lb = getLB_VAL_CP(*(*curpos - 1));
4879     }
4880
4881     return lb;
4882 }
4883
4884 STATIC bool
4885 S_isSB(pTHX_ SB_enum before,
4886              SB_enum after,
4887              const U8 * const strbeg,
4888              const U8 * const curpos,
4889              const U8 * const strend,
4890              const bool utf8_target)
4891 {
4892     /* returns a boolean indicating if there is a Sentence Boundary Break
4893      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4894
4895     U8 * lpos = (U8 *) curpos;
4896     bool has_para_sep = FALSE;
4897     bool has_sp = FALSE;
4898
4899     PERL_ARGS_ASSERT_ISSB;
4900
4901     /* Break at the start and end of text.
4902         SB1.  sot  ÷
4903         SB2.  ÷  eot
4904       But unstated in Unicode is don't break if the text is empty */
4905     if (before == SB_EDGE || after == SB_EDGE) {
4906         return before != after;
4907     }
4908
4909     /* SB 3: Do not break within CRLF. */
4910     if (before == SB_CR && after == SB_LF) {
4911         return FALSE;
4912     }
4913
4914     /* Break after paragraph separators.  CR and LF are considered
4915      * so because Unicode views text as like word processing text where there
4916      * are no newlines except between paragraphs, and the word processor takes
4917      * care of wrapping without there being hard line-breaks in the text *./
4918        SB4.  Sep | CR | LF  ÷ */
4919     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4920         return TRUE;
4921     }
4922
4923     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4924      * (See Section 6.2, Replacing Ignore Rules.)
4925         SB5.  X (Extend | Format)*  →  X */
4926     if (after == SB_Extend || after == SB_Format) {
4927
4928         /* Implied is that the these characters attach to everything
4929          * immediately prior to them except for those separator-type
4930          * characters.  And the rules earlier have already handled the case
4931          * when one of those immediately precedes the extend char */
4932         return FALSE;
4933     }
4934
4935     if (before == SB_Extend || before == SB_Format) {
4936         U8 * temp_pos = lpos;
4937         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4938         if (   backup != SB_EDGE
4939             && backup != SB_Sep
4940             && backup != SB_CR
4941             && backup != SB_LF)
4942         {
4943             before = backup;
4944             lpos = temp_pos;
4945         }
4946
4947         /* Here, both 'before' and 'backup' are these types; implied is that we
4948          * don't break between them */
4949         if (backup == SB_Extend || backup == SB_Format) {
4950             return FALSE;
4951         }
4952     }
4953
4954     /* Do not break after ambiguous terminators like period, if they are
4955      * immediately followed by a number or lowercase letter, if they are
4956      * between uppercase letters, if the first following letter (optionally
4957      * after certain punctuation) is lowercase, or if they are followed by
4958      * "continuation" punctuation such as comma, colon, or semicolon. For
4959      * example, a period may be an abbreviation or numeric period, and thus may
4960      * not mark the end of a sentence.
4961
4962      * SB6. ATerm  ×  Numeric */
4963     if (before == SB_ATerm && after == SB_Numeric) {
4964         return FALSE;
4965     }
4966
4967     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4968     if (before == SB_ATerm && after == SB_Upper) {
4969         U8 * temp_pos = lpos;
4970         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4971         if (backup == SB_Upper || backup == SB_Lower) {
4972             return FALSE;
4973         }
4974     }
4975
4976     /* The remaining rules that aren't the final one, all require an STerm or
4977      * an ATerm after having backed up over some Close* Sp*, and in one case an
4978      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4979      * So do that backup now, setting flags if either Sp or a paragraph
4980      * separator are found */
4981
4982     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4983         has_para_sep = TRUE;
4984         before = backup_one_SB(strbeg, &lpos, utf8_target);
4985     }
4986
4987     if (before == SB_Sp) {
4988         has_sp = TRUE;
4989         do {
4990             before = backup_one_SB(strbeg, &lpos, utf8_target);
4991         }
4992         while (before == SB_Sp);
4993     }
4994
4995     while (before == SB_Close) {
4996         before = backup_one_SB(strbeg, &lpos, utf8_target);
4997     }
4998
4999     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5000      * most cases an STerm */
5001     if (before == SB_STerm || before == SB_ATerm) {
5002
5003         /* So, here the lhs matches
5004          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5005          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5006          * The rules that apply here are:
5007          *
5008          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5009                                            | LF | STerm | ATerm) )* Lower
5010            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5011            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5012            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5013            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5014          */
5015
5016         /* And all but SB11 forbid having seen a paragraph separator */
5017         if (! has_para_sep) {
5018             if (before == SB_ATerm) {          /* SB8 */
5019                 U8 * rpos = (U8 *) curpos;
5020                 SB_enum later = after;
5021
5022                 while (    later != SB_OLetter
5023                         && later != SB_Upper
5024                         && later != SB_Lower
5025                         && later != SB_Sep
5026                         && later != SB_CR
5027                         && later != SB_LF
5028                         && later != SB_STerm
5029                         && later != SB_ATerm
5030                         && later != SB_EDGE)
5031                 {
5032                     later = advance_one_SB(&rpos, strend, utf8_target);
5033                 }
5034                 if (later == SB_Lower) {
5035                     return FALSE;
5036                 }
5037             }
5038
5039             if (   after == SB_SContinue    /* SB8a */
5040                 || after == SB_STerm
5041                 || after == SB_ATerm)
5042             {
5043                 return FALSE;
5044             }
5045
5046             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5047                 if (   after == SB_Close
5048                     || after == SB_Sp
5049                     || after == SB_Sep
5050                     || after == SB_CR
5051                     || after == SB_LF)
5052                 {
5053                     return FALSE;
5054                 }
5055             }
5056
5057             /* SB10.  This and SB9 could probably be combined some way, but khw
5058              * has decided to follow the Unicode rule book precisely for
5059              * simplified maintenance */
5060             if (   after == SB_Sp
5061                 || after == SB_Sep
5062                 || after == SB_CR
5063                 || after == SB_LF)
5064             {
5065                 return FALSE;
5066             }
5067         }
5068
5069         /* SB11.  */
5070         return TRUE;
5071     }
5072
5073     /* Otherwise, do not break.
5074     SB12.  Any  ×  Any */
5075
5076     return FALSE;
5077 }
5078
5079 STATIC SB_enum
5080 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5081 {
5082     dVAR;
5083     SB_enum sb;
5084
5085     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5086
5087     if (*curpos >= strend) {
5088         return SB_EDGE;
5089     }
5090
5091     if (utf8_target) {
5092         do {
5093             *curpos += UTF8SKIP(*curpos);
5094             if (*curpos >= strend) {
5095                 return SB_EDGE;
5096             }
5097             sb = getSB_VAL_UTF8(*curpos, strend);
5098         } while (sb == SB_Extend || sb == SB_Format);
5099     }
5100     else {
5101         do {
5102             (*curpos)++;
5103             if (*curpos >= strend) {
5104                 return SB_EDGE;
5105             }
5106             sb = getSB_VAL_CP(**curpos);
5107         } while (sb == SB_Extend || sb == SB_Format);
5108     }
5109
5110     return sb;
5111 }
5112
5113 STATIC SB_enum
5114 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5115 {
5116     dVAR;
5117     SB_enum sb;
5118
5119     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5120
5121     if (*curpos < strbeg) {
5122         return SB_EDGE;
5123     }
5124
5125     if (utf8_target) {
5126         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5127         if (! prev_char_pos) {
5128             return SB_EDGE;
5129         }
5130
5131         /* Back up over Extend and Format.  curpos is always just to the right
5132          * of the characater whose value we are getting */
5133         do {
5134             U8 * prev_prev_char_pos;
5135             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5136                                                                       strbeg)))
5137             {
5138                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5139                 *curpos = prev_char_pos;
5140                 prev_char_pos = prev_prev_char_pos;
5141             }
5142             else {
5143                 *curpos = (U8 *) strbeg;
5144                 return SB_EDGE;
5145             }
5146         } while (sb == SB_Extend || sb == SB_Format);
5147     }
5148     else {
5149         do {
5150             if (*curpos - 2 < strbeg) {
5151                 *curpos = (U8 *) strbeg;
5152                 return SB_EDGE;
5153             }
5154             (*curpos)--;
5155             sb = getSB_VAL_CP(*(*curpos - 1));
5156         } while (sb == SB_Extend || sb == SB_Format);
5157     }
5158
5159     return sb;
5160 }
5161
5162 STATIC bool
5163 S_isWB(pTHX_ WB_enum previous,
5164              WB_enum before,
5165              WB_enum after,
5166              const U8 * const strbeg,
5167              const U8 * const curpos,
5168              const U8 * const strend,
5169              const bool utf8_target)
5170 {
5171     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5172      *  a Unicode word break, using their published algorithm, but tailored for
5173      *  Perl by treating spans of white space as one unit.  Context may be
5174      *  needed to make this determination.  If the value for the character
5175      *  before 'before' is known, it is passed as 'previous'; otherwise that
5176      *  should be set to WB_UNKNOWN.  The other input parameters give the
5177      *  boundaries and current position in the matching of the string.  That
5178      *  is, 'curpos' marks the position where the character whose wb value is
5179      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5180
5181     U8 * before_pos = (U8 *) curpos;
5182     U8 * after_pos = (U8 *) curpos;
5183     WB_enum prev = before;
5184     WB_enum next;
5185
5186     PERL_ARGS_ASSERT_ISWB;
5187
5188     /* Rule numbers in the comments below are as of Unicode 9.0 */
5189
5190   redo:
5191     before = prev;
5192     switch (WB_table[before][after]) {
5193         case WB_BREAKABLE:
5194             return TRUE;
5195
5196         case WB_NOBREAK:
5197             return FALSE;
5198
5199         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5200             next = advance_one_WB(&after_pos, strend, utf8_target,
5201                                  FALSE /* Don't skip Extend nor Format */ );
5202             /* A space immediately preceeding an Extend or Format is attached
5203              * to by them, and hence gets separated from previous spaces.
5204              * Otherwise don't break between horizontal white space */
5205             return next == WB_Extend || next == WB_Format;
5206
5207         /* WB4 Ignore Format and Extend characters, except when they appear at
5208          * the beginning of a region of text.  This code currently isn't
5209          * general purpose, but it works as the rules are currently and likely
5210          * to be laid out.  The reason it works is that when 'they appear at
5211          * the beginning of a region of text', the rule is to break before
5212          * them, just like any other character.  Therefore, the default rule
5213          * applies and we don't have to look in more depth.  Should this ever
5214          * change, we would have to have 2 'case' statements, like in the rules
5215          * below, and backup a single character (not spacing over the extend
5216          * ones) and then see if that is one of the region-end characters and
5217          * go from there */
5218         case WB_Ex_or_FO_or_ZWJ_then_foo:
5219             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5220             goto redo;
5221
5222         case WB_DQ_then_HL + WB_BREAKABLE:
5223         case WB_DQ_then_HL + WB_NOBREAK:
5224
5225             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5226
5227             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5228                                                             == WB_Hebrew_Letter)
5229             {
5230                 return FALSE;
5231             }
5232
5233              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5234
5235         case WB_HL_then_DQ + WB_BREAKABLE:
5236         case WB_HL_then_DQ + WB_NOBREAK:
5237
5238             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5239
5240             if (advance_one_WB(&after_pos, strend, utf8_target,
5241                                        TRUE /* Do skip Extend and Format */ )
5242                                                             == WB_Hebrew_Letter)
5243             {
5244                 return FALSE;
5245             }
5246
5247             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5248
5249         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5250         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5251
5252             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5253              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5254
5255             next = advance_one_WB(&after_pos, strend, utf8_target,
5256                                        TRUE /* Do skip Extend and Format */ );
5257
5258             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5259             {
5260                 return FALSE;
5261             }
5262
5263             return WB_table[before][after]
5264                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5265
5266         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5267         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5268
5269             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5270              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5271
5272             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5273             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5274             {
5275                 return FALSE;
5276             }
5277
5278             return WB_table[before][after]
5279                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5280
5281         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5282         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5283
5284             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5285              * */
5286
5287             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5288                                                             == WB_Numeric)
5289             {
5290                 return FALSE;
5291             }
5292
5293             return WB_table[before][after]
5294                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5295
5296         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5297         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5298
5299             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5300
5301             if (advance_one_WB(&after_pos, strend, utf8_target,
5302                                        TRUE /* Do skip Extend and Format */ )
5303                                                             == WB_Numeric)
5304             {
5305                 return FALSE;
5306             }
5307
5308             return WB_table[before][after]
5309                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5310
5311         case WB_RI_then_RI + WB_NOBREAK:
5312         case WB_RI_then_RI + WB_BREAKABLE:
5313             {
5314                 int RI_count = 1;
5315
5316                 /* Do not break within emoji flag sequences. That is, do not
5317                  * break between regional indicator (RI) symbols if there is an
5318                  * odd number of RI characters before the potential break
5319                  * point.
5320                  *
5321                  * WB15   sot (RI RI)* RI × RI
5322                  * WB16 [^RI] (RI RI)* RI × RI */
5323
5324                 while (backup_one_WB(&previous,
5325                                      strbeg,
5326                                      &before_pos,
5327                                      utf8_target) == WB_Regional_Indicator)
5328                 {
5329                     RI_count++;
5330                 }
5331
5332                 return RI_count % 2 != 1;
5333             }
5334
5335         default:
5336             break;
5337     }
5338
5339 #ifdef DEBUGGING
5340     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5341                                   before, after, WB_table[before][after]);
5342     assert(0);
5343 #endif
5344     return TRUE;
5345 }
5346
5347 STATIC WB_enum
5348 S_advance_one_WB(pTHX_ U8 ** curpos,
5349                        const U8 * const strend,
5350                        const bool utf8_target,
5351                        const bool skip_Extend_Format)
5352 {
5353     dVAR;
5354     WB_enum wb;
5355
5356     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5357
5358     if (*curpos >= strend) {
5359         return WB_EDGE;
5360     }
5361
5362     if (utf8_target) {
5363
5364         /* Advance over Extend and Format */
5365         do {
5366             *curpos += UTF8SKIP(*curpos);
5367             if (*curpos >= strend) {
5368                 return WB_EDGE;
5369             }
5370             wb = getWB_VAL_UTF8(*curpos, strend);
5371         } while (    skip_Extend_Format
5372                  && (wb == WB_Extend || wb == WB_Format));
5373     }
5374     else {
5375         do {
5376             (*curpos)++;
5377             if (*curpos >= strend) {
5378                 return WB_EDGE;
5379             }
5380             wb = getWB_VAL_CP(**curpos);
5381         } while (    skip_Extend_Format
5382                  && (wb == WB_Extend || wb == WB_Format));
5383     }
5384
5385     return wb;
5386 }
5387
5388 STATIC WB_enum
5389 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5390 {
5391     dVAR;
5392     WB_enum wb;
5393
5394     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5395
5396     /* If we know what the previous character's break value is, don't have
5397         * to look it up */
5398     if (*previous != WB_UNKNOWN) {
5399         wb = *previous;
5400
5401         /* But we need to move backwards by one */
5402         if (utf8_target) {
5403             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5404             if (! *curpos) {
5405                 *previous = WB_EDGE;
5406                 *curpos = (U8 *) strbeg;
5407             }
5408             else {
5409                 *previous = WB_UNKNOWN;
5410             }
5411         }
5412         else {
5413             (*curpos)--;
5414             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5415         }
5416
5417         /* And we always back up over these three types */
5418         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5419             return wb;
5420         }
5421     }
5422
5423     if (*curpos < strbeg) {
5424         return WB_EDGE;
5425     }
5426
5427     if (utf8_target) {
5428         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5429         if (! prev_char_pos) {
5430             return WB_EDGE;
5431         }
5432
5433         /* Back up over Extend and Format.  curpos is always just to the right
5434          * of the characater whose value we are getting */
5435         do {
5436             U8 * prev_prev_char_pos;
5437             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5438                                                    -1,
5439                                                    strbeg)))
5440             {
5441                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5442                 *curpos = prev_char_pos;
5443                 prev_char_pos = prev_prev_char_pos;
5444             }
5445             else {
5446                 *curpos = (U8 *) strbeg;
5447                 return WB_EDGE;
5448             }
5449         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5450     }
5451     else {
5452         do {
5453             if (*curpos - 2 < strbeg) {
5454                 *curpos = (U8 *) strbeg;
5455                 return WB_EDGE;
5456             }
5457             (*curpos)--;
5458             wb = getWB_VAL_CP(*(*curpos - 1));
5459         } while (wb == WB_Extend || wb == WB_Format);
5460     }
5461
5462     return wb;
5463 }
5464
5465 /* Macros for regmatch(), using its internal variables */
5466 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
5467 #define NEXTCHR_IS_EOS (nextchr < 0)
5468
5469 #define SET_nextchr \
5470     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
5471
5472 #define SET_locinput(p) \
5473     locinput = (p);  \
5474     SET_nextchr
5475
5476 #define sayYES goto yes
5477 #define sayNO goto no
5478 #define sayNO_SILENT goto no_silent
5479
5480 /* we dont use STMT_START/END here because it leads to
5481    "unreachable code" warnings, which are bogus, but distracting. */
5482 #define CACHEsayNO \
5483     if (ST.cache_mask) \
5484        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
5485     sayNO
5486
5487 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5488 (                                                           \
5489     (   ( st )                                         ) && \
5490     (   ( st )->u.eval.close_paren                     ) && \
5491     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5492 )
5493
5494 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5495 (                                                           \
5496     (   ( st )                                         ) && \
5497     (   ( st )->u.eval.close_paren                     ) && \
5498     (   ( expr )                                       ) && \
5499     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5500 )
5501
5502
5503 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5504     (st)->u.eval.close_paren = ( (expr) + 1 )
5505
5506 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5507     (st)->u.eval.close_paren = 0
5508
5509 /* push a new state then goto it */
5510
5511 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
5512     pushinput = input; \
5513     pusheol = eol; \
5514     pushsr0 = sr0; \
5515     scan = node; \
5516     st->resume_state = state; \
5517     goto push_state;
5518
5519 /* push a new state with success backtracking, then goto it */
5520
5521 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
5522     pushinput = input; \
5523     pusheol = eol;     \
5524     pushsr0 = sr0; \
5525     scan = node; \
5526     st->resume_state = state; \
5527     goto push_yes_state;
5528
5529 #define DEBUG_STATE_pp(pp)                                  \
5530     DEBUG_STATE_r({                                         \
5531         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
5532         Perl_re_printf( aTHX_                               \
5533             "%*s" pp " %s%s%s%s%s\n",                       \
5534             INDENT_CHARS(depth), "",                        \
5535             PL_reg_name[st->resume_state],                  \
5536             ((st==yes_state||st==mark_state) ? "[" : ""),   \
5537             ((st==yes_state) ? "Y" : ""),                   \
5538             ((st==mark_state) ? "M" : ""),                  \
5539             ((st==yes_state||st==mark_state) ? "]" : "")    \
5540         );                                                  \
5541     });
5542
5543 /*
5544
5545 regmatch() - main matching routine
5546
5547 This is basically one big switch statement in a loop. We execute an op,
5548 set 'next' to point the next op, and continue. If we come to a point which
5549 we may need to backtrack to on failure such as (A|B|C), we push a
5550 backtrack state onto the backtrack stack. On failure, we pop the top
5551 state, and re-enter the loop at the state indicated. If there are no more
5552 states to pop, we return failure.
5553
5554 Sometimes we also need to backtrack on success; for example /A+/, where
5555 after successfully matching one A, we need to go back and try to
5556 match another one; similarly for lookahead assertions: if the assertion
5557 completes successfully, we backtrack to the state just before the assertion
5558 and then carry on.  In these cases, the pushed state is marked as
5559 'backtrack on success too'. This marking is in fact done by a chain of
5560 pointers, each pointing to the previous 'yes' state. On success, we pop to
5561 the nearest yes state, discarding any intermediate failure-only states.
5562 Sometimes a yes state is pushed just to force some cleanup code to be
5563 called at the end of a successful match or submatch; e.g. (??{$re}) uses
5564 it to free the inner regex.
5565
5566 Note that failure backtracking rewinds the cursor position, while
5567 success backtracking leaves it alone.
5568
5569 A pattern is complete when the END op is executed, while a subpattern
5570 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
5571 ops trigger the "pop to last yes state if any, otherwise return true"
5572 behaviour.
5573
5574 A common convention in this function is to use A and B to refer to the two
5575 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
5576 the subpattern to be matched possibly multiple times, while B is the entire
5577 rest of the pattern. Variable and state names reflect this convention.
5578
5579 The states in the main switch are the union of ops and failure/success of
5580 substates associated with with that op.  For example, IFMATCH is the op
5581 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
5582 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
5583 successfully matched A and IFMATCH_A_fail is a state saying that we have
5584 just failed to match A. Resume states always come in pairs. The backtrack
5585 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
5586 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
5587 on success or failure.
5588
5589 The struct that holds a backtracking state is actually a big union, with
5590 one variant for each major type of op. The variable st points to the
5591 top-most backtrack struct. To make the code clearer, within each
5592 block of code we #define ST to alias the relevant union.
5593
5594 Here's a concrete example of a (vastly oversimplified) IFMATCH
5595 implementation:
5596
5597     switch (state) {
5598     ....
5599
5600 #define ST st->u.ifmatch
5601
5602     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5603         ST.foo = ...; // some state we wish to save
5604         ...
5605         // push a yes backtrack state with a resume value of
5606         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
5607         // first node of A:
5608         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
5609         // NOTREACHED
5610
5611     case IFMATCH_A: // we have successfully executed A; now continue with B
5612         next = B;
5613         bar = ST.foo; // do something with the preserved value
5614         break;
5615
5616     case IFMATCH_A_fail: // A failed, so the assertion failed
5617         ...;   // do some housekeeping, then ...
5618         sayNO; // propagate the failure
5619
5620 #undef ST
5621
5622     ...
5623     }
5624
5625 For any old-timers reading this who are familiar with the old recursive
5626 approach, the code above is equivalent to:
5627
5628     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5629     {
5630         int foo = ...
5631         ...
5632         if (regmatch(A)) {
5633             next = B;
5634             bar = foo;
5635             break;
5636         }
5637         ...;   // do some housekeeping, then ...
5638         sayNO; // propagate the failure
5639     }
5640
5641 The topmost backtrack state, pointed to by st, is usually free. If you
5642 want to claim it, populate any ST.foo fields in it with values you wish to
5643 save, then do one of
5644
5645         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
5646         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
5647
5648 which sets that backtrack state's resume value to 'resume_state', pushes a
5649 new free entry to the top of the backtrack stack, then goes to 'node'.
5650 On backtracking, the free slot is popped, and the saved state becomes the
5651 new free state. An ST.foo field in this new top state can be temporarily
5652 accessed to retrieve values, but once the main loop is re-entered, it
5653 becomes available for reuse.
5654
5655 Note that the depth of the backtrack stack constantly increases during the
5656 left-to-right execution of the pattern, rather than going up and down with
5657 the pattern nesting. For example the stack is at its maximum at Z at the
5658 end of the pattern, rather than at X in the following:
5659
5660     /(((X)+)+)+....(Y)+....Z/
5661
5662 The only exceptions to this are lookahead/behind assertions and the cut,
5663 (?>A), which pop all the backtrack states associated with A before
5664 continuing.
5665
5666 Backtrack state structs are allocated in slabs of about 4K in size.
5667 PL_regmatch_state and st always point to the currently active state,
5668 and PL_regmatch_slab points to the slab currently containing
5669 PL_regmatch_state.  The first time regmatch() is called, the first slab is
5670 allocated, and is never freed until interpreter destruction. When the slab
5671 is full, a new one is allocated and chained to the end. At exit from
5672 regmatch(), slabs allocated since entry are freed.
5673
5674 In order to work with variable length lookbehinds, an upper limit is placed on
5675 lookbehinds which is set to where the match position is at the end of where the
5676 lookbehind would get to.  Nothing in the lookbehind should match above that,
5677 except we should be able to look beyond if for things like \b, which need the
5678 next character in the string to be able to determine if this is a boundary or
5679 not.  We also can't match the end of string/line unless we are also at the end
5680 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
5681 that match a width, we have to add a condition that they are within the legal
5682 bounds of our window into the string.
5683
5684 */
5685
5686 /* returns -1 on failure, $+[0] on success */
5687 STATIC SSize_t
5688 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5689 {
5690     dVAR;
5691     const bool utf8_target = reginfo->is_utf8_target;
5692     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5693     REGEXP *rex_sv = reginfo->prog;
5694     regexp *rex = ReANY(rex_sv);
5695     RXi_GET_DECL(rex,rexi);
5696     /* the current state. This is a cached copy of PL_regmatch_state */
5697     regmatch_state *st;
5698     /* cache heavy used fields of st in registers */
5699     regnode *scan;
5700     regnode *next;
5701     U32 n = 0;  /* general value; init to avoid compiler warning */
5702     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5703     SSize_t endref = 0; /* offset of end of backref when ln is start */
5704     char *locinput = startpos;
5705     char *loceol = reginfo->strend;
5706     char *pushinput; /* where to continue after a PUSH */
5707     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
5708     U8   *pushsr0;   /* save starting pos of script run */
5709     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5710
5711     bool result = 0;        /* return value of S_regmatch */
5712     U32 depth = 0;            /* depth of backtrack stack */
5713     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5714     const U32 max_nochange_depth =
5715         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5716         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5717     regmatch_state *yes_state = NULL; /* state to pop to on success of
5718                                                             subpattern */
5719     /* mark_state piggy backs on the yes_state logic so that when we unwind
5720        the stack on success we can update the mark_state as we go */
5721     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5722     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5723     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5724     U32 state_num;
5725     bool no_final = 0;      /* prevent failure from backtracking? */
5726     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5727     char *startpoint = locinput;
5728     SV *popmark = NULL;     /* are we looking for a mark? */
5729     SV *sv_commit = NULL;   /* last mark name seen in failure */
5730     SV *sv_yes_mark = NULL; /* last mark name we have seen
5731                                during a successful match */
5732     U32 lastopen = 0;       /* last open we saw */
5733     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5734     SV* const oreplsv = GvSVn(PL_replgv);
5735     /* these three flags are set by various ops to signal information to
5736      * the very next op. They have a useful lifetime of exactly one loop
5737      * iteration, and are not preserved or restored by state pushes/pops
5738      */
5739     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5740     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5741     int logical = 0;        /* the following EVAL is:
5742                                 0: (?{...})
5743                                 1: (?(?{...})X|Y)
5744                                 2: (??{...})
5745                                or the following IFMATCH/UNLESSM is:
5746                                 false: plain (?=foo)
5747                                 true:  used as a condition: (?(?=foo))
5748                             */
5749     PAD* last_pad = NULL;
5750     dMULTICALL;
5751     U8 gimme = G_SCALAR;
5752     CV *caller_cv = NULL;       /* who called us */
5753     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5754     U32 maxopenparen = 0;       /* max '(' index seen so far */
5755     int to_complement;  /* Invert the result? */
5756     _char_class_number classnum;
5757     bool is_utf8_pat = reginfo->is_utf8_pat;
5758     bool match = FALSE;
5759     I32 orig_savestack_ix = PL_savestack_ix;
5760     U8 * script_run_begin = NULL;
5761
5762 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5763 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5764 #  define SOLARIS_BAD_OPTIMIZER
5765     const U32 *pl_charclass_dup = PL_charclass;
5766 #  define PL_charclass pl_charclass_dup
5767 #endif
5768
5769 #ifdef DEBUGGING
5770     GET_RE_DEBUG_FLAGS_DECL;
5771 #endif
5772
5773     /* protect against undef(*^R) */
5774     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5775
5776     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5777     multicall_oldcatch = 0;
5778     PERL_UNUSED_VAR(multicall_cop);
5779
5780     PERL_ARGS_ASSERT_REGMATCH;
5781
5782     st = PL_regmatch_state;
5783
5784     /* Note that nextchr is a byte even in UTF */
5785     SET_nextchr;
5786     scan = prog;
5787
5788     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5789             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5790             Perl_re_printf( aTHX_ "regmatch start\n" );
5791     }));
5792
5793     while (scan != NULL) {
5794         next = scan + NEXT_OFF(scan);
5795         if (next == scan)
5796             next = NULL;
5797         state_num = OP(scan);
5798
5799       reenter_switch:
5800         DEBUG_EXECUTE_r(
5801             if (state_num <= REGNODE_MAX) {
5802                 SV * const prop = sv_newmortal();
5803                 regnode *rnext = regnext(scan);
5804
5805                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5806                 regprop(rex, prop, scan, reginfo, NULL);
5807                 Perl_re_printf( aTHX_
5808                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5809                     INDENT_CHARS(depth), "",
5810                     (IV)(scan - rexi->program),
5811                     SvPVX_const(prop),
5812                     (PL_regkind[OP(scan)] == END || !rnext) ?
5813                         0 : (IV)(rnext - rexi->program));
5814             }
5815         );
5816
5817         to_complement = 0;
5818
5819         SET_nextchr;
5820         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5821
5822         switch (state_num) {
5823         case SBOL: /*  /^../ and /\A../  */
5824             if (locinput == reginfo->strbeg)
5825                 break;
5826             sayNO;
5827
5828         case MBOL: /*  /^../m  */
5829             if (locinput == reginfo->strbeg ||
5830                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5831             {
5832                 break;
5833             }
5834             sayNO;
5835
5836         case GPOS: /*  \G  */
5837             if (locinput == reginfo->ganch)
5838                 break;
5839             sayNO;
5840
5841         case KEEPS: /*   \K  */
5842             /* update the startpoint */
5843             st->u.keeper.val = rex->offs[0].start;
5844             rex->offs[0].start = locinput - reginfo->strbeg;
5845             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
5846                             script_run_begin);
5847             NOT_REACHED; /* NOTREACHED */
5848
5849         case KEEPS_next_fail:
5850             /* rollback the start point change */
5851             rex->offs[0].start = st->u.keeper.val;
5852             sayNO_SILENT;
5853             NOT_REACHED; /* NOTREACHED */
5854
5855         case MEOL: /* /..$/m  */
5856             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5857                 sayNO;
5858             break;
5859
5860         case SEOL: /* /..$/  */
5861             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5862                 sayNO;
5863             if (reginfo->strend - locinput > 1)
5864                 sayNO;
5865             break;
5866
5867         case EOS: /*  \z  */
5868             if (!NEXTCHR_IS_EOS)
5869                 sayNO;
5870             break;
5871
5872         case SANY: /*  /./s  */
5873             if (NEXTCHR_IS_EOS || locinput >= loceol)
5874                 sayNO;
5875             goto increment_locinput;
5876
5877         case REG_ANY: /*  /./  */
5878             if (   NEXTCHR_IS_EOS
5879                 || locinput >= loceol
5880                 || nextchr == '\n')
5881             {
5882                 sayNO;
5883             }
5884             goto increment_locinput;
5885
5886
5887 #undef  ST
5888 #define ST st->u.trie
5889         case TRIEC: /* (ab|cd) with known charclass */
5890             /* In this case the charclass data is available inline so
5891                we can fail fast without a lot of extra overhead.
5892              */
5893             if ( !   NEXTCHR_IS_EOS
5894                 &&   locinput < loceol
5895                 && ! ANYOF_BITMAP_TEST(scan, nextchr))
5896             {
5897                 DEBUG_EXECUTE_r(
5898                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5899                               depth, PL_colors[4], PL_colors[5])
5900                 );
5901                 sayNO_SILENT;
5902                 NOT_REACHED; /* NOTREACHED */
5903             }
5904             /* FALLTHROUGH */
5905         case TRIE:  /* (ab|cd)  */
5906             /* the basic plan of execution of the trie is:
5907              * At the beginning, run though all the states, and
5908              * find the longest-matching word. Also remember the position
5909              * of the shortest matching word. For example, this pattern:
5910              *    1  2 3 4    5
5911              *    ab|a|x|abcd|abc
5912              * when matched against the string "abcde", will generate
5913              * accept states for all words except 3, with the longest
5914              * matching word being 4, and the shortest being 2 (with
5915              * the position being after char 1 of the string).
5916              *
5917              * Then for each matching word, in word order (i.e. 1,2,4,5),
5918              * we run the remainder of the pattern; on each try setting
5919              * the current position to the character following the word,
5920              * returning to try the next word on failure.
5921              *
5922              * We avoid having to build a list of words at runtime by
5923              * using a compile-time structure, wordinfo[].prev, which
5924              * gives, for each word, the previous accepting word (if any).
5925              * In the case above it would contain the mappings 1->2, 2->0,
5926              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5927              * the longest word (4 above), a list of all words, by
5928              * following the list of prev pointers; this gives us the
5929              * unordered list 4,5,1,2. Then given the current word we have
5930              * just tried, we can go through the list and find the
5931              * next-biggest word to try (so if we just failed on word 2,
5932              * the next in the list is 4).
5933              *
5934              * Since at runtime we don't record the matching position in
5935              * the string for each word, we have to work that out for
5936              * each word we're about to process. The wordinfo table holds
5937              * the character length of each word; given that we recorded
5938              * at the start: the position of the shortest word and its
5939              * length in chars, we just need to move the pointer the
5940              * difference between the two char lengths. Depending on
5941              * Unicode status and folding, that's cheap or expensive.
5942              *
5943              * This algorithm is optimised for the case where are only a
5944              * small number of accept states, i.e. 0,1, or maybe 2.
5945              * With lots of accepts states, and having to try all of them,
5946              * it becomes quadratic on number of accept states to find all
5947              * the next words.
5948              */
5949
5950             {
5951                 /* what type of TRIE am I? (utf8 makes this contextual) */
5952                 DECL_TRIE_TYPE(scan);
5953
5954                 /* what trie are we using right now */
5955                 reg_trie_data * const trie
5956                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5957                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5958                 U32 state = trie->startstate;
5959
5960                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5961                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5962                     if (utf8_target
5963                         && ! NEXTCHR_IS_EOS
5964                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5965                         && scan->flags == EXACTL)
5966                     {
5967                         /* We only output for EXACTL, as we let the folder
5968                          * output this message for EXACTFLU8 to avoid
5969                          * duplication */
5970                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5971                                                                reginfo->strend);
5972                     }
5973                 }
5974                 if (   trie->bitmap
5975                     && (     NEXTCHR_IS_EOS
5976                         ||   locinput >= loceol
5977                         || ! TRIE_BITMAP_TEST(trie, nextchr)))
5978                 {
5979                     if (trie->states[ state ].wordnum) {
5980                          DEBUG_EXECUTE_r(
5981                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5982                                           depth, PL_colors[4], PL_colors[5])
5983                         );
5984                         if (!trie->jump)
5985                             break;
5986                     } else {
5987                         DEBUG_EXECUTE_r(
5988                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5989                                           depth, PL_colors[4], PL_colors[5])
5990                         );
5991                         sayNO_SILENT;
5992                    }
5993                 }
5994
5995             {
5996                 U8 *uc = ( U8* )locinput;
5997
5998                 STRLEN len = 0;
5999                 STRLEN foldlen = 0;
6000                 U8 *uscan = (U8*)NULL;
6001                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6002                 U32 charcount = 0; /* how many input chars we have matched */
6003                 U32 accepted = 0; /* have we seen any accepting states? */
6004
6005                 ST.jump = trie->jump;
6006                 ST.me = scan;
6007                 ST.firstpos = NULL;
6008                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6009                 ST.nextword = 0;
6010
6011                 /* fully traverse the TRIE; note the position of the
6012                    shortest accept state and the wordnum of the longest
6013                    accept state */
6014
6015                 while ( state && uc <= (U8*)(loceol) ) {
6016                     U32 base = trie->states[ state ].trans.base;
6017                     UV uvc = 0;
6018                     U16 charid = 0;
6019                     U16 wordnum;
6020                     wordnum = trie->states[ state ].wordnum;
6021
6022                     if (wordnum) { /* it's an accept state */
6023                         if (!accepted) {
6024                             accepted = 1;
6025                             /* record first match position */
6026                             if (ST.longfold) {
6027                                 ST.firstpos = (U8*)locinput;
6028                                 ST.firstchars = 0;
6029                             }
6030                             else {
6031                                 ST.firstpos = uc;
6032                                 ST.firstchars = charcount;
6033                             }
6034                         }
6035                         if (!ST.nextword || wordnum < ST.nextword)
6036                             ST.nextword = wordnum;
6037                         ST.topword = wordnum;
6038                     }
6039
6040                     DEBUG_TRIE_EXECUTE_r({
6041                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6042                                 /* HERE */
6043                                 PerlIO_printf( Perl_debug_log,
6044                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6045                                     INDENT_CHARS(depth), "", PL_colors[4],
6046                                     (UV)state, (accepted ? 'Y' : 'N'));
6047                     });
6048
6049                     /* read a char and goto next state */
6050                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6051                         I32 offset;
6052                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6053                                              (U8 *) loceol, uscan,
6054                                              len, uvc, charid, foldlen,
6055                                              foldbuf, uniflags);
6056                         charcount++;
6057                         if (foldlen>0)
6058                             ST.longfold = TRUE;
6059                         if (charid &&
6060                              ( ((offset =
6061                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6062
6063                              && ((U32)offset < trie->lasttrans)
6064                              && trie->trans[offset].check == state)
6065                         {
6066                             state = trie->trans[offset].next;
6067                         }
6068                         else {
6069                             state = 0;
6070                         }
6071                         uc += len;
6072
6073                     }
6074                     else {
6075                         state = 0;
6076                     }
6077                     DEBUG_TRIE_EXECUTE_r(
6078                         Perl_re_printf( aTHX_
6079                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6080                             charid, uvc, (UV)state, PL_colors[5] );
6081                     );
6082                 }
6083                 if (!accepted)
6084                    sayNO;
6085
6086                 /* calculate total number of accept states */
6087                 {
6088                     U16 w = ST.topword;
6089                     accepted = 0;
6090                     while (w) {
6091                         w = trie->wordinfo[w].prev;
6092                         accepted++;
6093                     }
6094                     ST.accepted = accepted;
6095                 }
6096
6097                 DEBUG_EXECUTE_r(
6098                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6099                         depth,
6100                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6101                 );
6102                 goto trie_first_try; /* jump into the fail handler */
6103             }}
6104             NOT_REACHED; /* NOTREACHED */
6105
6106         case TRIE_next_fail: /* we failed - try next alternative */
6107         {
6108             U8 *uc;
6109             if ( ST.jump ) {
6110                 /* undo any captures done in the tail part of a branch,
6111                  * e.g.
6112                  *    /(?:X(.)(.)|Y(.)).../
6113                  * where the trie just matches X then calls out to do the
6114                  * rest of the branch */
6115                 REGCP_UNWIND(ST.cp);
6116                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6117             }
6118             if (!--ST.accepted) {
6119                 DEBUG_EXECUTE_r({
6120                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6121                         depth,
6122                         PL_colors[4],
6123                         PL_colors[5] );
6124                 });
6125                 sayNO_SILENT;
6126             }
6127             {
6128                 /* Find next-highest word to process.  Note that this code
6129                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6130                 U16 min = 0;
6131                 U16 word;
6132                 U16 const nextword = ST.nextword;
6133                 reg_trie_wordinfo * const wordinfo
6134                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6135                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6136                     if (word > nextword && (!min || word < min))
6137                         min = word;
6138                 }
6139                 ST.nextword = min;
6140             }
6141
6142           trie_first_try:
6143             if (do_cutgroup) {
6144                 do_cutgroup = 0;
6145                 no_final = 0;
6146             }
6147
6148             if ( ST.jump ) {
6149                 ST.lastparen = rex->lastparen;
6150                 ST.lastcloseparen = rex->lastcloseparen;
6151                 REGCP_SET(ST.cp);
6152             }
6153
6154             /* find start char of end of current word */
6155             {
6156                 U32 chars; /* how many chars to skip */
6157                 reg_trie_data * const trie
6158                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6159
6160                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6161                             >=  ST.firstchars);
6162                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6163                             - ST.firstchars;
6164                 uc = ST.firstpos;
6165
6166                 if (ST.longfold) {
6167                     /* the hard option - fold each char in turn and find
6168                      * its folded length (which may be different */
6169                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6170                     STRLEN foldlen;
6171                     STRLEN len;
6172                     UV uvc;
6173                     U8 *uscan;
6174
6175                     while (chars) {
6176                         if (utf8_target) {
6177                             /* XXX This assumes the length is well-formed, as
6178                              * does the UTF8SKIP below */
6179                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6180                                                     uniflags);
6181                             uc += len;
6182                         }
6183                         else {
6184                             uvc = *uc;
6185                             uc++;
6186                         }
6187                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6188                         uscan = foldbuf;
6189                         while (foldlen) {
6190                             if (!--chars)
6191                                 break;
6192                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6193                                                  uniflags);
6194                             uscan += len;
6195                             foldlen -= len;
6196                         }
6197                     }
6198                 }
6199                 else {
6200                     if (utf8_target)
6201                         while (chars--)
6202                             uc += UTF8SKIP(uc);
6203                     else
6204                         uc += chars;
6205                 }
6206             }
6207
6208             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6209                             ? ST.jump[ST.nextword]
6210                             : NEXT_OFF(ST.me));
6211
6212             DEBUG_EXECUTE_r({
6213                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6214                     depth,
6215                     PL_colors[4],
6216                     ST.nextword,
6217                     PL_colors[5]
6218                     );
6219             });
6220
6221             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6222                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6223                                 script_run_begin);
6224                 NOT_REACHED; /* NOTREACHED */
6225             }
6226             /* only one choice left - just continue */
6227             DEBUG_EXECUTE_r({
6228                 AV *const trie_words
6229                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6230                 SV ** const tmp = trie_words
6231                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6232                 SV *sv= tmp ? sv_newmortal() : NULL;
6233
6234                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6235                     depth, PL_colors[4],
6236                     ST.nextword,
6237                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6238                             PL_colors[0], PL_colors[1],
6239                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6240                         )
6241                     : "not compiled under -Dr",
6242                     PL_colors[5] );
6243             });
6244
6245             locinput = (char*)uc;
6246             continue; /* execute rest of RE */
6247             /* NOTREACHED */
6248         }
6249 #undef  ST
6250
6251         case EXACTL:             /*  /abc/l       */
6252             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6253
6254             /* Complete checking would involve going through every character
6255              * matched by the string to see if any is above latin1.  But the
6256              * comparision otherwise might very well be a fast assembly
6257              * language routine, and I (khw) don't think slowing things down
6258              * just to check for this warning is worth it.  So this just checks
6259              * the first character */
6260             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6261                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6262             }
6263             goto do_exact;
6264         case EXACT_ONLY8:
6265             if (! utf8_target) {
6266                 sayNO;
6267             }
6268             /* FALLTHROUGH */
6269         case EXACT: {            /*  /abc/        */
6270             char *s;
6271           do_exact:
6272             s = STRING(scan);
6273             ln = STR_LEN(scan);
6274             if (utf8_target != is_utf8_pat) {
6275                 /* The target and the pattern have differing utf8ness. */
6276                 char *l = locinput;
6277                 const char * const e = s + ln;
6278
6279                 if (utf8_target) {
6280                     /* The target is utf8, the pattern is not utf8.
6281                      * Above-Latin1 code points can't match the pattern;
6282                      * invariants match exactly, and the other Latin1 ones need
6283                      * to be downgraded to a single byte in order to do the
6284                      * comparison.  (If we could be confident that the target
6285                      * is not malformed, this could be refactored to have fewer
6286                      * tests by just assuming that if the first bytes match, it
6287                      * is an invariant, but there are tests in the test suite
6288                      * dealing with (??{...}) which violate this) */
6289                     while (s < e) {
6290                         if (   l >= loceol
6291                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6292                         {
6293                             sayNO;
6294                         }
6295                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6296                             if (*l != *s) {
6297                                 sayNO;
6298                             }
6299                             l++;
6300                         }
6301                         else {
6302                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6303                             {
6304                                 sayNO;
6305                             }
6306                             l += 2;
6307                         }
6308                         s++;
6309                     }
6310                 }
6311                 else {
6312                     /* The target is not utf8, the pattern is utf8. */
6313                     while (s < e) {
6314                         if (   l >= loceol
6315                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6316                         {
6317                             sayNO;
6318                         }
6319                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6320                             if (*s != *l) {
6321                                 sayNO;
6322                             }
6323                             s++;
6324                         }
6325                         else {
6326                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6327                             {
6328                                 sayNO;
6329                             }
6330                             s += 2;
6331                         }
6332                         l++;
6333                     }
6334                 }
6335                 locinput = l;
6336             }
6337             else {
6338                 /* The target and the pattern have the same utf8ness. */
6339                 /* Inline the first character, for speed. */
6340                 if (   loceol - locinput < ln
6341                     || UCHARAT(s) != nextchr
6342                     || (ln > 1 && memNE(s, locinput, ln)))
6343                 {
6344                     sayNO;
6345                 }
6346                 locinput += ln;
6347             }
6348             break;
6349             }
6350
6351         case EXACTFL:            /*  /abc/il      */
6352           {
6353             re_fold_t folder;
6354             const U8 * fold_array;
6355             const char * s;
6356             U32 fold_utf8_flags;
6357
6358             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6359             folder = foldEQ_locale;
6360             fold_array = PL_fold_locale;
6361             fold_utf8_flags = FOLDEQ_LOCALE;
6362             goto do_exactf;
6363
6364         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6365                                       is effectively /u; hence to match, target
6366                                       must be UTF-8. */
6367             if (! utf8_target) {
6368                 sayNO;
6369             }
6370             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6371                                              | FOLDEQ_S2_FOLDS_SANE;
6372             folder = foldEQ_latin1_s2_folded;
6373             fold_array = PL_fold_latin1;
6374             goto do_exactf;
6375
6376         case EXACTFU_ONLY8:      /* /abc/iu with something in /abc/ > 255 */
6377             if (! utf8_target) {
6378                 sayNO;
6379             }
6380             assert(is_utf8_pat);
6381             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6382             goto do_exactf;
6383
6384         case EXACTFUP:          /*  /foo/iu, and something is problematic in
6385                                     'foo' so can't take shortcuts. */
6386             assert(! is_utf8_pat);
6387             folder = foldEQ_latin1;
6388             fold_array = PL_fold_latin1;
6389             fold_utf8_flags = 0;
6390             goto do_exactf;
6391
6392         case EXACTFU:            /*  /abc/iu      */
6393             folder = foldEQ_latin1_s2_folded;
6394             fold_array = PL_fold_latin1;
6395             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6396             goto do_exactf;
6397
6398         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6399                                    patterns */
6400             assert(! is_utf8_pat);
6401             /* FALLTHROUGH */
6402         case EXACTFAA:            /*  /abc/iaa     */
6403             folder = foldEQ_latin1_s2_folded;
6404             fold_array = PL_fold_latin1;
6405             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6406             if (is_utf8_pat || ! utf8_target) {
6407
6408                 /* The possible presence of a MICRO SIGN in the pattern forbids
6409                  * us to view a non-UTF-8 pattern as folded when there is a
6410                  * UTF-8 target */
6411                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
6412                                   |FOLDEQ_S2_FOLDS_SANE;
6413             }
6414             goto do_exactf;
6415
6416
6417         case EXACTF:             /*  /abc/i    This node only generated for
6418                                                non-utf8 patterns */
6419             assert(! is_utf8_pat);
6420             folder = foldEQ;
6421             fold_array = PL_fold;
6422             fold_utf8_flags = 0;
6423
6424           do_exactf:
6425             s = STRING(scan);
6426             ln = STR_LEN(scan);
6427
6428             if (   utf8_target
6429                 || is_utf8_pat
6430                 || state_num == EXACTFUP
6431                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6432             {
6433               /* Either target or the pattern are utf8, or has the issue where
6434                * the fold lengths may differ. */
6435                 const char * const l = locinput;
6436                 char *e = loceol;
6437
6438                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
6439                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
6440                 {
6441                     sayNO;
6442                 }
6443                 locinput = e;
6444                 break;
6445             }
6446
6447             /* Neither the target nor the pattern are utf8 */
6448             if (UCHARAT(s) != nextchr
6449                 && !NEXTCHR_IS_EOS
6450                 && UCHARAT(s) != fold_array[nextchr])
6451             {
6452                 sayNO;
6453             }
6454             if (loceol - locinput < ln)
6455                 sayNO;
6456             if (ln > 1 && ! folder(locinput, s, ln))
6457                 sayNO;
6458             locinput += ln;
6459             break;
6460         }
6461
6462         case NBOUNDL: /*  /\B/l  */
6463             to_complement = 1;
6464             /* FALLTHROUGH */
6465
6466         case BOUNDL:  /*  /\b/l  */
6467         {
6468             bool b1, b2;
6469             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6470
6471             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6472                 if (! IN_UTF8_CTYPE_LOCALE) {
6473                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6474                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6475                 }
6476                 goto boundu;
6477             }
6478
6479             if (utf8_target) {
6480                 if (locinput == reginfo->strbeg)
6481                     b1 = isWORDCHAR_LC('\n');
6482                 else {
6483                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6484                                                         (U8*)(reginfo->strbeg)),
6485                                                  (U8*)(reginfo->strend));
6486                 }
6487                 b2 = (NEXTCHR_IS_EOS)
6488                     ? isWORDCHAR_LC('\n')
6489                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6490                                               (U8*) reginfo->strend);
6491             }
6492             else { /* Here the string isn't utf8 */
6493                 b1 = (locinput == reginfo->strbeg)
6494                      ? isWORDCHAR_LC('\n')
6495                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6496                 b2 = (NEXTCHR_IS_EOS)
6497                     ? isWORDCHAR_LC('\n')
6498                     : isWORDCHAR_LC(nextchr);
6499             }
6500             if (to_complement ^ (b1 == b2)) {
6501                 sayNO;
6502             }
6503             break;
6504         }
6505
6506         case NBOUND:  /*  /\B/   */
6507             to_complement = 1;
6508             /* FALLTHROUGH */
6509
6510         case BOUND:   /*  /\b/   */
6511             if (utf8_target) {
6512                 goto bound_utf8;
6513             }
6514             goto bound_ascii_match_only;
6515
6516         case NBOUNDA: /*  /\B/a  */
6517             to_complement = 1;
6518             /* FALLTHROUGH */
6519
6520         case BOUNDA:  /*  /\b/a  */
6521         {
6522             bool b1, b2;
6523
6524           bound_ascii_match_only:
6525             /* Here the string isn't utf8, or is utf8 and only ascii characters
6526              * are to match \w.  In the latter case looking at the byte just
6527              * prior to the current one may be just the final byte of a
6528              * multi-byte character.  This is ok.  There are two cases:
6529              * 1) it is a single byte character, and then the test is doing
6530              *    just what it's supposed to.
6531              * 2) it is a multi-byte character, in which case the final byte is
6532              *    never mistakable for ASCII, and so the test will say it is
6533              *    not a word character, which is the correct answer. */
6534             b1 = (locinput == reginfo->strbeg)
6535                  ? isWORDCHAR_A('\n')
6536                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6537             b2 = (NEXTCHR_IS_EOS)
6538                 ? isWORDCHAR_A('\n')
6539                 : isWORDCHAR_A(nextchr);
6540             if (to_complement ^ (b1 == b2)) {
6541                 sayNO;
6542             }
6543             break;
6544         }
6545
6546         case NBOUNDU: /*  /\B/u  */
6547             to_complement = 1;
6548             /* FALLTHROUGH */
6549
6550         case BOUNDU:  /*  /\b/u  */
6551
6552           boundu:
6553             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6554                 match = FALSE;
6555             }
6556             else if (utf8_target) {
6557               bound_utf8:
6558                 switch((bound_type) FLAGS(scan)) {
6559                     case TRADITIONAL_BOUND:
6560                     {
6561                         bool b1, b2;
6562                         b1 = (locinput == reginfo->strbeg)
6563                              ? 0 /* isWORDCHAR_L1('\n') */
6564                              : isWORDCHAR_utf8_safe(
6565                                                reghop3((U8*)locinput,
6566                                                        -1,
6567                                                        (U8*)(reginfo->strbeg)),
6568                                                     (U8*) reginfo->strend);
6569                         b2 = (NEXTCHR_IS_EOS)
6570                             ? 0 /* isWORDCHAR_L1('\n') */
6571                             : isWORDCHAR_utf8_safe((U8*)locinput,
6572                                                    (U8*) reginfo->strend);
6573                         match = cBOOL(b1 != b2);
6574                         break;
6575                     }
6576                     case GCB_BOUND:
6577                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6578                             match = TRUE; /* GCB always matches at begin and
6579                                              end */
6580                         }
6581                         else {
6582                             /* Find the gcb values of previous and current
6583                              * chars, then see if is a break point */
6584                             match = isGCB(getGCB_VAL_UTF8(
6585                                                 reghop3((U8*)locinput,
6586                                                         -1,
6587                                                         (U8*)(reginfo->strbeg)),
6588                                                 (U8*) reginfo->strend),
6589                                           getGCB_VAL_UTF8((U8*) locinput,
6590                                                         (U8*) reginfo->strend),
6591                                           (U8*) reginfo->strbeg,
6592                                           (U8*) locinput,
6593                                           utf8_target);
6594                         }
6595                         break;
6596
6597                     case LB_BOUND:
6598                         if (locinput == reginfo->strbeg) {
6599                             match = FALSE;
6600                         }
6601                         else if (NEXTCHR_IS_EOS) {
6602                             match = TRUE;
6603                         }
6604                         else {
6605                             match = isLB(getLB_VAL_UTF8(
6606                                                 reghop3((U8*)locinput,
6607                                                         -1,
6608                                                         (U8*)(reginfo->strbeg)),
6609                                                 (U8*) reginfo->strend),
6610                                           getLB_VAL_UTF8((U8*) locinput,
6611                                                         (U8*) reginfo->strend),
6612                                           (U8*) reginfo->strbeg,
6613                                           (U8*) locinput,
6614                                           (U8*) reginfo->strend,
6615                                           utf8_target);
6616                         }
6617                         break;
6618
6619                     case SB_BOUND: /* Always matches at begin and end */
6620                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6621                             match = TRUE;
6622                         }
6623                         else {
6624                             match = isSB(getSB_VAL_UTF8(
6625                                                 reghop3((U8*)locinput,
6626                                                         -1,
6627                                                         (U8*)(reginfo->strbeg)),
6628                                                 (U8*) reginfo->strend),
6629                                           getSB_VAL_UTF8((U8*) locinput,
6630                                                         (U8*) reginfo->strend),
6631                                           (U8*) reginfo->strbeg,
6632                                           (U8*) locinput,
6633                                           (U8*) reginfo->strend,
6634                                           utf8_target);
6635                         }
6636                         break;
6637
6638                     case WB_BOUND:
6639                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6640                             match = TRUE;
6641                         }
6642                         else {
6643                             match = isWB(WB_UNKNOWN,
6644                                          getWB_VAL_UTF8(
6645                                                 reghop3((U8*)locinput,
6646                                                         -1,
6647                                                         (U8*)(reginfo->strbeg)),
6648                                                 (U8*) reginfo->strend),
6649                                           getWB_VAL_UTF8((U8*) locinput,
6650                                                         (U8*) reginfo->strend),
6651                                           (U8*) reginfo->strbeg,
6652                                           (U8*) locinput,
6653                                           (U8*) reginfo->strend,
6654                                           utf8_target);
6655                         }
6656                         break;
6657                 }
6658             }
6659             else {  /* Not utf8 target */
6660                 switch((bound_type) FLAGS(scan)) {
6661                     case TRADITIONAL_BOUND:
6662                     {
6663                         bool b1, b2;
6664                         b1 = (locinput == reginfo->strbeg)
6665                             ? 0 /* isWORDCHAR_L1('\n') */
6666                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6667                         b2 = (NEXTCHR_IS_EOS)
6668                             ? 0 /* isWORDCHAR_L1('\n') */
6669                             : isWORDCHAR_L1(nextchr);
6670                         match = cBOOL(b1 != b2);
6671                         break;
6672                     }
6673
6674                     case GCB_BOUND:
6675                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6676                             match = TRUE; /* GCB always matches at begin and
6677                                              end */
6678                         }
6679                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6680                                    range */
6681                             match =    UCHARAT(locinput - 1) != '\r'
6682                                     || UCHARAT(locinput) != '\n';
6683                         }
6684                         break;
6685
6686                     case LB_BOUND:
6687                         if (locinput == reginfo->strbeg) {
6688                             match = FALSE;
6689                         }
6690                         else if (NEXTCHR_IS_EOS) {
6691                             match = TRUE;
6692                         }
6693                         else {
6694                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6695                                          getLB_VAL_CP(UCHARAT(locinput)),
6696                                          (U8*) reginfo->strbeg,
6697                                          (U8*) locinput,
6698                                          (U8*) reginfo->strend,
6699                                          utf8_target);
6700                         }
6701                         break;
6702
6703                     case SB_BOUND: /* Always matches at begin and end */
6704                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6705                             match = TRUE;
6706                         }
6707                         else {
6708                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6709                                          getSB_VAL_CP(UCHARAT(locinput)),
6710                                          (U8*) reginfo->strbeg,
6711                                          (U8*) locinput,
6712                                          (U8*) reginfo->strend,
6713                                          utf8_target);
6714                         }
6715                         break;
6716
6717                     case WB_BOUND:
6718                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6719                             match = TRUE;
6720                         }
6721                         else {
6722                             match = isWB(WB_UNKNOWN,
6723                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6724                                          getWB_VAL_CP(UCHARAT(locinput)),
6725                                          (U8*) reginfo->strbeg,
6726                                          (U8*) locinput,
6727                                          (U8*) reginfo->strend,
6728                                          utf8_target);
6729                         }
6730                         break;
6731                 }
6732             }
6733
6734             if (to_complement ^ ! match) {
6735                 sayNO;
6736             }
6737             break;
6738
6739         case ANYOFPOSIXL:
6740         case ANYOFL:  /*  /[abc]/l      */
6741             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6742
6743             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6744             {
6745               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6746             }
6747             /* FALLTHROUGH */
6748         case ANYOFD:  /*   /[abc]/d       */
6749         case ANYOF:  /*   /[abc]/       */
6750             if (NEXTCHR_IS_EOS || locinput >= loceol)
6751                 sayNO;
6752             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
6753                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
6754             {
6755                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
6756                     sayNO;
6757                 }
6758                 locinput++;
6759             }
6760             else {
6761                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6762                                                                    utf8_target))
6763                 {
6764                     sayNO;
6765                 }
6766                 goto increment_locinput;
6767             }
6768             break;
6769
6770         case ANYOFM:
6771             if (   NEXTCHR_IS_EOS
6772                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
6773                 || locinput >= loceol)
6774             {
6775                 sayNO;
6776             }
6777             locinput++; /* ANYOFM is always single byte */
6778             break;
6779
6780         case NANYOFM:
6781             if (   NEXTCHR_IS_EOS
6782                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
6783                 || locinput >= loceol)
6784             {
6785                 sayNO;
6786             }
6787             goto increment_locinput;
6788             break;
6789
6790         case ANYOFH:
6791             if (   ! utf8_target
6792                 ||   NEXTCHR_IS_EOS
6793                 ||  (   ANYOF_FLAGS(scan) != 0
6794                      && ANYOF_FLAGS(scan) != (U8) *locinput)
6795                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6796                                                                    utf8_target))
6797             {
6798                 sayNO;
6799             }
6800             goto increment_locinput;
6801             break;
6802
6803         /* The argument (FLAGS) to all the POSIX node types is the class number
6804          * */
6805
6806         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6807             to_complement = 1;
6808             /* FALLTHROUGH */
6809
6810         case POSIXL:    /* \w or [:punct:] etc. under /l */
6811             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6812             if (NEXTCHR_IS_EOS || locinput >= loceol)
6813                 sayNO;
6814
6815             /* Use isFOO_lc() for characters within Latin1.  (Note that
6816              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6817              * wouldn't be invariant) */
6818             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6819                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6820                     sayNO;
6821                 }
6822
6823                 locinput++;
6824                 break;
6825             }
6826
6827             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6828                 /* An above Latin-1 code point, or malformed */
6829                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6830                                                        reginfo->strend);
6831                 goto utf8_posix_above_latin1;
6832             }
6833
6834             /* Here is a UTF-8 variant code point below 256 and the target is
6835              * UTF-8 */
6836             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6837                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6838                                             *(locinput + 1))))))
6839             {
6840                 sayNO;
6841             }
6842
6843             goto increment_locinput;
6844
6845         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6846             to_complement = 1;
6847             /* FALLTHROUGH */
6848
6849         case POSIXD:    /* \w or [:punct:] etc. under /d */
6850             if (utf8_target) {
6851                 goto utf8_posix;
6852             }
6853             goto posixa;
6854
6855         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6856
6857             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6858                 sayNO;
6859             }
6860
6861             /* All UTF-8 variants match */
6862             if (! UTF8_IS_INVARIANT(nextchr)) {
6863                 goto increment_locinput;
6864             }
6865
6866             to_complement = 1;
6867             goto join_nposixa;
6868
6869         case POSIXA:    /* \w or [:punct:] etc. under /a */
6870
6871           posixa:
6872             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6873              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6874              * character is a single byte */
6875
6876             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6877                 sayNO;
6878             }
6879
6880           join_nposixa:
6881
6882             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6883                                                                 FLAGS(scan)))))
6884             {
6885                 sayNO;
6886             }
6887
6888             /* Here we are either not in utf8, or we matched a utf8-invariant,
6889              * so the next char is the next byte */
6890             locinput++;
6891             break;
6892
6893         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6894             to_complement = 1;
6895             /* FALLTHROUGH */
6896
6897         case POSIXU:    /* \w or [:punct:] etc. under /u */
6898           utf8_posix:
6899             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6900                 sayNO;
6901             }
6902
6903             /* Use _generic_isCC() for characters within Latin1.  (Note that
6904              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6905              * wouldn't be invariant) */
6906             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6907                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6908                                                            FLAGS(scan)))))
6909                 {
6910                     sayNO;
6911                 }
6912                 locinput++;
6913             }
6914             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6915                 if (! (to_complement
6916                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6917                                                                *(locinput + 1)),
6918                                              FLAGS(scan)))))
6919                 {
6920                     sayNO;
6921                 }
6922                 locinput += 2;
6923             }
6924             else {  /* Handle above Latin-1 code points */
6925               utf8_posix_above_latin1:
6926                 classnum = (_char_class_number) FLAGS(scan);
6927                 switch (classnum) {
6928                     default:
6929                         if (! (to_complement
6930                            ^ cBOOL(_invlist_contains_cp(
6931                                       PL_XPosix_ptrs[classnum],
6932                                       utf8_to_uvchr_buf((U8 *) locinput,
6933                                                         (U8 *) reginfo->strend,
6934                                                         NULL)))))
6935                         {
6936                             sayNO;
6937                         }
6938                         break;
6939                     case _CC_ENUM_SPACE:
6940                         if (! (to_complement
6941                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6942                         {
6943                             sayNO;
6944                         }
6945                         break;
6946                     case _CC_ENUM_BLANK:
6947                         if (! (to_complement
6948                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6949                         {
6950                             sayNO;
6951                         }
6952                         break;
6953                     case _CC_ENUM_XDIGIT:
6954                         if (! (to_complement
6955                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6956                         {
6957                             sayNO;
6958                         }
6959                         break;
6960                     case _CC_ENUM_VERTSPACE:
6961                         if (! (to_complement
6962                                         ^ cBOOL(is_VERTWS_high(locinput))))
6963                         {
6964                             sayNO;
6965                         }
6966                         break;
6967                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6968                     case _CC_ENUM_ASCII:
6969                         if (! to_complement) {
6970                             sayNO;
6971                         }
6972                         break;
6973                 }
6974                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
6975             }
6976             break;
6977
6978         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6979                        a Unicode extended Grapheme Cluster */
6980             if (NEXTCHR_IS_EOS || locinput >= loceol)
6981                 sayNO;
6982             if  (! utf8_target) {
6983
6984                 /* Match either CR LF  or '.', as all the other possibilities
6985                  * require utf8 */
6986                 locinput++;         /* Match the . or CR */
6987                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6988                                        match the LF */
6989                     && locinput <  loceol
6990                     && UCHARAT(locinput) == '\n')
6991                 {
6992                     locinput++;
6993                 }
6994             }
6995             else {
6996
6997                 /* Get the gcb type for the current character */
6998                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6999                                                        (U8*) reginfo->strend);
7000
7001                 /* Then scan through the input until we get to the first
7002                  * character whose type is supposed to be a gcb with the
7003                  * current character.  (There is always a break at the
7004                  * end-of-input) */
7005                 locinput += UTF8SKIP(locinput);
7006                 while (locinput < loceol) {
7007                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7008                                                          (U8*) reginfo->strend);
7009                     if (isGCB(prev_gcb, cur_gcb,
7010                               (U8*) reginfo->strbeg, (U8*) locinput,
7011                               utf8_target))
7012                     {
7013                         break;
7014                     }
7015
7016                     prev_gcb = cur_gcb;
7017                     locinput += UTF8SKIP(locinput);
7018                 }
7019
7020
7021             }
7022             break;
7023
7024         case NREFFL:  /*  /\g{name}/il  */
7025         {   /* The capture buffer cases.  The ones beginning with N for the
7026                named buffers just convert to the equivalent numbered and
7027                pretend they were called as the corresponding numbered buffer
7028                op.  */
7029             /* don't initialize these in the declaration, it makes C++
7030                unhappy */
7031             const char *s;
7032             char type;
7033             re_fold_t folder;
7034             const U8 *fold_array;
7035             UV utf8_fold_flags;
7036
7037             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7038             folder = foldEQ_locale;
7039             fold_array = PL_fold_locale;
7040             type = REFFL;
7041             utf8_fold_flags = FOLDEQ_LOCALE;
7042             goto do_nref;
7043
7044         case NREFFA:  /*  /\g{name}/iaa  */
7045             folder = foldEQ_latin1;
7046             fold_array = PL_fold_latin1;
7047             type = REFFA;
7048             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7049             goto do_nref;
7050
7051         case NREFFU:  /*  /\g{name}/iu  */
7052             folder = foldEQ_latin1;
7053             fold_array = PL_fold_latin1;
7054             type = REFFU;
7055             utf8_fold_flags = 0;
7056             goto do_nref;
7057
7058         case NREFF:  /*  /\g{name}/i  */
7059             folder = foldEQ;
7060             fold_array = PL_fold;
7061             type = REFF;
7062             utf8_fold_flags = 0;
7063             goto do_nref;
7064
7065         case NREF:  /*  /\g{name}/   */
7066             type = REF;
7067             folder = NULL;
7068             fold_array = NULL;
7069             utf8_fold_flags = 0;
7070           do_nref:
7071
7072             /* For the named back references, find the corresponding buffer
7073              * number */
7074             n = reg_check_named_buff_matched(rex,scan);
7075
7076             if ( ! n ) {
7077                 sayNO;
7078             }
7079             goto do_nref_ref_common;
7080
7081         case REFFL:  /*  /\1/il  */
7082             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7083             folder = foldEQ_locale;
7084             fold_array = PL_fold_locale;
7085             utf8_fold_flags = FOLDEQ_LOCALE;
7086             goto do_ref;
7087
7088         case REFFA:  /*  /\1/iaa  */
7089             folder = foldEQ_latin1;
7090             fold_array = PL_fold_latin1;
7091             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7092             goto do_ref;
7093
7094         case REFFU:  /*  /\1/iu  */
7095             folder = foldEQ_latin1;
7096             fold_array = PL_fold_latin1;
7097             utf8_fold_flags = 0;
7098             goto do_ref;
7099
7100         case REFF:  /*  /\1/i  */
7101             folder = foldEQ;
7102             fold_array = PL_fold;
7103             utf8_fold_flags = 0;
7104             goto do_ref;
7105
7106         case REF:  /*  /\1/    */
7107             folder = NULL;
7108             fold_array = NULL;
7109             utf8_fold_flags = 0;
7110
7111           do_ref:
7112             type = OP(scan);
7113             n = ARG(scan);  /* which paren pair */
7114
7115           do_nref_ref_common:
7116             ln = rex->offs[n].start;
7117             endref = rex->offs[n].end;
7118             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7119             if (rex->lastparen < n || ln == -1 || endref == -1)
7120                 sayNO;                  /* Do not match unless seen CLOSEn. */
7121             if (ln == endref)
7122                 break;
7123
7124             s = reginfo->strbeg + ln;
7125             if (type != REF     /* REF can do byte comparison */
7126                 && (utf8_target || type == REFFU || type == REFFL))
7127             {
7128                 char * limit = loceol;
7129
7130                 /* This call case insensitively compares the entire buffer
7131                     * at s, with the current input starting at locinput, but
7132                     * not going off the end given by loceol, and
7133                     * returns in <limit> upon success, how much of the
7134                     * current input was matched */
7135                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7136                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7137                 {
7138                     sayNO;
7139                 }
7140                 locinput = limit;
7141                 break;
7142             }
7143
7144             /* Not utf8:  Inline the first character, for speed. */
7145             if ( ! NEXTCHR_IS_EOS
7146                 && locinput < loceol
7147                 && UCHARAT(s) != nextchr
7148                 && (   type == REF
7149                     || UCHARAT(s) != fold_array[nextchr]))
7150             {
7151                 sayNO;
7152             }
7153             ln = endref - ln;
7154             if (locinput + ln > loceol)
7155                 sayNO;
7156             if (ln > 1 && (type == REF
7157                            ? memNE(s, locinput, ln)
7158                            : ! folder(locinput, s, ln)))
7159                 sayNO;
7160             locinput += ln;
7161             break;
7162         }
7163
7164         case NOTHING: /* null op; e.g. the 'nothing' following
7165                        * the '*' in m{(a+|b)*}' */
7166             break;
7167         case TAIL: /* placeholder while compiling (A|B|C) */
7168             break;
7169
7170 #undef  ST
7171 #define ST st->u.eval
7172 #define CUR_EVAL cur_eval->u.eval
7173
7174         {
7175             SV *ret;
7176             REGEXP *re_sv;
7177             regexp *re;
7178             regexp_internal *rei;
7179             regnode *startpoint;
7180             U32 arg;
7181
7182         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7183             arg= (U32)ARG(scan);
7184             if (cur_eval && cur_eval->locinput == locinput) {
7185                 if ( ++nochange_depth > max_nochange_depth )
7186                     Perl_croak(aTHX_
7187                         "Pattern subroutine nesting without pos change"
7188                         " exceeded limit in regex");
7189             } else {
7190                 nochange_depth = 0;
7191             }
7192             re_sv = rex_sv;
7193             re = rex;
7194             rei = rexi;
7195             startpoint = scan + ARG2L(scan);
7196             EVAL_CLOSE_PAREN_SET( st, arg );
7197             /* Detect infinite recursion
7198              *
7199              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7200              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7201              * So we track the position in the string we are at each time
7202              * we recurse and if we try to enter the same routine twice from
7203              * the same position we throw an error.
7204              */
7205             if ( rex->recurse_locinput[arg] == locinput ) {
7206                 /* FIXME: we should show the regop that is failing as part
7207                  * of the error message. */
7208                 Perl_croak(aTHX_ "Infinite recursion in regex");
7209             } else {
7210                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7211                 rex->recurse_locinput[arg]= locinput;
7212
7213                 DEBUG_r({
7214                     GET_RE_DEBUG_FLAGS_DECL;
7215                     DEBUG_STACK_r({
7216                         Perl_re_exec_indentf( aTHX_
7217                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7218                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7219                         );
7220                     });
7221                 });
7222             }
7223
7224             /* Save all the positions seen so far. */
7225             ST.cp = regcppush(rex, 0, maxopenparen);
7226             REGCP_SET(ST.lastcp);
7227
7228             /* and then jump to the code we share with EVAL */
7229             goto eval_recurse_doit;
7230             /* NOTREACHED */
7231
7232         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7233             if (cur_eval && cur_eval->locinput==locinput) {
7234                 if ( ++nochange_depth > max_nochange_depth )
7235                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7236             } else {
7237                 nochange_depth = 0;
7238             }
7239             {
7240                 /* execute the code in the {...} */
7241
7242                 dSP;
7243                 IV before;
7244                 OP * const oop = PL_op;
7245                 COP * const ocurcop = PL_curcop;
7246                 OP *nop;
7247                 CV *newcv;
7248
7249                 /* save *all* paren positions */
7250                 regcppush(rex, 0, maxopenparen);
7251                 REGCP_SET(ST.lastcp);
7252
7253                 if (!caller_cv)
7254                     caller_cv = find_runcv(NULL);
7255
7256                 n = ARG(scan);
7257
7258                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7259                     newcv = (ReANY(
7260                                     (REGEXP*)(rexi->data->data[n])
7261                             ))->qr_anoncv;
7262                     nop = (OP*)rexi->data->data[n+1];
7263                 }
7264                 else if (rexi->data->what[n] == 'l') { /* literal code */
7265                     newcv = caller_cv;
7266                     nop = (OP*)rexi->data->data[n];
7267                     assert(CvDEPTH(newcv));
7268                 }
7269                 else {
7270                     /* literal with own CV */
7271                     assert(rexi->data->what[n] == 'L');
7272                     newcv = rex->qr_anoncv;
7273                     nop = (OP*)rexi->data->data[n];
7274                 }
7275
7276                 /* Some notes about MULTICALL and the context and save stacks.
7277                  *
7278                  * In something like
7279                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7280                  * since codeblocks don't introduce a new scope (so that
7281                  * local() etc accumulate), at the end of a successful
7282                  * match there will be a SAVEt_CLEARSV on the savestack
7283                  * for each of $x, $y, $z. If the three code blocks above
7284                  * happen to have come from different CVs (e.g. via
7285                  * embedded qr//s), then we must ensure that during any
7286                  * savestack unwinding, PL_comppad always points to the
7287                  * right pad at each moment. We achieve this by
7288                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7289                  * there is a change of pad.
7290                  * In theory whenever we call a code block, we should
7291                  * push a CXt_SUB context, then pop it on return from
7292                  * that code block. This causes a bit of an issue in that
7293                  * normally popping a context also clears the savestack
7294                  * back to cx->blk_oldsaveix, but here we specifically
7295                  * don't want to clear the save stack on exit from the
7296                  * code block.
7297                  * Also for efficiency we don't want to keep pushing and
7298                  * popping the single SUB context as we backtrack etc.
7299                  * So instead, we push a single context the first time
7300                  * we need, it, then hang onto it until the end of this
7301                  * function. Whenever we encounter a new code block, we
7302                  * update the CV etc if that's changed. During the times
7303                  * in this function where we're not executing a code
7304                  * block, having the SUB context still there is a bit
7305                  * naughty - but we hope that no-one notices.
7306                  * When the SUB context is initially pushed, we fake up
7307                  * cx->blk_oldsaveix to be as if we'd pushed this context
7308                  * on first entry to S_regmatch rather than at some random
7309                  * point during the regexe execution. That way if we
7310                  * croak, popping the context stack will ensure that
7311                  * *everything* SAVEd by this function is undone and then
7312                  * the context popped, rather than e.g., popping the
7313                  * context (and restoring the original PL_comppad) then
7314                  * popping more of the savestack and restoring a bad
7315                  * PL_comppad.
7316                  */
7317
7318                 /* If this is the first EVAL, push a MULTICALL. On
7319                  * subsequent calls, if we're executing a different CV, or
7320                  * if PL_comppad has got messed up from backtracking
7321                  * through SAVECOMPPADs, then refresh the context.
7322                  */
7323                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7324                 {
7325                     U8 flags = (CXp_SUB_RE |
7326                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7327                     SAVECOMPPAD();
7328                     if (last_pushed_cv) {
7329                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7330                     }
7331                     else {
7332                         PUSH_MULTICALL_FLAGS(newcv, flags);
7333                     }
7334                     /* see notes above */
7335                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7336
7337                     last_pushed_cv = newcv;
7338                 }
7339                 else {
7340                     /* these assignments are just to silence compiler
7341                      * warnings */
7342                     multicall_cop = NULL;
7343                 }
7344                 last_pad = PL_comppad;
7345
7346                 /* the initial nextstate you would normally execute
7347                  * at the start of an eval (which would cause error
7348                  * messages to come from the eval), may be optimised
7349                  * away from the execution path in the regex code blocks;
7350                  * so manually set PL_curcop to it initially */
7351                 {
7352                     OP *o = cUNOPx(nop)->op_first;
7353                     assert(o->op_type == OP_NULL);
7354                     if (o->op_targ == OP_SCOPE) {
7355                         o = cUNOPo->op_first;
7356                     }
7357                     else {
7358                         assert(o->op_targ == OP_LEAVE);
7359                         o = cUNOPo->op_first;
7360                         assert(o->op_type == OP_ENTER);
7361                         o = OpSIBLING(o);
7362                     }
7363
7364                     if (o->op_type != OP_STUB) {
7365                         assert(    o->op_type == OP_NEXTSTATE
7366                                 || o->op_type == OP_DBSTATE
7367                                 || (o->op_type == OP_NULL
7368                                     &&  (  o->op_targ == OP_NEXTSTATE
7369                                         || o->op_targ == OP_DBSTATE
7370                                         )
7371                                     )
7372                         );
7373                         PL_curcop = (COP*)o;
7374                     }
7375                 }
7376                 nop = nop->op_next;
7377
7378                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7379                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7380
7381                 rex->offs[0].end = locinput - reginfo->strbeg;
7382                 if (reginfo->info_aux_eval->pos_magic)
7383                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7384                                   reginfo->sv, reginfo->strbeg,
7385                                   locinput - reginfo->strbeg);
7386
7387                 if (sv_yes_mark) {
7388                     SV *sv_mrk = get_sv("REGMARK", 1);
7389                     sv_setsv(sv_mrk, sv_yes_mark);
7390                 }
7391
7392                 /* we don't use MULTICALL here as we want to call the
7393                  * first op of the block of interest, rather than the
7394                  * first op of the sub. Also, we don't want to free
7395                  * the savestack frame */
7396                 before = (IV)(SP-PL_stack_base);
7397                 PL_op = nop;
7398                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7399                 SPAGAIN;
7400                 if ((IV)(SP-PL_stack_base) == before)
7401                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7402                 else {
7403                     ret = POPs;
7404                     PUTBACK;
7405                 }
7406
7407                 /* before restoring everything, evaluate the returned
7408                  * value, so that 'uninit' warnings don't use the wrong
7409                  * PL_op or pad. Also need to process any magic vars
7410                  * (e.g. $1) *before* parentheses are restored */
7411
7412                 PL_op = NULL;
7413
7414                 re_sv = NULL;
7415                 if (logical == 0) {       /*   (?{})/   */
7416                     SV *replsv = save_scalar(PL_replgv);
7417                     sv_setsv(replsv, ret); /* $^R */
7418                     SvSETMAGIC(replsv);
7419                 }
7420                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7421                     sw = cBOOL(SvTRUE_NN(ret));
7422                     logical = 0;
7423                 }
7424                 else {                   /*  /(??{})  */
7425                     /*  if its overloaded, let the regex compiler handle
7426                      *  it; otherwise extract regex, or stringify  */
7427                     if (SvGMAGICAL(ret))
7428                         ret = sv_mortalcopy(ret);
7429                     if (!SvAMAGIC(ret)) {
7430                         SV *sv = ret;
7431                         if (SvROK(sv))
7432                             sv = SvRV(sv);
7433                         if (SvTYPE(sv) == SVt_REGEXP)
7434                             re_sv = (REGEXP*) sv;
7435                         else if (SvSMAGICAL(ret)) {
7436                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7437                             if (mg)
7438                                 re_sv = (REGEXP *) mg->mg_obj;
7439                         }
7440
7441                         /* force any undef warnings here */
7442                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7443                             ret = sv_mortalcopy(ret);
7444                             (void) SvPV_force_nolen(ret);
7445                         }
7446                     }
7447
7448                 }
7449
7450                 /* *** Note that at this point we don't restore
7451                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7452                  * be used again soon. This is safe as long as nothing
7453                  * in the regexp code uses the pad ! */
7454                 PL_op = oop;
7455                 PL_curcop = ocurcop;
7456                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7457                 PL_curpm_under = PL_curpm;
7458                 PL_curpm = PL_reg_curpm;
7459
7460                 if (logical != 2) {
7461                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
7462                                     script_run_begin);
7463                     /* NOTREACHED */
7464                 }
7465             }
7466
7467                 /* only /(??{})/  from now on */
7468                 logical = 0;
7469                 {
7470                     /* extract RE object from returned value; compiling if
7471                      * necessary */
7472
7473                     if (re_sv) {
7474                         re_sv = reg_temp_copy(NULL, re_sv);
7475                     }
7476                     else {
7477                         U32 pm_flags = 0;
7478
7479                         if (SvUTF8(ret) && IN_BYTES) {
7480                             /* In use 'bytes': make a copy of the octet
7481                              * sequence, but without the flag on */
7482                             STRLEN len;
7483                             const char *const p = SvPV(ret, len);
7484                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7485                         }
7486                         if (rex->intflags & PREGf_USE_RE_EVAL)
7487                             pm_flags |= PMf_USE_RE_EVAL;
7488
7489                         /* if we got here, it should be an engine which
7490                          * supports compiling code blocks and stuff */
7491                         assert(rex->engine && rex->engine->op_comp);
7492                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7493                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7494                                     rex->engine, NULL, NULL,
7495                                     /* copy /msixn etc to inner pattern */
7496                                     ARG2L(scan),
7497                                     pm_flags);
7498
7499                         if (!(SvFLAGS(ret)
7500                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7501                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7502                             /* This isn't a first class regexp. Instead, it's
7503                                caching a regexp onto an existing, Perl visible
7504                                scalar.  */
7505                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7506                         }
7507                     }
7508                     SAVEFREESV(re_sv);
7509                     re = ReANY(re_sv);
7510                 }
7511                 RXp_MATCH_COPIED_off(re);
7512                 re->subbeg = rex->subbeg;
7513                 re->sublen = rex->sublen;
7514                 re->suboffset = rex->suboffset;
7515                 re->subcoffset = rex->subcoffset;
7516                 re->lastparen = 0;
7517                 re->lastcloseparen = 0;
7518                 rei = RXi_GET(re);
7519                 DEBUG_EXECUTE_r(
7520                     debug_start_match(re_sv, utf8_target, locinput,
7521                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7522                 );
7523                 startpoint = rei->program + 1;
7524                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7525                                              * close_paren only for GOSUB */
7526                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7527                 /* Save all the seen positions so far. */
7528                 ST.cp = regcppush(rex, 0, maxopenparen);
7529                 REGCP_SET(ST.lastcp);
7530                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7531                 maxopenparen = 0;
7532                 /* run the pattern returned from (??{...}) */
7533
7534               eval_recurse_doit: /* Share code with GOSUB below this line
7535                             * At this point we expect the stack context to be
7536                             * set up correctly */
7537
7538                 /* invalidate the S-L poscache. We're now executing a
7539                  * different set of WHILEM ops (and their associated
7540                  * indexes) against the same string, so the bits in the
7541                  * cache are meaningless. Setting maxiter to zero forces
7542                  * the cache to be invalidated and zeroed before reuse.
7543                  * XXX This is too dramatic a measure. Ideally we should
7544                  * save the old cache and restore when running the outer
7545                  * pattern again */
7546                 reginfo->poscache_maxiter = 0;
7547
7548                 /* the new regexp might have a different is_utf8_pat than we do */
7549                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7550
7551                 ST.prev_rex = rex_sv;
7552                 ST.prev_curlyx = cur_curlyx;
7553                 rex_sv = re_sv;
7554                 SET_reg_curpm(rex_sv);
7555                 rex = re;
7556                 rexi = rei;
7557                 cur_curlyx = NULL;
7558                 ST.B = next;
7559                 ST.prev_eval = cur_eval;
7560                 cur_eval = st;
7561                 /* now continue from first node in postoned RE */
7562                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
7563                                     loceol, script_run_begin);
7564                 NOT_REACHED; /* NOTREACHED */
7565         }
7566
7567         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7568             /* note: this is called twice; first after popping B, then A */
7569             DEBUG_STACK_r({
7570                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7571                     depth, cur_eval, ST.prev_eval);
7572             });
7573
7574 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7575             if ( cur_eval && CUR_EVAL.close_paren ) {\
7576                 DEBUG_STACK_r({ \
7577                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7578                         depth,    \
7579                         CUR_EVAL.close_paren - 1,\
7580                         cur_eval, \
7581                         VAL);     \
7582                 });               \
7583                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7584             }
7585
7586             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7587
7588             rex_sv = ST.prev_rex;
7589             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7590             SET_reg_curpm(rex_sv);
7591             rex = ReANY(rex_sv);
7592             rexi = RXi_GET(rex);
7593             {
7594                 /* preserve $^R across LEAVE's. See Bug 121070. */
7595                 SV *save_sv= GvSV(PL_replgv);
7596                 SV *replsv;
7597                 SvREFCNT_inc(save_sv);
7598                 regcpblow(ST.cp); /* LEAVE in disguise */
7599                 /* don't move this initialization up */
7600                 replsv = GvSV(PL_replgv);
7601                 sv_setsv(replsv, save_sv);
7602                 SvSETMAGIC(replsv);
7603                 SvREFCNT_dec(save_sv);
7604             }
7605             cur_eval = ST.prev_eval;
7606             cur_curlyx = ST.prev_curlyx;
7607
7608             /* Invalidate cache. See "invalidate" comment above. */
7609             reginfo->poscache_maxiter = 0;
7610             if ( nochange_depth )
7611                 nochange_depth--;
7612
7613             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7614             sayYES;
7615
7616
7617         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7618             REGCP_UNWIND(ST.lastcp);
7619             sayNO;
7620
7621         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7622             /* note: this is called twice; first after popping B, then A */
7623             DEBUG_STACK_r({
7624                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7625                     depth, cur_eval, ST.prev_eval);
7626             });
7627
7628             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7629
7630             rex_sv = ST.prev_rex;
7631             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7632             SET_reg_curpm(rex_sv);
7633             rex = ReANY(rex_sv);
7634             rexi = RXi_GET(rex);
7635
7636             REGCP_UNWIND(ST.lastcp);
7637             regcppop(rex, &maxopenparen);
7638             cur_eval = ST.prev_eval;
7639             cur_curlyx = ST.prev_curlyx;
7640
7641             /* Invalidate cache. See "invalidate" comment above. */
7642             reginfo->poscache_maxiter = 0;
7643             if ( nochange_depth )
7644                 nochange_depth--;
7645
7646             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7647             sayNO_SILENT;
7648 #undef ST
7649
7650         case OPEN: /*  (  */
7651             n = ARG(scan);  /* which paren pair */
7652             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7653             if (n > maxopenparen)
7654                 maxopenparen = n;
7655             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7656                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7657                 depth,
7658                 PTR2UV(rex),
7659                 PTR2UV(rex->offs),
7660                 (UV)n,
7661                 (IV)rex->offs[n].start_tmp,
7662                 (UV)maxopenparen
7663             ));
7664             lastopen = n;
7665             break;
7666
7667         case SROPEN: /*  (*SCRIPT_RUN:  */
7668             script_run_begin = (U8 *) locinput;
7669             break;
7670
7671
7672         case CLOSE:  /*  )  */
7673             n = ARG(scan);  /* which paren pair */
7674             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7675                              locinput - reginfo->strbeg);
7676             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7677                 goto fake_end;
7678
7679             break;
7680
7681         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7682
7683             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7684             {
7685                 sayNO;
7686             }
7687
7688             break;
7689
7690
7691         case ACCEPT:  /*  (*ACCEPT)  */
7692             if (scan->flags)
7693                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7694             if (ARG2L(scan)){
7695                 regnode *cursor;
7696                 for (cursor=scan;
7697                      cursor && OP(cursor)!=END;
7698                      cursor=regnext(cursor))
7699                 {
7700                     if ( OP(cursor)==CLOSE ){
7701                         n = ARG(cursor);
7702                         if ( n <= lastopen ) {
7703                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7704                                              locinput - reginfo->strbeg);
7705                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7706                                 break;
7707                         }
7708                     }
7709                 }
7710             }
7711             goto fake_end;
7712             /* NOTREACHED */
7713
7714         case GROUPP:  /*  (?(1))  */
7715             n = ARG(scan);  /* which paren pair */
7716             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7717             break;
7718
7719         case NGROUPP:  /*  (?(<name>))  */
7720             /* reg_check_named_buff_matched returns 0 for no match */
7721             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7722             break;
7723
7724         case INSUBP:   /*  (?(R))  */
7725             n = ARG(scan);
7726             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7727              * of SCAN is already set up as matches a eval.close_paren */
7728             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7729             break;
7730
7731         case DEFINEP:  /*  (?(DEFINE))  */
7732             sw = 0;
7733             break;
7734
7735         case IFTHEN:   /*  (?(cond)A|B)  */
7736             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7737             if (sw)
7738                 next = NEXTOPER(NEXTOPER(scan));
7739             else {
7740                 next = scan + ARG(scan);
7741                 if (OP(next) == IFTHEN) /* Fake one. */
7742                     next = NEXTOPER(NEXTOPER(next));
7743             }
7744             break;
7745
7746         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7747             logical = scan->flags;
7748             break;
7749
7750 /*******************************************************************
7751
7752 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7753 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7754 STAR/PLUS/CURLY/CURLYN are used instead.)
7755
7756 A*B is compiled as <CURLYX><A><WHILEM><B>
7757
7758 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7759 state, which contains the current count, initialised to -1. It also sets
7760 cur_curlyx to point to this state, with any previous value saved in the
7761 state block.
7762
7763 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7764 since the pattern may possibly match zero times (i.e. it's a while {} loop
7765 rather than a do {} while loop).
7766
7767 Each entry to WHILEM represents a successful match of A. The count in the
7768 CURLYX block is incremented, another WHILEM state is pushed, and execution
7769 passes to A or B depending on greediness and the current count.
7770
7771 For example, if matching against the string a1a2a3b (where the aN are
7772 substrings that match /A/), then the match progresses as follows: (the
7773 pushed states are interspersed with the bits of strings matched so far):
7774
7775     <CURLYX cnt=-1>
7776     <CURLYX cnt=0><WHILEM>
7777     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7778     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7779     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7780     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7781
7782 (Contrast this with something like CURLYM, which maintains only a single
7783 backtrack state:
7784
7785     <CURLYM cnt=0> a1
7786     a1 <CURLYM cnt=1> a2
7787     a1 a2 <CURLYM cnt=2> a3
7788     a1 a2 a3 <CURLYM cnt=3> b
7789 )
7790
7791 Each WHILEM state block marks a point to backtrack to upon partial failure
7792 of A or B, and also contains some minor state data related to that
7793 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7794 overall state, such as the count, and pointers to the A and B ops.
7795
7796 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7797 must always point to the *current* CURLYX block, the rules are:
7798
7799 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7800 and set cur_curlyx to point the new block.
7801
7802 When popping the CURLYX block after a successful or unsuccessful match,
7803 restore the previous cur_curlyx.
7804
7805 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7806 to the outer one saved in the CURLYX block.
7807
7808 When popping the WHILEM block after a successful or unsuccessful B match,
7809 restore the previous cur_curlyx.
7810
7811 Here's an example for the pattern (AI* BI)*BO
7812 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7813
7814 cur_
7815 curlyx backtrack stack
7816 ------ ---------------
7817 NULL
7818 CO     <CO prev=NULL> <WO>
7819 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7820 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7821 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7822
7823 At this point the pattern succeeds, and we work back down the stack to
7824 clean up, restoring as we go:
7825
7826 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7827 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7828 CO     <CO prev=NULL> <WO>
7829 NULL
7830
7831 *******************************************************************/
7832
7833 #define ST st->u.curlyx
7834
7835         case CURLYX:    /* start of /A*B/  (for complex A) */
7836         {
7837             /* No need to save/restore up to this paren */
7838             I32 parenfloor = scan->flags;
7839
7840             assert(next); /* keep Coverity happy */
7841             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7842                 next += ARG(next);
7843
7844             /* XXXX Probably it is better to teach regpush to support
7845                parenfloor > maxopenparen ... */
7846             if (parenfloor > (I32)rex->lastparen)
7847                 parenfloor = rex->lastparen; /* Pessimization... */
7848
7849             ST.prev_curlyx= cur_curlyx;
7850             cur_curlyx = st;
7851             ST.cp = PL_savestack_ix;
7852
7853             /* these fields contain the state of the current curly.
7854              * they are accessed by subsequent WHILEMs */
7855             ST.parenfloor = parenfloor;
7856             ST.me = scan;
7857             ST.B = next;
7858             ST.minmod = minmod;
7859             minmod = 0;
7860             ST.count = -1;      /* this will be updated by WHILEM */
7861             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7862
7863             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
7864                                 script_run_begin);
7865             NOT_REACHED; /* NOTREACHED */
7866         }
7867
7868         case CURLYX_end: /* just finished matching all of A*B */
7869             cur_curlyx = ST.prev_curlyx;
7870             sayYES;
7871             NOT_REACHED; /* NOTREACHED */
7872
7873         case CURLYX_end_fail: /* just failed to match all of A*B */
7874             regcpblow(ST.cp);
7875             cur_curlyx = ST.prev_curlyx;
7876             sayNO;
7877             NOT_REACHED; /* NOTREACHED */
7878
7879
7880 #undef ST
7881 #define ST st->u.whilem
7882
7883         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7884         {
7885             /* see the discussion above about CURLYX/WHILEM */
7886             I32 n;
7887             int min, max;
7888             regnode *A;
7889
7890             assert(cur_curlyx); /* keep Coverity happy */
7891
7892             min = ARG1(cur_curlyx->u.curlyx.me);
7893             max = ARG2(cur_curlyx->u.curlyx.me);
7894             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7895             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7896             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7897             ST.cache_offset = 0;
7898             ST.cache_mask = 0;
7899
7900
7901             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7902                   depth, (long)n, min, max)
7903             );
7904
7905             /* First just match a string of min A's. */
7906
7907             if (n < min) {
7908                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7909                 cur_curlyx->u.curlyx.lastloc = locinput;
7910                 REGCP_SET(ST.lastcp);
7911
7912                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
7913                                 script_run_begin);
7914                 NOT_REACHED; /* NOTREACHED */
7915             }
7916
7917             /* If degenerate A matches "", assume A done. */
7918
7919             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7920                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7921                    depth)
7922                 );
7923                 goto do_whilem_B_max;
7924             }
7925
7926             /* super-linear cache processing.
7927              *
7928              * The idea here is that for certain types of CURLYX/WHILEM -
7929              * principally those whose upper bound is infinity (and
7930              * excluding regexes that have things like \1 and other very
7931              * non-regular expresssiony things), then if a pattern like
7932              * /....A*.../ fails and we backtrack to the WHILEM, then we
7933              * make a note that this particular WHILEM op was at string
7934              * position 47 (say) when the rest of pattern failed. Then, if
7935              * we ever find ourselves back at that WHILEM, and at string
7936              * position 47 again, we can just fail immediately rather than
7937              * running the rest of the pattern again.
7938              *
7939              * This is very handy when patterns start to go
7940              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7941              * with a combinatorial explosion of backtracking.
7942              *
7943              * The cache is implemented as a bit array, with one bit per
7944              * string byte position per WHILEM op (up to 16) - so its
7945              * between 0.25 and 2x the string size.
7946              *
7947              * To avoid allocating a poscache buffer every time, we do an
7948              * initially countdown; only after we have  executed a WHILEM
7949              * op (string-length x #WHILEMs) times do we allocate the
7950              * cache.
7951              *
7952              * The top 4 bits of scan->flags byte say how many different
7953              * relevant CURLLYX/WHILEM op pairs there are, while the
7954              * bottom 4-bits is the identifying index number of this
7955              * WHILEM.
7956              */
7957
7958             if (scan->flags) {
7959
7960                 if (!reginfo->poscache_maxiter) {
7961                     /* start the countdown: Postpone detection until we
7962                      * know the match is not *that* much linear. */
7963                     reginfo->poscache_maxiter
7964                         =    (reginfo->strend - reginfo->strbeg + 1)
7965                            * (scan->flags>>4);
7966                     /* possible overflow for long strings and many CURLYX's */
7967                     if (reginfo->poscache_maxiter < 0)
7968                         reginfo->poscache_maxiter = I32_MAX;
7969                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7970                 }
7971
7972                 if (reginfo->poscache_iter-- == 0) {
7973                     /* initialise cache */
7974                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7975                     regmatch_info_aux *const aux = reginfo->info_aux;
7976                     if (aux->poscache) {
7977                         if ((SSize_t)reginfo->poscache_size < size) {
7978                             Renew(aux->poscache, size, char);
7979                             reginfo->poscache_size = size;
7980                         }
7981                         Zero(aux->poscache, size, char);
7982                     }
7983                     else {
7984                         reginfo->poscache_size = size;
7985                         Newxz(aux->poscache, size, char);
7986                     }
7987                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7988       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
7989                               PL_colors[4], PL_colors[5])
7990                     );
7991                 }
7992
7993                 if (reginfo->poscache_iter < 0) {
7994                     /* have we already failed at this position? */
7995                     SSize_t offset, mask;
7996
7997                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7998                     offset  = (scan->flags & 0xf) - 1
7999                                 +   (locinput - reginfo->strbeg)
8000                                   * (scan->flags>>4);
8001                     mask    = 1 << (offset % 8);
8002                     offset /= 8;
8003                     if (reginfo->info_aux->poscache[offset] & mask) {
8004                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8005                             depth)
8006                         );
8007                         cur_curlyx->u.curlyx.count--;
8008                         sayNO; /* cache records failure */
8009                     }
8010                     ST.cache_offset = offset;
8011                     ST.cache_mask   = mask;
8012                 }
8013             }
8014
8015             /* Prefer B over A for minimal matching. */
8016
8017             if (cur_curlyx->u.curlyx.minmod) {
8018                 ST.save_curlyx = cur_curlyx;
8019                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8020                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8021                                     locinput, loceol, script_run_begin);
8022                 NOT_REACHED; /* NOTREACHED */
8023             }
8024
8025             /* Prefer A over B for maximal matching. */
8026
8027             if (n < max) { /* More greed allowed? */
8028                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8029                             maxopenparen);
8030                 cur_curlyx->u.curlyx.lastloc = locinput;
8031                 REGCP_SET(ST.lastcp);
8032                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8033                                 script_run_begin);
8034                 NOT_REACHED; /* NOTREACHED */
8035             }
8036             goto do_whilem_B_max;
8037         }
8038         NOT_REACHED; /* NOTREACHED */
8039
8040         case WHILEM_B_min: /* just matched B in a minimal match */
8041         case WHILEM_B_max: /* just matched B in a maximal match */
8042             cur_curlyx = ST.save_curlyx;
8043             sayYES;
8044             NOT_REACHED; /* NOTREACHED */
8045
8046         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8047             cur_curlyx = ST.save_curlyx;
8048             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8049             cur_curlyx->u.curlyx.count--;
8050             CACHEsayNO;
8051             NOT_REACHED; /* NOTREACHED */
8052
8053         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8054             /* FALLTHROUGH */
8055         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8056             REGCP_UNWIND(ST.lastcp);
8057             regcppop(rex, &maxopenparen);
8058             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8059             cur_curlyx->u.curlyx.count--;
8060             CACHEsayNO;
8061             NOT_REACHED; /* NOTREACHED */
8062
8063         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8064             REGCP_UNWIND(ST.lastcp);
8065             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8066             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8067                 depth)
8068             );
8069           do_whilem_B_max:
8070             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8071                 && ckWARN(WARN_REGEXP)
8072                 && !reginfo->warned)
8073             {
8074                 reginfo->warned = TRUE;
8075                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8076                      "Complex regular subexpression recursion limit (%d) "
8077                      "exceeded",
8078                      REG_INFTY - 1);
8079             }
8080
8081             /* now try B */
8082             ST.save_curlyx = cur_curlyx;
8083             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8084             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8085                                 locinput, loceol, script_run_begin);
8086             NOT_REACHED; /* NOTREACHED */
8087
8088         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8089             cur_curlyx = ST.save_curlyx;
8090
8091             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8092                 /* Maximum greed exceeded */
8093                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8094                     && ckWARN(WARN_REGEXP)
8095                     && !reginfo->warned)
8096                 {
8097                     reginfo->warned     = TRUE;
8098                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8099                         "Complex regular subexpression recursion "
8100                         "limit (%d) exceeded",
8101                         REG_INFTY - 1);
8102                 }
8103                 cur_curlyx->u.curlyx.count--;
8104                 CACHEsayNO;
8105             }
8106
8107             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8108             );
8109             /* Try grabbing another A and see if it helps. */
8110             cur_curlyx->u.curlyx.lastloc = locinput;
8111             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8112                             maxopenparen);
8113             REGCP_SET(ST.lastcp);
8114             PUSH_STATE_GOTO(WHILEM_A_min,
8115                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8116                 locinput, loceol, script_run_begin);
8117             NOT_REACHED; /* NOTREACHED */
8118
8119 #undef  ST
8120 #define ST st->u.branch
8121
8122         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8123             next = scan + ARG(scan);
8124             if (next == scan)
8125                 next = NULL;
8126             scan = NEXTOPER(scan);
8127             /* FALLTHROUGH */
8128
8129         case BRANCH:        /*  /(...|A|...)/ */
8130             scan = NEXTOPER(scan); /* scan now points to inner node */
8131             ST.lastparen = rex->lastparen;
8132             ST.lastcloseparen = rex->lastcloseparen;
8133             ST.next_branch = next;
8134             REGCP_SET(ST.cp);
8135
8136             /* Now go into the branch */
8137             if (has_cutgroup) {
8138                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8139                                     script_run_begin);
8140             } else {
8141                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8142                                 script_run_begin);
8143             }
8144             NOT_REACHED; /* NOTREACHED */
8145
8146         case CUTGROUP:  /*  /(*THEN)/  */
8147             sv_yes_mark = st->u.mark.mark_name = scan->flags
8148                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8149                 : NULL;
8150             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8151                             script_run_begin);
8152             NOT_REACHED; /* NOTREACHED */
8153
8154         case CUTGROUP_next_fail:
8155             do_cutgroup = 1;
8156             no_final = 1;
8157             if (st->u.mark.mark_name)
8158                 sv_commit = st->u.mark.mark_name;
8159             sayNO;
8160             NOT_REACHED; /* NOTREACHED */
8161
8162         case BRANCH_next:
8163             sayYES;
8164             NOT_REACHED; /* NOTREACHED */
8165
8166         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8167             if (do_cutgroup) {
8168                 do_cutgroup = 0;
8169                 no_final = 0;
8170             }
8171             REGCP_UNWIND(ST.cp);
8172             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8173             scan = ST.next_branch;
8174             /* no more branches? */
8175             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8176                 DEBUG_EXECUTE_r({
8177                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8178                         depth,
8179                         PL_colors[4],
8180                         PL_colors[5] );
8181                 });
8182                 sayNO_SILENT;
8183             }
8184             continue; /* execute next BRANCH[J] op */
8185             /* NOTREACHED */
8186
8187         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8188             minmod = 1;
8189             break;
8190
8191 #undef  ST
8192 #define ST st->u.curlym
8193
8194         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8195
8196             /* This is an optimisation of CURLYX that enables us to push
8197              * only a single backtracking state, no matter how many matches
8198              * there are in {m,n}. It relies on the pattern being constant
8199              * length, with no parens to influence future backrefs
8200              */
8201
8202             ST.me = scan;
8203             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8204
8205             ST.lastparen      = rex->lastparen;
8206             ST.lastcloseparen = rex->lastcloseparen;
8207
8208             /* if paren positive, emulate an OPEN/CLOSE around A */
8209             if (ST.me->flags) {
8210                 U32 paren = ST.me->flags;
8211                 if (paren > maxopenparen)
8212                     maxopenparen = paren;
8213                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8214             }
8215             ST.A = scan;
8216             ST.B = next;
8217             ST.alen = 0;
8218             ST.count = 0;
8219             ST.minmod = minmod;
8220             minmod = 0;
8221             ST.c1 = CHRTEST_UNINIT;
8222             REGCP_SET(ST.cp);
8223
8224             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8225                 goto curlym_do_B;
8226
8227           curlym_do_A: /* execute the A in /A{m,n}B/  */
8228             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8229                                 script_run_begin);
8230             NOT_REACHED; /* NOTREACHED */
8231
8232         case CURLYM_A: /* we've just matched an A */
8233             ST.count++;
8234             /* after first match, determine A's length: u.curlym.alen */
8235             if (ST.count == 1) {
8236                 if (reginfo->is_utf8_target) {
8237                     char *s = st->locinput;
8238                     while (s < locinput) {
8239                         ST.alen++;
8240                         s += UTF8SKIP(s);
8241                     }
8242                 }
8243                 else {
8244                     ST.alen = locinput - st->locinput;
8245                 }
8246                 if (ST.alen == 0)
8247                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8248             }
8249             DEBUG_EXECUTE_r(
8250                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8251                           depth, (IV) ST.count, (IV)ST.alen)
8252             );
8253
8254             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8255                 goto fake_end;
8256
8257             {
8258                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8259                 if ( max == REG_INFTY || ST.count < max )
8260                     goto curlym_do_A; /* try to match another A */
8261             }
8262             goto curlym_do_B; /* try to match B */
8263
8264         case CURLYM_A_fail: /* just failed to match an A */
8265             REGCP_UNWIND(ST.cp);
8266
8267
8268             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8269                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8270                 sayNO;
8271
8272           curlym_do_B: /* execute the B in /A{m,n}B/  */
8273             if (ST.c1 == CHRTEST_UNINIT) {
8274                 /* calculate c1 and c2 for possible match of 1st char
8275                  * following curly */
8276                 ST.c1 = ST.c2 = CHRTEST_VOID;
8277                 assert(ST.B);
8278                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8279                     regnode *text_node = ST.B;
8280                     if (! HAS_TEXT(text_node))
8281                         FIND_NEXT_IMPT(text_node);
8282                     if (PL_regkind[OP(text_node)] == EXACT) {
8283                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8284                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8285                            reginfo))
8286                         {
8287                             sayNO;
8288                         }
8289                     }
8290                 }
8291             }
8292
8293             DEBUG_EXECUTE_r(
8294                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8295                     depth, (IV)ST.count)
8296                 );
8297             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8298                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8299
8300                            /* (We can use memEQ and memNE in this file without
8301                             * having to worry about one being shorter than the
8302                             * other, since the first byte of each gives the
8303                             * length of the character) */
8304                     if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
8305                                                               reginfo->strend))
8306                         && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
8307                                                              reginfo->strend)))
8308                     {
8309                         /* simulate B failing */
8310                         DEBUG_OPTIMISE_r(
8311                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8312                                 depth,
8313                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8314                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8315                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8316                         );
8317                         state_num = CURLYM_B_fail;
8318                         goto reenter_switch;
8319                     }
8320                 }
8321                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8322                     /* simulate B failing */
8323                     DEBUG_OPTIMISE_r(
8324                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8325                             depth,
8326                             (int) nextchr, ST.c1, ST.c2)
8327                     );
8328                     state_num = CURLYM_B_fail;
8329                     goto reenter_switch;
8330                 }
8331             }
8332
8333             if (ST.me->flags) {
8334                 /* emulate CLOSE: mark current A as captured */
8335                 U32 paren = (U32)ST.me->flags;
8336                 if (ST.count) {
8337                     CLOSE_CAPTURE(paren,
8338                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
8339                         locinput - reginfo->strbeg);
8340                 }
8341                 else
8342                     rex->offs[paren].end = -1;
8343
8344                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8345                 {
8346                     if (ST.count)
8347                         goto fake_end;
8348                     else
8349                         sayNO;
8350                 }
8351             }
8352
8353             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
8354                             script_run_begin);
8355             NOT_REACHED; /* NOTREACHED */
8356
8357         case CURLYM_B_fail: /* just failed to match a B */
8358             REGCP_UNWIND(ST.cp);
8359             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8360             if (ST.minmod) {
8361                 I32 max = ARG2(ST.me);
8362                 if (max != REG_INFTY && ST.count == max)
8363                     sayNO;
8364                 goto curlym_do_A; /* try to match a further A */
8365             }
8366             /* backtrack one A */
8367             if (ST.count == ARG1(ST.me) /* min */)
8368                 sayNO;
8369             ST.count--;
8370             SET_locinput(HOPc(locinput, -ST.alen));
8371             goto curlym_do_B; /* try to match B */
8372
8373 #undef ST
8374 #define ST st->u.curly
8375
8376 #define CURLY_SETPAREN(paren, success) \
8377     if (paren) { \
8378         if (success) { \
8379             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8380                                  locinput - reginfo->strbeg); \
8381         } \
8382         else { \
8383             rex->offs[paren].end = -1; \
8384             rex->lastparen      = ST.lastparen; \
8385             rex->lastcloseparen = ST.lastcloseparen; \
8386         } \
8387     }
8388
8389         case STAR:              /*  /A*B/ where A is width 1 char */
8390             ST.paren = 0;
8391             ST.min = 0;
8392             ST.max = REG_INFTY;
8393             scan = NEXTOPER(scan);
8394             goto repeat;
8395
8396         case PLUS:              /*  /A+B/ where A is width 1 char */
8397             ST.paren = 0;
8398             ST.min = 1;
8399             ST.max = REG_INFTY;
8400             scan = NEXTOPER(scan);
8401             goto repeat;
8402
8403         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8404             ST.paren = scan->flags;     /* Which paren to set */
8405             ST.lastparen      = rex->lastparen;
8406             ST.lastcloseparen = rex->lastcloseparen;
8407             if (ST.paren > maxopenparen)
8408                 maxopenparen = ST.paren;
8409             ST.min = ARG1(scan);  /* min to match */
8410             ST.max = ARG2(scan);  /* max to match */
8411             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8412
8413             /* handle the single-char capture called as a GOSUB etc */
8414             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8415             {
8416                 char *li = locinput;
8417                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
8418                     sayNO;
8419                 SET_locinput(li);
8420                 goto fake_end;
8421             }
8422
8423             goto repeat;
8424
8425         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8426             ST.paren = 0;
8427             ST.min = ARG1(scan);  /* min to match */
8428             ST.max = ARG2(scan);  /* max to match */
8429             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8430           repeat:
8431             /*
8432             * Lookahead to avoid useless match attempts
8433             * when we know what character comes next.
8434             *
8435             * Used to only do .*x and .*?x, but now it allows
8436             * for )'s, ('s and (?{ ... })'s to be in the way
8437             * of the quantifier and the EXACT-like node.  -- japhy
8438             */
8439
8440             assert(ST.min <= ST.max);
8441             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8442                 ST.c1 = ST.c2 = CHRTEST_VOID;
8443             }
8444             else {
8445                 regnode *text_node = next;
8446
8447                 if (! HAS_TEXT(text_node))
8448                     FIND_NEXT_IMPT(text_node);
8449
8450                 if (! HAS_TEXT(text_node))
8451                     ST.c1 = ST.c2 = CHRTEST_VOID;
8452                 else {
8453                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8454                         ST.c1 = ST.c2 = CHRTEST_VOID;
8455                     }
8456                     else {
8457                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8458                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8459                            reginfo))
8460                         {
8461                             sayNO;
8462                         }
8463                     }
8464                 }
8465             }
8466
8467             ST.A = scan;
8468             ST.B = next;
8469             if (minmod) {
8470                 char *li = locinput;
8471                 minmod = 0;
8472                 if (ST.min &&
8473                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
8474                             < ST.min)
8475                     sayNO;
8476                 SET_locinput(li);
8477                 ST.count = ST.min;
8478                 REGCP_SET(ST.cp);
8479                 if (ST.c1 == CHRTEST_VOID)
8480                     goto curly_try_B_min;
8481
8482                 ST.oldloc = locinput;
8483
8484                 /* set ST.maxpos to the furthest point along the
8485                  * string that could possibly match */
8486                 if  (ST.max == REG_INFTY) {
8487                     ST.maxpos = loceol - 1;
8488                     if (utf8_target)
8489                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8490                             ST.maxpos--;
8491                 }
8492                 else if (utf8_target) {
8493                     int m = ST.max - ST.min;
8494                     for (ST.maxpos = locinput;
8495                          m >0 && ST.maxpos <  loceol; m--)
8496                         ST.maxpos += UTF8SKIP(ST.maxpos);
8497                 }
8498                 else {
8499                     ST.maxpos = locinput + ST.max - ST.min;
8500                     if (ST.maxpos >=  loceol)
8501                         ST.maxpos =  loceol - 1;
8502                 }
8503                 goto curly_try_B_min_known;
8504
8505             }
8506             else {
8507                 /* avoid taking address of locinput, so it can remain
8508                  * a register var */
8509                 char *li = locinput;
8510                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
8511                 if (ST.count < ST.min)
8512                     sayNO;
8513                 SET_locinput(li);
8514                 if ((ST.count > ST.min)
8515                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8516                 {
8517                     /* A{m,n} must come at the end of the string, there's
8518                      * no point in backing off ... */
8519                     ST.min = ST.count;
8520                     /* ...except that $ and \Z can match before *and* after
8521                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8522                        We may back off by one in this case. */
8523                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8524                         ST.min--;
8525                 }
8526                 REGCP_SET(ST.cp);
8527                 goto curly_try_B_max;
8528             }
8529             NOT_REACHED; /* NOTREACHED */
8530
8531         case CURLY_B_min_fail:
8532             /* failed to find B in a non-greedy match.
8533              * Handles both cases where c1,c2 valid or not */
8534
8535             REGCP_UNWIND(ST.cp);
8536             if (ST.paren) {
8537                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8538             }
8539
8540             if (ST.c1 == CHRTEST_VOID) {
8541                 /* failed -- move forward one */
8542                 char *li = locinput;
8543                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
8544                     sayNO;
8545                 }
8546                 locinput = li;
8547                 ST.count++;
8548                 if (!(   ST.count <= ST.max
8549                         /* count overflow ? */
8550                      || (ST.max == REG_INFTY && ST.count > 0))
8551                 )
8552                     sayNO;
8553             }
8554             else {
8555                 int n;
8556                 /* Couldn't or didn't -- move forward. */
8557                 ST.oldloc = locinput;
8558                 if (utf8_target)
8559                     locinput += UTF8SKIP(locinput);
8560                 else
8561                     locinput++;
8562                 ST.count++;
8563
8564               curly_try_B_min_known:
8565                 /* find the next place where 'B' could work, then call B */
8566                 if (utf8_target) {
8567                     n = (ST.oldloc == locinput) ? 0 : 1;
8568                     if (ST.c1 == ST.c2) {
8569                         /* set n to utf8_distance(oldloc, locinput) */
8570                         while (    locinput <= ST.maxpos
8571                                &&  locinput < loceol
8572                                &&  memNE(locinput, ST.c1_utf8,
8573                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8574                         {
8575                             locinput += UTF8_SAFE_SKIP(locinput,
8576                                                        reginfo->strend);
8577                             n++;
8578                         }
8579                     }
8580                     else {
8581                         /* set n to utf8_distance(oldloc, locinput) */
8582                         while (   locinput <= ST.maxpos
8583                                && locinput < loceol
8584                                && memNE(locinput, ST.c1_utf8,
8585                                      UTF8_SAFE_SKIP(locinput, reginfo->strend))
8586                                && memNE(locinput, ST.c2_utf8,
8587                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8588                         {
8589                             locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
8590                             n++;
8591                         }
8592                     }
8593                 }
8594                 else {  /* Not utf8_target */
8595                     if (ST.c1 == ST.c2) {
8596                         locinput = (char *) memchr(locinput,
8597                                                    ST.c1,
8598                                                    ST.maxpos + 1 - locinput);
8599                         if (! locinput) {
8600                             locinput = ST.maxpos + 1;
8601                         }
8602                     }
8603                     else {
8604                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8605
8606                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8607                             while (   locinput <= ST.maxpos
8608                                    && UCHARAT(locinput) != ST.c1
8609                                    && UCHARAT(locinput) != ST.c2)
8610                             {
8611                                 locinput++;
8612                             }
8613                         }
8614                         else {
8615                             /* If c1 and c2 only differ by a single bit, we can
8616                              * avoid a conditional each time through the loop,
8617                              * at the expense of a little preliminary setup and
8618                              * an extra mask each iteration.  By masking out
8619                              * that bit, we match exactly two characters, c1
8620                              * and c2, and so we don't have to test for both.
8621                              * On both ASCII and EBCDIC platforms, most of the
8622                              * ASCII-range and Latin1-range folded equivalents
8623                              * differ only in a single bit, so this is actually
8624                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8625                              * 0x61). */
8626                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8627                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8628                             while (   locinput <= ST.maxpos
8629                                    && (UCHARAT(locinput) & c1_c2_mask)
8630                                                                 != c1_masked)
8631                             {
8632                                 locinput++;
8633                             }
8634                         }
8635                     }
8636                     n = locinput - ST.oldloc;
8637                 }
8638                 if (locinput > ST.maxpos)
8639                     sayNO;
8640                 if (n) {
8641                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8642                      * at b; check that everything between oldloc and
8643                      * locinput matches */
8644                     char *li = ST.oldloc;
8645                     ST.count += n;
8646                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
8647                         sayNO;
8648                     assert(n == REG_INFTY || locinput == li);
8649                 }
8650             }
8651
8652           curly_try_B_min:
8653             CURLY_SETPAREN(ST.paren, ST.count);
8654             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
8655                             script_run_begin);
8656             NOT_REACHED; /* NOTREACHED */
8657
8658
8659           curly_try_B_max:
8660             /* a successful greedy match: now try to match B */
8661             {
8662                 bool could_match = locinput <  loceol;
8663
8664                 /* If it could work, try it. */
8665                 if (ST.c1 != CHRTEST_VOID && could_match) {
8666                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8667                     {
8668                         could_match =  memEQ(locinput, ST.c1_utf8,
8669                                              UTF8_SAFE_SKIP(locinput,
8670                                                             reginfo->strend))
8671                                     || memEQ(locinput, ST.c2_utf8,
8672                                              UTF8_SAFE_SKIP(locinput,
8673                                                             reginfo->strend));
8674                     }
8675                     else {
8676                         could_match =   UCHARAT(locinput) == ST.c1
8677                                      || UCHARAT(locinput) == ST.c2;
8678                     }
8679                 }
8680                 if (ST.c1 == CHRTEST_VOID || could_match) {
8681                     CURLY_SETPAREN(ST.paren, ST.count);
8682                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
8683                                     script_run_begin);
8684                     NOT_REACHED; /* NOTREACHED */
8685                 }
8686             }
8687             /* FALLTHROUGH */
8688
8689         case CURLY_B_max_fail:
8690             /* failed to find B in a greedy match */
8691
8692             REGCP_UNWIND(ST.cp);
8693             if (ST.paren) {
8694                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8695             }
8696             /*  back up. */
8697             if (--ST.count < ST.min)
8698                 sayNO;
8699             locinput = HOPc(locinput, -1);
8700             goto curly_try_B_max;
8701
8702 #undef ST
8703
8704         case END: /*  last op of main pattern  */
8705           fake_end:
8706             if (cur_eval) {
8707                 /* we've just finished A in /(??{A})B/; now continue with B */
8708                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8709                 st->u.eval.prev_rex = rex_sv;           /* inner */
8710
8711                 /* Save *all* the positions. */
8712                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8713                 rex_sv = CUR_EVAL.prev_rex;
8714                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8715                 SET_reg_curpm(rex_sv);
8716                 rex = ReANY(rex_sv);
8717                 rexi = RXi_GET(rex);
8718
8719                 st->u.eval.prev_curlyx = cur_curlyx;
8720                 cur_curlyx = CUR_EVAL.prev_curlyx;
8721
8722                 REGCP_SET(st->u.eval.lastcp);
8723
8724                 /* Restore parens of the outer rex without popping the
8725                  * savestack */
8726                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8727
8728                 st->u.eval.prev_eval = cur_eval;
8729                 cur_eval = CUR_EVAL.prev_eval;
8730                 DEBUG_EXECUTE_r(
8731                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8732                                       depth, cur_eval););
8733                 if ( nochange_depth )
8734                     nochange_depth--;
8735
8736                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8737
8738                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
8739                                     st->u.eval.prev_eval->u.eval.B,
8740                                     locinput, loceol, script_run_begin);
8741             }
8742
8743             if (locinput < reginfo->till) {
8744                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8745                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8746                                       PL_colors[4],
8747                                       (long)(locinput - startpos),
8748                                       (long)(reginfo->till - startpos),
8749                                       PL_colors[5]));
8750
8751                 sayNO_SILENT;           /* Cannot match: too short. */
8752             }
8753             sayYES;                     /* Success! */
8754
8755         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8756             DEBUG_EXECUTE_r(
8757             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8758                 depth, PL_colors[4], PL_colors[5]));
8759             sayYES;                     /* Success! */
8760
8761 #undef  ST
8762 #define ST st->u.ifmatch
8763
8764         case SUSPEND:   /* (?>A) */
8765             ST.wanted = 1;
8766             ST.start = locinput;
8767             ST.end = loceol;
8768             ST.count = 1;
8769             goto do_ifmatch;
8770
8771         case UNLESSM:   /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
8772             ST.wanted = 0;
8773             goto ifmatch_trivial_fail_test;
8774
8775         case IFMATCH:   /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
8776             ST.wanted = 1;
8777           ifmatch_trivial_fail_test:
8778             ST.count = scan->next_off + 1; /* next_off repurposed to be
8779                                               lookbehind count, requires
8780                                               non-zero flags */
8781             if (! scan->flags) {    /* 'flags' zero means lookahed */
8782
8783                 /* Lookahead starts here and ends at the normal place */
8784                 ST.start = locinput;
8785                 ST.end = loceol;
8786             }
8787             else {
8788                 PERL_UINT_FAST8_T back_count = scan->flags;
8789                 char * s;
8790
8791                 /* Lookbehind can look beyond the current position */
8792                 ST.end = loceol;
8793
8794                 /* ... and starts at the first place in the input that is in
8795                  * the range of the possible start positions */
8796                 for (; ST.count > 0; ST.count--, back_count--) {
8797                     s = HOPBACKc(locinput, back_count);
8798                     if (s) {
8799                         ST.start = s;
8800                         goto do_ifmatch;
8801                     }
8802                 }
8803
8804                 /* If the lookbehind doesn't start in the actual string, is a
8805                  * trivial match failure */
8806                 if (logical) {
8807                     logical = 0;
8808                     sw = 1 - cBOOL(ST.wanted);
8809                 }
8810                 else if (ST.wanted)
8811                     sayNO;
8812
8813                 /* Here, we didn't want it to match, so is actually success */
8814                 next = scan + ARG(scan);
8815                 if (next == scan)
8816                     next = NULL;
8817                 break;
8818             }
8819
8820           do_ifmatch:
8821             ST.me = scan;
8822             ST.logical = logical;
8823             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8824
8825             /* execute body of (?...A) */
8826             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
8827                                 ST.end, script_run_begin);
8828             NOT_REACHED; /* NOTREACHED */
8829
8830         {
8831             bool matched;
8832
8833         case IFMATCH_A_fail: /* body of (?...A) failed */
8834             if (! ST.logical && ST.count > 1) {
8835
8836                 /* It isn't a real failure until we've tried all starting
8837                  * positions.  Move to the next starting position and retry */
8838                 ST.count--;
8839                 ST.start = HOPc(ST.start, 1);
8840                 scan = ST.me;
8841                 logical = ST.logical;
8842                 goto do_ifmatch;
8843             }
8844
8845             /* Here, all starting positions have been tried. */
8846             matched = FALSE;
8847             goto ifmatch_done;
8848
8849         case IFMATCH_A: /* body of (?...A) succeeded */
8850             matched = TRUE;
8851           ifmatch_done:
8852             sw = matched == ST.wanted;
8853             if (! ST.logical && !sw) {
8854                 sayNO;
8855             }
8856
8857             if (OP(ST.me) != SUSPEND) {
8858                 /* restore old position except for (?>...) */
8859                 locinput = st->locinput;
8860                 loceol = st->loceol;
8861                 script_run_begin = st->sr0;
8862             }
8863             scan = ST.me + ARG(ST.me);
8864             if (scan == ST.me)
8865                 scan = NULL;
8866             continue; /* execute B */
8867         }
8868
8869 #undef ST
8870
8871         case LONGJMP: /*  alternative with many branches compiles to
8872                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8873             next = scan + ARG(scan);
8874             if (next == scan)
8875                 next = NULL;
8876             break;
8877
8878         case COMMIT:  /*  (*COMMIT)  */
8879             reginfo->cutpoint = loceol;
8880             /* FALLTHROUGH */
8881
8882         case PRUNE:   /*  (*PRUNE)   */
8883             if (scan->flags)
8884                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8885             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
8886                             script_run_begin);
8887             NOT_REACHED; /* NOTREACHED */
8888
8889         case COMMIT_next_fail:
8890             no_final = 1;
8891             /* FALLTHROUGH */
8892             sayNO;
8893             NOT_REACHED; /* NOTREACHED */
8894
8895         case OPFAIL:   /* (*FAIL)  */
8896             if (scan->flags)
8897                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8898             if (logical) {
8899                 /* deal with (?(?!)X|Y) properly,
8900                  * make sure we trigger the no branch
8901                  * of the trailing IFTHEN structure*/
8902                 sw= 0;
8903                 break;
8904             } else {
8905                 sayNO;
8906             }
8907             NOT_REACHED; /* NOTREACHED */
8908
8909 #define ST st->u.mark
8910         case MARKPOINT: /*  (*MARK:foo)  */
8911             ST.prev_mark = mark_state;
8912             ST.mark_name = sv_commit = sv_yes_mark
8913                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8914             mark_state = st;
8915             ST.mark_loc = locinput;
8916             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
8917                                 script_run_begin);
8918             NOT_REACHED; /* NOTREACHED */
8919
8920         case MARKPOINT_next:
8921             mark_state = ST.prev_mark;
8922             sayYES;
8923             NOT_REACHED; /* NOTREACHED */
8924
8925         case MARKPOINT_next_fail:
8926             if (popmark && sv_eq(ST.mark_name,popmark))
8927             {
8928                 if (ST.mark_loc > startpoint)
8929                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8930                 popmark = NULL; /* we found our mark */
8931                 sv_commit = ST.mark_name;
8932
8933                 DEBUG_EXECUTE_r({
8934                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8935                             depth,
8936                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8937                 });
8938             }
8939             mark_state = ST.prev_mark;
8940             sv_yes_mark = mark_state ?
8941                 mark_state->u.mark.mark_name : NULL;
8942             sayNO;
8943             NOT_REACHED; /* NOTREACHED */
8944
8945         case SKIP:  /*  (*SKIP)  */
8946             if (!scan->flags) {
8947                 /* (*SKIP) : if we fail we cut here*/
8948                 ST.mark_name = NULL;
8949                 ST.mark_loc = locinput;
8950                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
8951                                 script_run_begin);
8952             } else {
8953                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8954                    otherwise do nothing.  Meaning we need to scan
8955                  */
8956                 regmatch_state *cur = mark_state;
8957                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8958
8959                 while (cur) {
8960                     if ( sv_eq( cur->u.mark.mark_name,
8961                                 find ) )
8962                     {
8963                         ST.mark_name = find;
8964                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
8965                                          script_run_begin);
8966                     }
8967                     cur = cur->u.mark.prev_mark;
8968                 }
8969             }
8970             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8971             break;
8972
8973         case SKIP_next_fail:
8974             if (ST.mark_name) {
8975                 /* (*CUT:NAME) - Set up to search for the name as we
8976                    collapse the stack*/
8977                 popmark = ST.mark_name;
8978             } else {
8979                 /* (*CUT) - No name, we cut here.*/
8980                 if (ST.mark_loc > startpoint)
8981                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8982                 /* but we set sv_commit to latest mark_name if there
8983                    is one so they can test to see how things lead to this
8984                    cut */
8985                 if (mark_state)
8986                     sv_commit=mark_state->u.mark.mark_name;
8987             }
8988             no_final = 1;
8989             sayNO;
8990             NOT_REACHED; /* NOTREACHED */
8991 #undef ST
8992
8993         case LNBREAK: /* \R */
8994             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
8995                 locinput += n;
8996             } else
8997                 sayNO;
8998             break;
8999
9000         default:
9001             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9002                           PTR2UV(scan), OP(scan));
9003             Perl_croak(aTHX_ "regexp memory corruption");
9004
9005         /* this is a point to jump to in order to increment
9006          * locinput by one character */
9007           increment_locinput:
9008             assert(!NEXTCHR_IS_EOS);
9009             if (utf8_target) {
9010                 locinput += PL_utf8skip[nextchr];
9011                 /* locinput is allowed to go 1 char off the end (signifying
9012                  * EOS), but not 2+ */
9013                 if (locinput >  loceol)
9014                     sayNO;
9015             }
9016             else
9017                 locinput++;
9018             break;
9019
9020         } /* end switch */
9021
9022         /* switch break jumps here */
9023         scan = next; /* prepare to execute the next op and ... */
9024         continue;    /* ... jump back to the top, reusing st */
9025         /* NOTREACHED */
9026
9027       push_yes_state:
9028         /* push a state that backtracks on success */
9029         st->u.yes.prev_yes_state = yes_state;
9030         yes_state = st;
9031         /* FALLTHROUGH */
9032       push_state:
9033         /* push a new regex state, then continue at scan  */
9034         {
9035             regmatch_state *newst;
9036
9037             DEBUG_STACK_r({
9038                 regmatch_state *cur = st;
9039                 regmatch_state *curyes = yes_state;
9040                 U32 i;
9041                 regmatch_slab *slab = PL_regmatch_slab;
9042                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9043                     if (cur < SLAB_FIRST(slab)) {
9044                         slab = slab->prev;
9045                         cur = SLAB_LAST(slab);
9046                     }
9047                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9048                         depth,
9049                         i ? "    " : "push",
9050                         depth - i, PL_reg_name[cur->resume_state],
9051                         (curyes == cur) ? "yes" : ""
9052                     );
9053                     if (curyes == cur)
9054                         curyes = cur->u.yes.prev_yes_state;
9055                 }
9056             } else
9057                 DEBUG_STATE_pp("push")
9058             );
9059             depth++;
9060             st->locinput = locinput;
9061             st->loceol = loceol;
9062             st->sr0 = script_run_begin;
9063             newst = st+1;
9064             if (newst >  SLAB_LAST(PL_regmatch_slab))
9065                 newst = S_push_slab(aTHX);
9066             PL_regmatch_state = newst;
9067
9068             locinput = pushinput;
9069             loceol = pusheol;
9070             script_run_begin = pushsr0;
9071             st = newst;
9072             continue;
9073             /* NOTREACHED */
9074         }
9075     }
9076 #ifdef SOLARIS_BAD_OPTIMIZER
9077 #  undef PL_charclass
9078 #endif
9079
9080     /*
9081     * We get here only if there's trouble -- normally "case END" is
9082     * the terminating point.
9083     */
9084     Perl_croak(aTHX_ "corrupted regexp pointers");
9085     NOT_REACHED; /* NOTREACHED */
9086
9087   yes:
9088     if (yes_state) {
9089         /* we have successfully completed a subexpression, but we must now
9090          * pop to the state marked by yes_state and continue from there */
9091         assert(st != yes_state);
9092 #ifdef DEBUGGING
9093         while (st != yes_state) {
9094             st--;
9095             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9096                 PL_regmatch_slab = PL_regmatch_slab->prev;
9097                 st = SLAB_LAST(PL_regmatch_slab);
9098             }
9099             DEBUG_STATE_r({
9100                 if (no_final) {
9101                     DEBUG_STATE_pp("pop (no final)");
9102                 } else {
9103                     DEBUG_STATE_pp("pop (yes)");
9104                 }
9105             });
9106             depth--;
9107         }
9108 #else
9109         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9110             || yes_state > SLAB_LAST(PL_regmatch_slab))
9111         {
9112             /* not in this slab, pop slab */
9113             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9114             PL_regmatch_slab = PL_regmatch_slab->prev;
9115             st = SLAB_LAST(PL_regmatch_slab);
9116         }
9117         depth -= (st - yes_state);
9118 #endif
9119         st = yes_state;
9120         yes_state = st->u.yes.prev_yes_state;
9121         PL_regmatch_state = st;
9122
9123         if (no_final) {
9124             locinput= st->locinput;
9125             loceol= st->loceol;
9126             script_run_begin = st->sr0;
9127         }
9128         state_num = st->resume_state + no_final;
9129         goto reenter_switch;
9130     }
9131
9132     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9133                           PL_colors[4], PL_colors[5]));
9134
9135     if (reginfo->info_aux_eval) {
9136         /* each successfully executed (?{...}) block does the equivalent of
9137          *   local $^R = do {...}
9138          * When popping the save stack, all these locals would be undone;
9139          * bypass this by setting the outermost saved $^R to the latest
9140          * value */
9141         /* I dont know if this is needed or works properly now.
9142          * see code related to PL_replgv elsewhere in this file.
9143          * Yves
9144          */
9145         if (oreplsv != GvSV(PL_replgv)) {
9146             sv_setsv(oreplsv, GvSV(PL_replgv));
9147             SvSETMAGIC(oreplsv);
9148         }
9149     }
9150     result = 1;
9151     goto final_exit;
9152
9153   no:
9154     DEBUG_EXECUTE_r(
9155         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9156             depth,
9157             PL_colors[4], PL_colors[5])
9158         );
9159
9160   no_silent:
9161     if (no_final) {
9162         if (yes_state) {
9163             goto yes;
9164         } else {
9165             goto final_exit;
9166         }
9167     }
9168     if (depth) {
9169         /* there's a previous state to backtrack to */
9170         st--;
9171         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9172             PL_regmatch_slab = PL_regmatch_slab->prev;
9173             st = SLAB_LAST(PL_regmatch_slab);
9174         }
9175         PL_regmatch_state = st;
9176         locinput= st->locinput;
9177         loceol= st->loceol;
9178         script_run_begin = st->sr0;
9179
9180         DEBUG_STATE_pp("pop");
9181         depth--;
9182         if (yes_state == st)
9183             yes_state = st->u.yes.prev_yes_state;
9184
9185         state_num = st->resume_state + 1; /* failure = success + 1 */
9186         PERL_ASYNC_CHECK();
9187         goto reenter_switch;
9188     }
9189     result = 0;
9190
9191   final_exit:
9192     if (rex->intflags & PREGf_VERBARG_SEEN) {
9193         SV *sv_err = get_sv("REGERROR", 1);
9194         SV *sv_mrk = get_sv("REGMARK", 1);
9195         if (result) {
9196             sv_commit = &PL_sv_no;
9197             if (!sv_yes_mark)
9198                 sv_yes_mark = &PL_sv_yes;
9199         } else {
9200             if (!sv_commit)
9201                 sv_commit = &PL_sv_yes;
9202             sv_yes_mark = &PL_sv_no;
9203         }
9204         assert(sv_err);
9205         assert(sv_mrk);
9206         sv_setsv(sv_err, sv_commit);
9207         sv_setsv(sv_mrk, sv_yes_mark);
9208     }
9209
9210
9211     if (last_pushed_cv) {
9212         dSP;
9213         /* see "Some notes about MULTICALL" above */
9214         POP_MULTICALL;
9215         PERL_UNUSED_VAR(SP);
9216     }
9217     else
9218         LEAVE_SCOPE(orig_savestack_ix);
9219
9220     assert(!result ||  locinput - reginfo->strbeg >= 0);
9221     return result ?  locinput - reginfo->strbeg : -1;
9222 }
9223
9224 /*
9225  - regrepeat - repeatedly match something simple, report how many
9226  *
9227  * What 'simple' means is a node which can be the operand of a quantifier like
9228  * '+', or {1,3}
9229  *
9230  * startposp - pointer to a pointer to the start position.  This is updated
9231  *             to point to the byte following the highest successful
9232  *             match.
9233  * p         - the regnode to be repeatedly matched against.
9234  * loceol    - pointer to the end position beyond which we aren't supposed to
9235  *             look.
9236  * reginfo   - struct holding match state, such as utf8_target
9237  * max       - maximum number of things to match.
9238  * depth     - (for debugging) backtracking depth.
9239  */
9240 STATIC I32
9241 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9242             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9243 {
9244     dVAR;
9245     char *scan;     /* Pointer to current position in target string */
9246     I32 c;
9247     char *this_eol = loceol;   /* potentially adjusted version. */
9248     I32 hardcount = 0;  /* How many matches so far */
9249     bool utf8_target = reginfo->is_utf8_target;
9250     unsigned int to_complement = 0;  /* Invert the result? */
9251     UV utf8_flags = 0;
9252     _char_class_number classnum;
9253
9254     PERL_ARGS_ASSERT_REGREPEAT;
9255
9256     /* This routine is structured so that we switch on the input OP.  Each OP
9257      * case: statement contains a loop to repeatedly apply the OP, advancing
9258      * the input until it fails, or reaches the end of the input, or until it
9259      * reaches the upper limit of matches. */
9260
9261     scan = *startposp;
9262     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9263                                max */
9264         max = I32_MAX;
9265     else if (! utf8_target && this_eol - scan > max)
9266         this_eol = scan + max;
9267
9268     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9269      * to the maximum of how far we should go in it (leaving it set to the real
9270      * end, if the maximum permissible would take us beyond that).  This allows
9271      * us to make the loop exit condition that we haven't gone past <this_eol> to
9272      * also mean that we haven't exceeded the max permissible count, saving a
9273      * test each time through the loops.  But it assumes that the OP matches a
9274      * single byte, which is true for most of the OPs below when applied to a
9275      * non-UTF-8 target.  Those relatively few OPs that don't have this
9276      * characteristic will have to compensate.
9277      *
9278      * There is no adjustment for UTF-8 targets, as the number of bytes per
9279      * character varies.  OPs will have to test both that the count is less
9280      * than the max permissible (using <hardcount> to keep track), and that we
9281      * are still within the bounds of the string (using <this_eol>.  A few OPs
9282      * match a single byte no matter what the encoding.  They can omit the max
9283      * test if, for the UTF-8 case, they do the adjustment that was skipped
9284      * above.
9285      *
9286      * Thus, the code above sets things up for the common case; and exceptional
9287      * cases need extra work; the common case is to make sure <scan> doesn't
9288      * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9289      * count doesn't exceed the maximum permissible */
9290
9291     switch (OP(p)) {
9292     case REG_ANY:
9293         if (utf8_target) {
9294             while (scan < this_eol && hardcount < max && *scan != '\n') {
9295                 scan += UTF8SKIP(scan);
9296                 hardcount++;
9297             }
9298         } else {
9299             scan = (char *) memchr(scan, '\n', this_eol - scan);
9300             if (! scan) {
9301                 scan = this_eol;
9302             }
9303         }
9304         break;
9305     case SANY:
9306         if (utf8_target) {
9307             while (scan < this_eol && hardcount < max) {
9308                 scan += UTF8SKIP(scan);
9309                 hardcount++;
9310             }
9311         }
9312         else
9313             scan = this_eol;
9314         break;
9315     case EXACTL:
9316         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9317         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9318             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9319         }
9320         goto do_exact;
9321
9322     case EXACT_ONLY8:
9323         if (! utf8_target) {
9324             break;
9325         }
9326         /* FALLTHROUGH */
9327     case EXACT:
9328       do_exact:
9329         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9330
9331         c = (U8)*STRING(p);
9332
9333         /* Can use a simple find if the pattern char to match on is invariant
9334          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9335          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9336          * true iff it doesn't matter if the argument is in UTF-8 or not */
9337         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9338             if (utf8_target && this_eol - scan > max) {
9339                 /* We didn't adjust <this_eol> because is UTF-8, but ok to do so,
9340                  * since here, to match at all, 1 char == 1 byte */
9341                 this_eol = scan + max;
9342             }
9343             scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9344         }
9345         else if (reginfo->is_utf8_pat) {
9346             if (utf8_target) {
9347                 STRLEN scan_char_len;
9348
9349                 /* When both target and pattern are UTF-8, we have to do
9350                  * string EQ */
9351                 while (hardcount < max
9352                        && scan < this_eol
9353                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9354                        && memEQ(scan, STRING(p), scan_char_len))
9355                 {
9356                     scan += scan_char_len;
9357                     hardcount++;
9358                 }
9359             }
9360             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9361
9362                 /* Target isn't utf8; convert the character in the UTF-8
9363                  * pattern to non-UTF8, and do a simple find */
9364                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9365                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9366             } /* else pattern char is above Latin1, can't possibly match the
9367                  non-UTF-8 target */
9368         }
9369         else {
9370
9371             /* Here, the string must be utf8; pattern isn't, and <c> is
9372              * different in utf8 than not, so can't compare them directly.
9373              * Outside the loop, find the two utf8 bytes that represent c, and
9374              * then look for those in sequence in the utf8 string */
9375             U8 high = UTF8_TWO_BYTE_HI(c);
9376             U8 low = UTF8_TWO_BYTE_LO(c);
9377
9378             while (hardcount < max
9379                     && scan + 1 < this_eol
9380                     && UCHARAT(scan) == high
9381                     && UCHARAT(scan + 1) == low)
9382             {
9383                 scan += 2;
9384                 hardcount++;
9385             }
9386         }
9387         break;
9388
9389     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9390         assert(! reginfo->is_utf8_pat);
9391         /* FALLTHROUGH */
9392     case EXACTFAA:
9393         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9394         if (reginfo->is_utf8_pat || ! utf8_target) {
9395
9396             /* The possible presence of a MICRO SIGN in the pattern forbids us
9397              * to view a non-UTF-8 pattern as folded when there is a UTF-8
9398              * target.  */
9399             utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
9400         }
9401         goto do_exactf;
9402
9403     case EXACTFL:
9404         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9405         utf8_flags = FOLDEQ_LOCALE;
9406         goto do_exactf;
9407
9408     case EXACTF:   /* This node only generated for non-utf8 patterns */
9409         assert(! reginfo->is_utf8_pat);
9410         goto do_exactf;
9411
9412     case EXACTFLU8:
9413         if (! utf8_target) {
9414             break;
9415         }
9416         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9417                                     | FOLDEQ_S2_FOLDS_SANE;
9418         goto do_exactf;
9419
9420     case EXACTFU_ONLY8:
9421         if (! utf8_target) {
9422             break;
9423         }
9424         assert(reginfo->is_utf8_pat);
9425         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9426         goto do_exactf;
9427
9428     case EXACTFU:
9429         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9430         /* FALLTHROUGH */
9431
9432     case EXACTFUP:
9433
9434       do_exactf: {
9435         int c1, c2;
9436         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9437
9438         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9439
9440         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9441                                         reginfo))
9442         {
9443             if (c1 == CHRTEST_VOID) {
9444                 /* Use full Unicode fold matching */
9445                 char *tmpeol = loceol;
9446                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9447                 while (hardcount < max
9448                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9449                                              STRING(p), NULL, pat_len,
9450                                              reginfo->is_utf8_pat, utf8_flags))
9451                 {
9452                     scan = tmpeol;
9453                     tmpeol = loceol;
9454                     hardcount++;
9455                 }
9456             }
9457             else if (utf8_target) {
9458                 if (c1 == c2) {
9459                     while (scan < this_eol
9460                            && hardcount < max
9461                            && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9462                                                                   loceol)))
9463                     {
9464                         scan += UTF8SKIP(c1_utf8);
9465                         hardcount++;
9466                     }
9467                 }
9468                 else {
9469                     while (scan < this_eol
9470                            && hardcount < max
9471                            && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9472                                                                      loceol))
9473                                || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
9474                                                                      loceol))))
9475                     {
9476                         scan += UTF8_SAFE_SKIP(scan, loceol);
9477                         hardcount++;
9478                     }
9479                 }
9480             }
9481             else if (c1 == c2) {
9482                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1);
9483             }
9484             else {
9485                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9486                  * a conditional each time through the loop if the characters
9487                  * differ only in a single bit, as is the usual situation */
9488                 U8 c1_c2_bits_differing = c1 ^ c2;
9489
9490                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9491                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9492
9493                     scan = (char *) find_span_end_mask((U8 *) scan,
9494                                                        (U8 *) this_eol,
9495                                                        c1 & c1_c2_mask,
9496                                                        c1_c2_mask);
9497                 }
9498                 else {
9499                     while (    scan < this_eol
9500                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9501                     {
9502                         scan++;
9503                     }
9504                 }
9505             }
9506         }
9507         break;
9508     }
9509     case ANYOFPOSIXL:
9510     case ANYOFL:
9511         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9512
9513         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9514             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9515         }
9516         /* FALLTHROUGH */
9517     case ANYOFD:
9518     case ANYOF:
9519         if (utf8_target) {
9520             while (hardcount < max
9521                    && scan < this_eol
9522                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
9523             {
9524                 scan += UTF8SKIP(scan);
9525                 hardcount++;
9526             }
9527         }
9528         else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
9529             while (scan < this_eol
9530                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9531                 scan++;
9532         }
9533         else {
9534             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9535                 scan++;
9536         }
9537         break;
9538
9539     case ANYOFM:
9540         if (utf8_target && this_eol - scan > max) {
9541
9542             /* We didn't adjust <this_eol> at the beginning of this routine
9543              * because is UTF-8, but it is actually ok to do so, since here, to
9544              * match, 1 char == 1 byte. */
9545             this_eol = scan + max;
9546         }
9547
9548         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9549         break;
9550
9551     case NANYOFM:
9552         if (utf8_target) {
9553             while (     hardcount < max
9554                    &&   scan < this_eol
9555                    &&  (*scan & FLAGS(p)) != ARG(p))
9556             {
9557                 scan += UTF8SKIP(scan);
9558                 hardcount++;
9559             }
9560         }
9561         else {
9562             scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9563         }
9564         break;
9565
9566     case ANYOFH:
9567         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9568             if (ANYOF_FLAGS(p)) {   /* If we know the first byte of what
9569                                        matches, we can avoid calling reginclass
9570                                      */
9571                 while (   hardcount < max
9572                        && scan < this_eol
9573                        && (U8) *scan == ANYOF_FLAGS(p)
9574                        && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
9575                                                                   TRUE))
9576                 {
9577                     scan += UTF8SKIP(scan);
9578                     hardcount++;
9579                 }
9580             }
9581             else while (  hardcount < max
9582                         && scan < this_eol
9583                         && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9584             {
9585                 scan += UTF8SKIP(scan);
9586                 hardcount++;
9587             }
9588         }
9589         break;
9590
9591     /* The argument (FLAGS) to all the POSIX node types is the class number */
9592
9593     case NPOSIXL:
9594         to_complement = 1;
9595         /* FALLTHROUGH */
9596
9597     case POSIXL:
9598         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9599         if (! utf8_target) {
9600             while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9601                                                                    *scan)))
9602             {
9603                 scan++;
9604             }
9605         } else {
9606             while (hardcount < max && scan < this_eol
9607                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9608                                                                   (U8 *) scan,
9609                                                                   (U8 *) this_eol)))
9610             {
9611                 scan += UTF8SKIP(scan);
9612                 hardcount++;
9613             }
9614         }
9615         break;
9616
9617     case POSIXD:
9618         if (utf8_target) {
9619             goto utf8_posix;
9620         }
9621         /* FALLTHROUGH */
9622
9623     case POSIXA:
9624         if (utf8_target && this_eol - scan > max) {
9625
9626             /* We didn't adjust <this_eol> at the beginning of this routine
9627              * because is UTF-8, but it is actually ok to do so, since here, to
9628              * match, 1 char == 1 byte. */
9629             this_eol = scan + max;
9630         }
9631         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9632             scan++;
9633         }
9634         break;
9635
9636     case NPOSIXD:
9637         if (utf8_target) {
9638             to_complement = 1;
9639             goto utf8_posix;
9640         }
9641         /* FALLTHROUGH */
9642
9643     case NPOSIXA:
9644         if (! utf8_target) {
9645             while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9646                 scan++;
9647             }
9648         }
9649         else {
9650
9651             /* The complement of something that matches only ASCII matches all
9652              * non-ASCII, plus everything in ASCII that isn't in the class. */
9653             while (hardcount < max && scan < this_eol
9654                    && (   ! isASCII_utf8_safe(scan, loceol)
9655                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9656             {
9657                 scan += UTF8SKIP(scan);
9658                 hardcount++;
9659             }
9660         }
9661         break;
9662
9663     case NPOSIXU:
9664         to_complement = 1;
9665         /* FALLTHROUGH */
9666
9667     case POSIXU:
9668         if (! utf8_target) {
9669             while (scan < this_eol && to_complement
9670                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9671             {
9672                 scan++;
9673             }
9674         }
9675         else {
9676           utf8_posix:
9677             classnum = (_char_class_number) FLAGS(p);
9678             switch (classnum) {
9679                 default:
9680                     while (   hardcount < max && scan < this_eol
9681                            && to_complement ^ cBOOL(_invlist_contains_cp(
9682                                               PL_XPosix_ptrs[classnum],
9683                                               utf8_to_uvchr_buf((U8 *) scan,
9684                                                                 (U8 *) this_eol,
9685                                                                 NULL))))
9686                     {
9687                         scan += UTF8SKIP(scan);
9688                         hardcount++;
9689                     }
9690                     break;
9691
9692                     /* For the classes below, the knowledge of how to handle
9693                      * every code point is compiled in to Perl via a macro.
9694                      * This code is written for making the loops as tight as
9695                      * possible.  It could be refactored to save space instead.
9696                      * */
9697
9698                 case _CC_ENUM_SPACE:
9699                     while (hardcount < max
9700                            && scan < this_eol
9701                            && (to_complement
9702                                ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
9703                     {
9704                         scan += UTF8SKIP(scan);
9705                         hardcount++;
9706                     }
9707                     break;
9708                 case _CC_ENUM_BLANK:
9709                     while (hardcount < max
9710                            && scan < this_eol
9711                            && (to_complement
9712                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
9713                     {
9714                         scan += UTF8SKIP(scan);
9715                         hardcount++;
9716                     }
9717                     break;
9718                 case _CC_ENUM_XDIGIT:
9719                     while (hardcount < max
9720                            && scan < this_eol
9721                            && (to_complement
9722                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
9723                     {
9724                         scan += UTF8SKIP(scan);
9725                         hardcount++;
9726                     }
9727                     break;
9728                 case _CC_ENUM_VERTSPACE:
9729                     while (hardcount < max
9730                            && scan < this_eol
9731                            && (to_complement
9732                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
9733                     {
9734                         scan += UTF8SKIP(scan);
9735                         hardcount++;
9736                     }
9737                     break;
9738                 case _CC_ENUM_CNTRL:
9739                     while (hardcount < max
9740                            && scan < this_eol
9741                            && (to_complement
9742                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
9743                     {
9744                         scan += UTF8SKIP(scan);
9745                         hardcount++;
9746                     }
9747                     break;
9748             }
9749         }
9750         break;
9751
9752     case LNBREAK:
9753         if (utf8_target) {
9754             while (hardcount < max && scan < this_eol &&
9755                     (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
9756                 scan += c;
9757                 hardcount++;
9758             }
9759         } else {
9760             /* LNBREAK can match one or two latin chars, which is ok, but we
9761              * have to use hardcount in this situation, and throw away the
9762              * adjustment to <this_eol> done before the switch statement */
9763             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9764                 scan+=c;
9765                 hardcount++;
9766             }
9767         }
9768         break;
9769
9770     case BOUNDL:
9771     case NBOUNDL:
9772         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9773         /* FALLTHROUGH */
9774     case BOUND:
9775     case BOUNDA:
9776     case BOUNDU:
9777     case EOS:
9778     case GPOS:
9779     case KEEPS:
9780     case NBOUND:
9781     case NBOUNDA:
9782     case NBOUNDU:
9783     case OPFAIL:
9784     case SBOL:
9785     case SEOL:
9786         /* These are all 0 width, so match right here or not at all. */
9787         break;
9788
9789     default:
9790         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9791         NOT_REACHED; /* NOTREACHED */
9792
9793     }
9794
9795     if (hardcount)
9796         c = hardcount;
9797     else
9798         c = scan - *startposp;
9799     *startposp = scan;
9800
9801     DEBUG_r({
9802         GET_RE_DEBUG_FLAGS_DECL;
9803         DEBUG_EXECUTE_r({
9804             SV * const prop = sv_newmortal();
9805             regprop(prog, prop, p, reginfo, NULL);
9806             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9807                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9808         });
9809     });
9810
9811     return(c);
9812 }
9813
9814 /*
9815  - reginclass - determine if a character falls into a character class
9816
9817   n is the ANYOF-type regnode
9818   p is the target string
9819   p_end points to one byte beyond the end of the target string
9820   utf8_target tells whether p is in UTF-8.
9821
9822   Returns true if matched; false otherwise.
9823
9824   Note that this can be a synthetic start class, a combination of various
9825   nodes, so things you think might be mutually exclusive, such as locale,
9826   aren't.  It can match both locale and non-locale
9827
9828  */
9829
9830 STATIC bool
9831 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9832 {
9833     dVAR;
9834     const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n);
9835     bool match = FALSE;
9836     UV c = *p;
9837
9838     PERL_ARGS_ASSERT_REGINCLASS;
9839
9840     /* If c is not already the code point, get it.  Note that
9841      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9842     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9843         STRLEN c_len = 0;
9844         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9845         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9846         if (c_len == (STRLEN)-1) {
9847             _force_out_malformed_utf8_message(p, p_end,
9848                                               utf8n_flags,
9849                                               1 /* 1 means die */ );
9850             NOT_REACHED; /* NOTREACHED */
9851         }
9852         if (     c > 255
9853             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9854             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9855         {
9856             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9857         }
9858     }
9859
9860     /* If this character is potentially in the bitmap, check it */
9861     if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
9862         if (ANYOF_BITMAP_TEST(n, c))
9863             match = TRUE;
9864         else if ((flags
9865                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9866                   && OP(n) == ANYOFD
9867                   && ! utf8_target
9868                   && ! isASCII(c))
9869         {
9870             match = TRUE;
9871         }
9872         else if (flags & ANYOF_LOCALE_FLAGS) {
9873             if (  (flags & ANYOFL_FOLD)
9874                 && c < sizeof(PL_fold_locale)
9875                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9876             {
9877                 match = TRUE;
9878             }
9879             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
9880                      && c <= U8_MAX  /* param to isFOO_lc() */
9881             ) {
9882
9883                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9884                  * if the class includes the Posix character class given by
9885                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9886                  * complemented Posix class given by int(bit/2).  So we loop
9887                  * through the bits, each time changing whether we complement
9888                  * the result or not.  Suppose for the sake of illustration
9889                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9890                  * is set, it means there is a match for this ANYOF node if the
9891                  * character is in the class given by the expression (0 / 2 = 0
9892                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9893                  * and since 'to_complement' is 0, the result will stay TRUE,
9894                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9895                  * bit 1 is 1.  That means there is a match if the character
9896                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9897                  * but will on bit 1.  On the second iteration 'to_complement'
9898                  * will be 1, so the exclusive or will reverse things, so we
9899                  * are testing for \W.  On the third iteration, 'to_complement'
9900                  * will be 0, and we would be testing for \s; the fourth
9901                  * iteration would test for \S, etc.
9902                  *
9903                  * Note that this code assumes that all the classes are closed
9904                  * under folding.  For example, if a character matches \w, then
9905                  * its fold does too; and vice versa.  This should be true for
9906                  * any well-behaved locale for all the currently defined Posix
9907                  * classes, except for :lower: and :upper:, which are handled
9908                  * by the pseudo-class :cased: which matches if either of the
9909                  * other two does.  To get rid of this assumption, an outer
9910                  * loop could be used below to iterate over both the source
9911                  * character, and its fold (if different) */
9912
9913                 int count = 0;
9914                 int to_complement = 0;
9915
9916                 while (count < ANYOF_MAX) {
9917                     if (ANYOF_POSIXL_TEST(n, count)
9918                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9919                     {
9920                         match = TRUE;
9921                         break;
9922                     }
9923                     count++;
9924                     to_complement ^= 1;
9925                 }
9926             }
9927         }
9928     }
9929
9930
9931     /* If the bitmap didn't (or couldn't) match, and something outside the
9932      * bitmap could match, try that. */
9933     if (!match) {
9934         if (c >= NUM_ANYOF_CODE_POINTS
9935             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9936         {
9937             match = TRUE;       /* Everything above the bitmap matches */
9938         }
9939             /* Here doesn't match everything above the bitmap.  If there is
9940              * some information available beyond the bitmap, we may find a
9941              * match in it.  If so, this is most likely because the code point
9942              * is outside the bitmap range.  But rarely, it could be because of
9943              * some other reason.  If so, various flags are set to indicate
9944              * this possibility.  On ANYOFD nodes, there may be matches that
9945              * happen only when the target string is UTF-8; or for other node
9946              * types, because runtime lookup is needed, regardless of the
9947              * UTF-8ness of the target string.  Finally, under /il, there may
9948              * be some matches only possible if the locale is a UTF-8 one. */
9949         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9950                  && (   c >= NUM_ANYOF_CODE_POINTS
9951                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9952                          && (   UNLIKELY(OP(n) != ANYOFD)
9953                              || (utf8_target && ! isASCII_uni(c)
9954 #                               if NUM_ANYOF_CODE_POINTS > 256
9955                                                                  && c < 256
9956 #                               endif
9957                                 )))
9958                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9959                          && IN_UTF8_CTYPE_LOCALE)))
9960         {
9961             SV* only_utf8_locale = NULL;
9962             SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE,
9963                                                    0, &only_utf8_locale, NULL);
9964             if (definition) {
9965                 U8 utf8_buffer[2];
9966                 U8 * utf8_p;
9967                 if (utf8_target) {
9968                     utf8_p = (U8 *) p;
9969                 } else { /* Convert to utf8 */
9970                     utf8_p = utf8_buffer;
9971                     append_utf8_from_native_byte(*p, &utf8_p);
9972                     utf8_p = utf8_buffer;
9973                 }
9974
9975                 /* Turkish locales have these hard-coded rules overriding
9976                  * normal ones */
9977                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
9978                     && isALPHA_FOLD_EQ(*p, 'i'))
9979                 {
9980                     if (*p == 'i') {
9981                         if (_invlist_contains_cp(definition,
9982                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
9983                         {
9984                             match = TRUE;
9985                         }
9986                     }
9987                     else if (*p == 'I') {
9988                         if (_invlist_contains_cp(definition,
9989                                                 LATIN_SMALL_LETTER_DOTLESS_I))
9990                         {
9991                             match = TRUE;
9992                         }
9993                     }
9994                 }
9995                 else if (_invlist_contains_cp(definition, c)) {
9996                     match = TRUE;
9997                 }
9998             }
9999             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10000                 match = _invlist_contains_cp(only_utf8_locale, c);
10001             }
10002         }
10003
10004         /* In a Turkic locale under folding, hard-code the I i case pair
10005          * matches */
10006         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10007             && ! match
10008             &&   (flags & ANYOFL_FOLD)
10009             &&   utf8_target)
10010         {
10011             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10012                 if (ANYOF_BITMAP_TEST(n, 'i')) {
10013                     match = TRUE;
10014                 }
10015             }
10016             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10017                 if (ANYOF_BITMAP_TEST(n, 'I')) {
10018                     match = TRUE;
10019                 }
10020             }
10021         }
10022
10023         if (UNICODE_IS_SUPER(c)
10024             && (flags
10025                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10026             && OP(n) != ANYOFD
10027             && ckWARN_d(WARN_NON_UNICODE))
10028         {
10029             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10030                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10031         }
10032     }
10033
10034 #if ANYOF_INVERT != 1
10035     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10036      * use it, don't */
10037 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10038 #endif
10039
10040     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10041     return (flags & ANYOF_INVERT) ^ match;
10042 }
10043
10044 STATIC U8 *
10045 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10046 {
10047     /* return the position 'off' UTF-8 characters away from 's', forward if
10048      * 'off' >= 0, backwards if negative.  But don't go outside of position
10049      * 'lim', which better be < s  if off < 0 */
10050
10051     PERL_ARGS_ASSERT_REGHOP3;
10052
10053     if (off >= 0) {
10054         while (off-- && s < lim) {
10055             /* XXX could check well-formedness here */
10056             U8 *new_s = s + UTF8SKIP(s);
10057             if (new_s > lim) /* lim may be in the middle of a long character */
10058                 return s;
10059             s = new_s;
10060         }
10061     }
10062     else {
10063         while (off++ && s > lim) {
10064             s--;
10065             if (UTF8_IS_CONTINUED(*s)) {
10066                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10067                     s--;
10068                 if (! UTF8_IS_START(*s)) {
10069                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10070                 }
10071             }
10072             /* XXX could check well-formedness here */
10073         }
10074     }
10075     return s;
10076 }
10077
10078 STATIC U8 *
10079 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10080 {
10081     PERL_ARGS_ASSERT_REGHOP4;
10082
10083     if (off >= 0) {
10084         while (off-- && s < rlim) {
10085             /* XXX could check well-formedness here */
10086             s += UTF8SKIP(s);
10087         }
10088     }
10089     else {
10090         while (off++ && s > llim) {
10091             s--;
10092             if (UTF8_IS_CONTINUED(*s)) {
10093                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10094                     s--;
10095                 if (! UTF8_IS_START(*s)) {
10096                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10097                 }
10098             }
10099             /* XXX could check well-formedness here */
10100         }
10101     }
10102     return s;
10103 }
10104
10105 /* like reghop3, but returns NULL on overrun, rather than returning last
10106  * char pos */
10107
10108 STATIC U8 *
10109 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10110 {
10111     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10112
10113     if (off >= 0) {
10114         while (off-- && s < lim) {
10115             /* XXX could check well-formedness here */
10116             s += UTF8SKIP(s);
10117         }
10118         if (off >= 0)
10119             return NULL;
10120     }
10121     else {
10122         while (off++ && s > lim) {
10123             s--;
10124             if (UTF8_IS_CONTINUED(*s)) {
10125                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10126                     s--;
10127                 if (! UTF8_IS_START(*s)) {
10128                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10129                 }
10130             }
10131             /* XXX could check well-formedness here */
10132         }
10133         if (off <= 0)
10134             return NULL;
10135     }
10136     return s;
10137 }
10138
10139
10140 /* when executing a regex that may have (?{}), extra stuff needs setting
10141    up that will be visible to the called code, even before the current
10142    match has finished. In particular:
10143
10144    * $_ is localised to the SV currently being matched;
10145    * pos($_) is created if necessary, ready to be updated on each call-out
10146      to code;
10147    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10148      isn't set until the current pattern is successfully finished), so that
10149      $1 etc of the match-so-far can be seen;
10150    * save the old values of subbeg etc of the current regex, and  set then
10151      to the current string (again, this is normally only done at the end
10152      of execution)
10153 */
10154
10155 static void
10156 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10157 {
10158     MAGIC *mg;
10159     regexp *const rex = ReANY(reginfo->prog);
10160     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10161
10162     eval_state->rex = rex;
10163
10164     if (reginfo->sv) {
10165         /* Make $_ available to executed code. */
10166         if (reginfo->sv != DEFSV) {
10167             SAVE_DEFSV;
10168             DEFSV_set(reginfo->sv);
10169         }
10170
10171         if (!(mg = mg_find_mglob(reginfo->sv))) {
10172             /* prepare for quick setting of pos */
10173             mg = sv_magicext_mglob(reginfo->sv);
10174             mg->mg_len = -1;
10175         }
10176         eval_state->pos_magic = mg;
10177         eval_state->pos       = mg->mg_len;
10178         eval_state->pos_flags = mg->mg_flags;
10179     }
10180     else
10181         eval_state->pos_magic = NULL;
10182
10183     if (!PL_reg_curpm) {
10184         /* PL_reg_curpm is a fake PMOP that we can attach the current
10185          * regex to and point PL_curpm at, so that $1 et al are visible
10186          * within a /(?{})/. It's just allocated once per interpreter the
10187          * first time its needed */
10188         Newxz(PL_reg_curpm, 1, PMOP);
10189 #ifdef USE_ITHREADS
10190         {
10191             SV* const repointer = &PL_sv_undef;
10192             /* this regexp is also owned by the new PL_reg_curpm, which
10193                will try to free it.  */
10194             av_push(PL_regex_padav, repointer);
10195             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10196             PL_regex_pad = AvARRAY(PL_regex_padav);
10197         }
10198 #endif
10199     }
10200     SET_reg_curpm(reginfo->prog);
10201     eval_state->curpm = PL_curpm;
10202     PL_curpm_under = PL_curpm;
10203     PL_curpm = PL_reg_curpm;
10204     if (RXp_MATCH_COPIED(rex)) {
10205         /*  Here is a serious problem: we cannot rewrite subbeg,
10206             since it may be needed if this match fails.  Thus
10207             $` inside (?{}) could fail... */
10208         eval_state->subbeg     = rex->subbeg;
10209         eval_state->sublen     = rex->sublen;
10210         eval_state->suboffset  = rex->suboffset;
10211         eval_state->subcoffset = rex->subcoffset;
10212 #ifdef PERL_ANY_COW
10213         eval_state->saved_copy = rex->saved_copy;
10214 #endif
10215         RXp_MATCH_COPIED_off(rex);
10216     }
10217     else
10218         eval_state->subbeg = NULL;
10219     rex->subbeg = (char *)reginfo->strbeg;
10220     rex->suboffset = 0;
10221     rex->subcoffset = 0;
10222     rex->sublen = reginfo->strend - reginfo->strbeg;
10223 }
10224
10225
10226 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10227
10228 static void
10229 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10230 {
10231     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10232     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10233     regmatch_slab *s;
10234
10235     Safefree(aux->poscache);
10236
10237     if (eval_state) {
10238
10239         /* undo the effects of S_setup_eval_state() */
10240
10241         if (eval_state->subbeg) {
10242             regexp * const rex = eval_state->rex;
10243             rex->subbeg     = eval_state->subbeg;
10244             rex->sublen     = eval_state->sublen;
10245             rex->suboffset  = eval_state->suboffset;
10246             rex->subcoffset = eval_state->subcoffset;
10247 #ifdef PERL_ANY_COW
10248             rex->saved_copy = eval_state->saved_copy;
10249 #endif
10250             RXp_MATCH_COPIED_on(rex);
10251         }
10252         if (eval_state->pos_magic)
10253         {
10254             eval_state->pos_magic->mg_len = eval_state->pos;
10255             eval_state->pos_magic->mg_flags =
10256                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10257                | (eval_state->pos_flags & MGf_BYTES);
10258         }
10259
10260         PL_curpm = eval_state->curpm;
10261     }
10262
10263     PL_regmatch_state = aux->old_regmatch_state;
10264     PL_regmatch_slab  = aux->old_regmatch_slab;
10265
10266     /* free all slabs above current one - this must be the last action
10267      * of this function, as aux and eval_state are allocated within
10268      * slabs and may be freed here */
10269
10270     s = PL_regmatch_slab->next;
10271     if (s) {
10272         PL_regmatch_slab->next = NULL;
10273         while (s) {
10274             regmatch_slab * const osl = s;
10275             s = s->next;
10276             Safefree(osl);
10277         }
10278     }
10279 }
10280
10281
10282 STATIC void
10283 S_to_utf8_substr(pTHX_ regexp *prog)
10284 {
10285     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10286      * on the converted value */
10287
10288     int i = 1;
10289
10290     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10291
10292     do {
10293         if (prog->substrs->data[i].substr
10294             && !prog->substrs->data[i].utf8_substr) {
10295             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10296             prog->substrs->data[i].utf8_substr = sv;
10297             sv_utf8_upgrade(sv);
10298             if (SvVALID(prog->substrs->data[i].substr)) {
10299                 if (SvTAIL(prog->substrs->data[i].substr)) {
10300                     /* Trim the trailing \n that fbm_compile added last
10301                        time.  */
10302                     SvCUR_set(sv, SvCUR(sv) - 1);
10303                     /* Whilst this makes the SV technically "invalid" (as its
10304                        buffer is no longer followed by "\0") when fbm_compile()
10305                        adds the "\n" back, a "\0" is restored.  */
10306                     fbm_compile(sv, FBMcf_TAIL);
10307                 } else
10308                     fbm_compile(sv, 0);
10309             }
10310             if (prog->substrs->data[i].substr == prog->check_substr)
10311                 prog->check_utf8 = sv;
10312         }
10313     } while (i--);
10314 }
10315
10316 STATIC bool
10317 S_to_byte_substr(pTHX_ regexp *prog)
10318 {
10319     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10320      * on the converted value; returns FALSE if can't be converted. */
10321
10322     int i = 1;
10323
10324     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10325
10326     do {
10327         if (prog->substrs->data[i].utf8_substr
10328             && !prog->substrs->data[i].substr) {
10329             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10330             if (! sv_utf8_downgrade(sv, TRUE)) {
10331                 return FALSE;
10332             }
10333             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10334                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10335                     /* Trim the trailing \n that fbm_compile added last
10336                         time.  */
10337                     SvCUR_set(sv, SvCUR(sv) - 1);
10338                     fbm_compile(sv, FBMcf_TAIL);
10339                 } else
10340                     fbm_compile(sv, 0);
10341             }
10342             prog->substrs->data[i].substr = sv;
10343             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10344                 prog->check_substr = sv;
10345         }
10346     } while (i--);
10347
10348     return TRUE;
10349 }
10350
10351 #ifndef PERL_IN_XSUB_RE
10352
10353 bool
10354 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10355 {
10356     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10357      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10358      * the larger string bounded by 'strbeg' and 'strend'.
10359      *
10360      * 'cp' needs to be assigned (if not a future version of the Unicode
10361      * Standard could make it something that combines with adjacent characters,
10362      * so code using it would then break), and there has to be a GCB break
10363      * before and after the character. */
10364
10365     dVAR;
10366
10367     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10368     const U8 * prev_cp_start;
10369
10370     PERL_ARGS_ASSERT__IS_GRAPHEME;
10371
10372     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10373         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10374     {
10375         /* These are considered graphemes */
10376         return TRUE;
10377     }
10378
10379     /* Otherwise, unassigned code points are forbidden */
10380     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10381                                     _invlist_search(PL_Assigned_invlist, cp))))
10382     {
10383         return FALSE;
10384     }
10385
10386     cp_gcb_val = getGCB_VAL_CP(cp);
10387
10388     /* Find the GCB value of the previous code point in the input */
10389     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10390     if (UNLIKELY(prev_cp_start == s)) {
10391         prev_cp_gcb_val = GCB_EDGE;
10392     }
10393     else {
10394         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10395     }
10396
10397     /* And check that is a grapheme boundary */
10398     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10399                 TRUE /* is UTF-8 encoded */ ))
10400     {
10401         return FALSE;
10402     }
10403
10404     /* Similarly verify there is a break between the current character and the
10405      * following one */
10406     s += UTF8SKIP(s);
10407     if (s >= strend) {
10408         next_cp_gcb_val = GCB_EDGE;
10409     }
10410     else {
10411         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10412     }
10413
10414     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10415 }
10416
10417 /*
10418 =head1 Unicode Support
10419
10420 =for apidoc isSCRIPT_RUN
10421
10422 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10423 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10424 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10425 two degenerate cases given below, this function returns TRUE iff all code
10426 points in it come from any combination of three "scripts" given by the Unicode
10427 "Script Extensions" property: Common, Inherited, and possibly one other.
10428 Additionally all decimal digits must come from the same consecutive sequence of
10429 10.
10430
10431 For example, if all the characters in the sequence are Greek, or Common, or
10432 Inherited, this function will return TRUE, provided any decimal digits in it
10433 are from the same block of digits in Common.  (These are the ASCII digits
10434 "0".."9" and additionally a block for full width forms of these, and several
10435 others used in mathematical notation.)   For scripts (unlike Greek) that have
10436 their own digits defined this will accept either digits from that set or from
10437 one of the Common digit sets, but not a combination of the two.  Some scripts,
10438 such as Arabic, have more than one set of digits.  All digits must come from
10439 the same set for this function to return TRUE.
10440
10441 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10442 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10443 C<SCX_INVALID> if the function returns FALSE.
10444
10445 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10446 will be C<SCX_INVALID>.
10447
10448 If the sequence contains a single code point which is unassigned to a character
10449 in the version of Unicode being used, the function will return TRUE, and the
10450 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10451 in the input sequence will result in the function treating the input as not
10452 being a script run.
10453
10454 The returned script will be C<SCX_Inherited> iff all the code points in it are
10455 from the Inherited script.
10456
10457 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10458 it are from the Inherited or Common scripts.
10459
10460 =cut
10461
10462 */
10463
10464 bool
10465 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10466 {
10467     /* Basically, it looks at each character in the sequence to see if the
10468      * above conditions are met; if not it fails.  It uses an inversion map to
10469      * find the enum corresponding to the script of each character.  But this
10470      * is complicated by the fact that a few code points can be in any of
10471      * several scripts.  The data has been constructed so that there are
10472      * additional enum values (all negative) for these situations.  The
10473      * absolute value of those is an index into another table which contains
10474      * pointers to auxiliary tables for each such situation.  Each aux array
10475      * lists all the scripts for the given situation.  There is another,
10476      * parallel, table that gives the number of entries in each aux table.
10477      * These are all defined in charclass_invlists.h */
10478
10479     /* XXX Here are the additional things UTS 39 says could be done:
10480      *
10481      * Forbid sequences of the same nonspacing mark
10482      *
10483      * Check to see that all the characters are in the sets of exemplar
10484      * characters for at least one language in the Unicode Common Locale Data
10485      * Repository [CLDR]. */
10486
10487     dVAR;
10488
10489     /* Things that match /\d/u */
10490     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10491     UV * decimals_array = invlist_array(decimals_invlist);
10492
10493     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10494      * not currently known) */
10495     UV zero_of_run = 0;
10496
10497     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10498     SCX_enum script_of_char = SCX_INVALID;
10499
10500     /* If the script remains not fully determined from iteration to iteration,
10501      * this is the current intersection of the possiblities.  */
10502     SCX_enum * intersection = NULL;
10503     PERL_UINT_FAST8_T intersection_len = 0;
10504
10505     bool retval = TRUE;
10506     SCX_enum * ret_script = NULL;
10507
10508     assert(send >= s);
10509
10510     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10511
10512     /* All code points in 0..255 are either Common or Latin, so must be a
10513      * script run.  We can return immediately unless we need to know which
10514      * script it is. */
10515     if (! utf8_target && LIKELY(send > s)) {
10516         if (ret_script == NULL) {
10517             return TRUE;
10518         }
10519
10520         /* If any character is Latin, the run is Latin */
10521         while (s < send) {
10522             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10523                 *ret_script = SCX_Latin;
10524                 return TRUE;
10525             }
10526         }
10527
10528         /* Here, all are Common */
10529         *ret_script = SCX_Common;
10530         return TRUE;
10531     }
10532
10533     /* Look at each character in the sequence */
10534     while (s < send) {
10535         /* If the current character being examined is a digit, this is the code
10536          * point of the zero for its sequence of 10 */
10537         UV zero_of_char;
10538
10539         UV cp;
10540
10541         /* The code allows all scripts to use the ASCII digits.  This is
10542          * because they are in the Common script.  Hence any ASCII ones found
10543          * are ok, unless and until a digit from another set has already been
10544          * encountered.  digit ranges in Common are not similarly blessed) */
10545         if (UNLIKELY(isDIGIT(*s))) {
10546             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10547                 retval = FALSE;
10548                 break;
10549             }
10550             if (zero_of_run) {
10551                 if (zero_of_run != '0') {
10552                     retval = FALSE;
10553                     break;
10554                 }
10555             }
10556             else {
10557                 zero_of_run = '0';
10558             }
10559             s++;
10560             continue;
10561         }
10562
10563         /* Here, isn't an ASCII digit.  Find the code point of the character */
10564         if (! UTF8_IS_INVARIANT(*s)) {
10565             Size_t len;
10566             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10567             s += len;
10568         }
10569         else {
10570             cp = *(s++);
10571         }
10572
10573         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10574          * digit in that script.  We can skip the rest of this code for this
10575          * character. */
10576         if (UNLIKELY(   zero_of_run
10577                      && cp >= zero_of_run
10578                      && cp - zero_of_run <= 9))
10579         {
10580             continue;
10581         }
10582
10583         /* Find the character's script.  The correct values are hard-coded here
10584          * for small-enough code points. */
10585         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10586                                unlikely to change */
10587             if (       cp > 255
10588                 || (   isALPHA_L1(cp)
10589                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10590             {
10591                 script_of_char = SCX_Latin;
10592             }
10593             else {
10594                 script_of_char = SCX_Common;
10595             }
10596         }
10597         else {
10598             script_of_char = _Perl_SCX_invmap[
10599                                        _invlist_search(PL_SCX_invlist, cp)];
10600         }
10601
10602         /* We arbitrarily accept a single unassigned character, but not in
10603          * combination with anything else, and not a run of them. */
10604         if (   UNLIKELY(script_of_run == SCX_Unknown)
10605             || UNLIKELY(   script_of_run != SCX_INVALID
10606                         && script_of_char == SCX_Unknown))
10607         {
10608             retval = FALSE;
10609             break;
10610         }
10611
10612         /* For the first character, or the run is inherited, the run's script
10613          * is set to the char's */
10614         if (   UNLIKELY(script_of_run == SCX_INVALID)
10615             || UNLIKELY(script_of_run == SCX_Inherited))
10616         {
10617             script_of_run = script_of_char;
10618         }
10619
10620         /* For the character's script to be Unknown, it must be the first
10621          * character in the sequence (for otherwise a test above would have
10622          * prevented us from reaching here), and we have set the run's script
10623          * to it.  Nothing further to be done for this character */
10624         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10625             continue;
10626         }
10627
10628         /* We accept 'inherited' script characters currently even at the
10629          * beginning.  (We know that no characters in Inherited are digits, or
10630          * we'd have to check for that) */
10631         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10632             continue;
10633         }
10634
10635         /* If the run so far is Common, and the new character isn't, change the
10636          * run's script to that of this character */
10637         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10638             script_of_run = script_of_char;
10639         }
10640
10641         /* Now we can see if the script of the new character is the same as
10642          * that of the run */
10643         if (LIKELY(script_of_char == script_of_run)) {
10644             /* By far the most common case */
10645             goto scripts_match;
10646         }
10647
10648         /* Here, the script of the run isn't Common.  But characters in Common
10649          * match any script */
10650         if (script_of_char == SCX_Common) {
10651             goto scripts_match;
10652         }
10653
10654 #ifndef HAS_SCX_AUX_TABLES
10655
10656         /* Too early a Unicode version to have a code point belonging to more
10657          * than one script, so, if the scripts don't exactly match, fail */
10658         PERL_UNUSED_VAR(intersection_len);
10659         retval = FALSE;
10660         break;
10661
10662 #else
10663
10664         /* Here there is no exact match between the character's script and the
10665          * run's.  And we've handled the special cases of scripts Unknown,
10666          * Inherited, and Common.
10667          *
10668          * Negative script numbers signify that the value may be any of several
10669          * scripts, and we need to look at auxiliary information to make our
10670          * deterimination.  But if both are non-negative, we can fail now */
10671         if (LIKELY(script_of_char >= 0)) {
10672             const SCX_enum * search_in;
10673             PERL_UINT_FAST8_T search_in_len;
10674             PERL_UINT_FAST8_T i;
10675
10676             if (LIKELY(script_of_run >= 0)) {
10677                 retval = FALSE;
10678                 break;
10679             }
10680
10681             /* Use the previously constructed set of possible scripts, if any.
10682              * */
10683             if (intersection) {
10684                 search_in = intersection;
10685                 search_in_len = intersection_len;
10686             }
10687             else {
10688                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10689                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10690             }
10691
10692             for (i = 0; i < search_in_len; i++) {
10693                 if (search_in[i] == script_of_char) {
10694                     script_of_run = script_of_char;
10695                     goto scripts_match;
10696                 }
10697             }
10698
10699             retval = FALSE;
10700             break;
10701         }
10702         else if (LIKELY(script_of_run >= 0)) {
10703             /* script of character could be one of several, but run is a single
10704              * script */
10705             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10706             const PERL_UINT_FAST8_T search_in_len
10707                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10708             PERL_UINT_FAST8_T i;
10709
10710             for (i = 0; i < search_in_len; i++) {
10711                 if (search_in[i] == script_of_run) {
10712                     script_of_char = script_of_run;
10713                     goto scripts_match;
10714                 }
10715             }
10716
10717             retval = FALSE;
10718             break;
10719         }
10720         else {
10721             /* Both run and char could be in one of several scripts.  If the
10722              * intersection is empty, then this character isn't in this script
10723              * run.  Otherwise, we need to calculate the intersection to use
10724              * for future iterations of the loop, unless we are already at the
10725              * final character */
10726             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10727             const PERL_UINT_FAST8_T char_len
10728                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10729             const SCX_enum * search_run;
10730             PERL_UINT_FAST8_T run_len;
10731
10732             SCX_enum * new_overlap = NULL;
10733             PERL_UINT_FAST8_T i, j;
10734
10735             if (intersection) {
10736                 search_run = intersection;
10737                 run_len = intersection_len;
10738             }
10739             else {
10740                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10741                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10742             }
10743
10744             intersection_len = 0;
10745
10746             for (i = 0; i < run_len; i++) {
10747                 for (j = 0; j < char_len; j++) {
10748                     if (search_run[i] == search_char[j]) {
10749
10750                         /* Here, the script at i,j matches.  That means this
10751                          * character is in the run.  But continue on to find
10752                          * the complete intersection, for the next loop
10753                          * iteration, and for the digit check after it.
10754                          *
10755                          * On the first found common script, we malloc space
10756                          * for the intersection list for the worst case of the
10757                          * intersection, which is the minimum of the number of
10758                          * scripts remaining in each set. */
10759                         if (intersection_len == 0) {
10760                             Newx(new_overlap,
10761                                  MIN(run_len - i, char_len - j),
10762                                  SCX_enum);
10763                         }
10764                         new_overlap[intersection_len++] = search_run[i];
10765                     }
10766                 }
10767             }
10768
10769             /* Here we've looked through everything.  If they have no scripts
10770              * in common, not a run */
10771             if (intersection_len == 0) {
10772                 retval = FALSE;
10773                 break;
10774             }
10775
10776             /* If there is only a single script in common, set to that.
10777              * Otherwise, use the intersection going forward */
10778             Safefree(intersection);
10779             intersection = NULL;
10780             if (intersection_len == 1) {
10781                 script_of_run = script_of_char = new_overlap[0];
10782                 Safefree(new_overlap);
10783                 new_overlap = NULL;
10784             }
10785             else {
10786                 intersection = new_overlap;
10787             }
10788         }
10789
10790 #endif
10791
10792   scripts_match:
10793
10794         /* Here, the script of the character is compatible with that of the
10795          * run.  That means that in most cases, it continues the script run.
10796          * Either it and the run match exactly, or one or both can be in any of
10797          * several scripts, and the intersection is not empty.  However, if the
10798          * character is a decimal digit, it could still mean failure if it is
10799          * from the wrong sequence of 10.  So, we need to look at if it's a
10800          * digit.  We've already handled the 10 decimal digits, and the next
10801          * lowest one is this one: */
10802         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10803             continue;   /* Not a digit; this character is part of the run */
10804         }
10805
10806         /* If we have a definitive '0' for the script of this character, we
10807          * know that for this to be a digit, it must be in the range of +0..+9
10808          * of that zero. */
10809         if (   script_of_char >= 0
10810             && (zero_of_char = script_zeros[script_of_char]))
10811         {
10812             if (   cp < zero_of_char
10813                 || cp > zero_of_char + 9)
10814             {
10815                 continue;   /* Not a digit; this character is part of the run
10816                              */
10817             }
10818
10819         }
10820         else {  /* Need to look up if this character is a digit or not */
10821             SSize_t index_of_zero_of_char;
10822             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10823             if (     UNLIKELY(index_of_zero_of_char < 0)
10824                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10825             {
10826                 continue;   /* Not a digit; this character is part of the run.
10827                              */
10828             }
10829
10830             zero_of_char = decimals_array[index_of_zero_of_char];
10831         }
10832
10833         /* Here, the character is a decimal digit, and the zero of its sequence
10834          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
10835          * they better be the same. */
10836         if (zero_of_run) {
10837             if (zero_of_run != zero_of_char) {
10838                 retval = FALSE;
10839                 break;
10840             }
10841         }
10842         else {  /* Otherwise we now have a zero for this run */
10843             zero_of_run = zero_of_char;
10844         }
10845     } /* end of looping through CLOSESR text */
10846
10847     Safefree(intersection);
10848
10849     if (ret_script != NULL) {
10850         if (retval) {
10851             *ret_script = script_of_run;
10852         }
10853         else {
10854             *ret_script = SCX_INVALID;
10855         }
10856     }
10857
10858     return retval;
10859 }
10860
10861 #endif /* ifndef PERL_IN_XSUB_RE */
10862
10863 /*
10864  * ex: set ts=8 sts=4 sw=4 et:
10865  */