regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char non_utf8_target_but_utf8_required[]
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /*
 111  * Forwards.
 112  */
 113
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115
 116 #define HOPc(pos,off) \
 117         (char *)(reginfo->is_utf8_target \
 118             ? reghop3((U8*)pos, off, \
 119                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 120             : (U8*)(pos + off))
 121
 122 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 123 #define HOPBACK3(pos, off, lim) \
 124         (reginfo->is_utf8_target                          \
 125             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 126             : (pos - off >= lim)                                 \
 127                 ? (U8*)pos - off                                 \
 128                 : NULL)
 129
 130 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 131
 132 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 133 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 134
 135 /* lim must be +ve. Returns NULL on overshoot */
 136 #define HOPMAYBE3(pos,off,lim) \
 137         (reginfo->is_utf8_target                        \
 138             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 139             : ((U8*)pos + off <= lim)                   \
 140                 ? (U8*)pos + off                        \
 141                 : NULL)
 142
 143 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 144  * off must be >=0; args should be vars rather than expressions */
 145 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 146     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 147     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 148 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 149
 150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 152     : (U8*)(pos + off))
 153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 154
 155 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 156 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 157
 158 /* for use after a quantifier and before an EXACT-like node -- japhy */
 159 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 160  *
 161  * NOTE that *nothing* that affects backtracking should be in here, specifically
 162  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 163  * node that is in between two EXACT like nodes when ascertaining what the required
 164  * "follow" character is. This should probably be moved to regex compile time
 165  * although it may be done at run time beause of the REF possibility - more
 166  * investigation required. -- demerphq
 167 */
 168 #define JUMPABLE(rn) (                                                             \
 169     OP(rn) == OPEN ||                                                              \
 170     (OP(rn) == CLOSE &&                                                            \
 171      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 172     OP(rn) == EVAL ||                                                              \
 173     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 174     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 175     OP(rn) == KEEPS ||                                                             \
 176     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 177 )
 178 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 179
 180 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 181
 182 /*
 183   Search for mandatory following text node; for lookahead, the text must
 184   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 185 */
 186 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 187     while (JUMPABLE(rn)) { \
 188         const OPCODE type = OP(rn); \
 189         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 190             rn = NEXTOPER(NEXTOPER(rn)); \
 191         else if (type == PLUS) \
 192             rn = NEXTOPER(rn); \
 193         else if (type == IFMATCH) \
 194             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 195         else rn += NEXT_OFF(rn); \
 196     } \
 197 } STMT_END
 198
 199 #define SLAB_FIRST(s) (&(s)->states[0])
 200 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 201
 202 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 203 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 204 static regmatch_state * S_push_slab(pTHX);
 205
 206 #define REGCP_PAREN_ELEMS 3
 207 #define REGCP_OTHER_ELEMS 3
 208 #define REGCP_FRAME_ELEMS 1
 209 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 210  * are needed for the regexp context stack bookkeeping. */
 211
 212 STATIC CHECKPOINT
 213 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 214 {
 215     const int retval = PL_savestack_ix;
 216     const int paren_elems_to_push =
 217                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 218     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 219     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 220     I32 p;
 221     DECLARE_AND_GET_RE_DEBUG_FLAGS;
 222
 223     PERL_ARGS_ASSERT_REGCPPUSH;
 224
 225     if (paren_elems_to_push < 0)
 226         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 227                    (int)paren_elems_to_push, (int)maxopenparen,
 228                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 229
 230     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 231         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 232                    " out of range (%lu-%ld)",
 233                    total_elems,
 234                    (unsigned long)maxopenparen,
 235                    (long)parenfloor);
 236
 237     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 238
 239     DEBUG_BUFFERS_r(
 240         if ((int)maxopenparen > (int)parenfloor)
 241             Perl_re_exec_indentf( aTHX_
 242                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 243                 depth,
 244                 PTR2UV(rex),
 245                 PTR2UV(rex->offs)
 246             );
 247     );
 248     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 249 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 250         SSPUSHIV(rex->offs[p].end);
 251         SSPUSHIV(rex->offs[p].start);
 252         SSPUSHINT(rex->offs[p].start_tmp);
 253         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 254             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 255             depth,
 256             (UV)p,
 257             (IV)rex->offs[p].start,
 258             (IV)rex->offs[p].start_tmp,
 259             (IV)rex->offs[p].end
 260         ));
 261     }
 262 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 263     SSPUSHINT(maxopenparen);
 264     SSPUSHINT(rex->lastparen);
 265     SSPUSHINT(rex->lastcloseparen);
 266     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 267
 268     return retval;
 269 }
 270
 271 /* These are needed since we do not localize EVAL nodes: */
 272 #define REGCP_SET(cp)                                           \
 273     DEBUG_STATE_r(                                              \
 274         Perl_re_exec_indentf( aTHX_                             \
 275             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 276             depth, (IV)PL_savestack_ix                          \
 277         )                                                       \
 278     );                                                          \
 279     cp = PL_savestack_ix
 280
 281 #define REGCP_UNWIND(cp)                                        \
 282     DEBUG_STATE_r(                                              \
 283         if (cp != PL_savestack_ix)                              \
 284             Perl_re_exec_indentf( aTHX_                         \
 285                 "Clearing an EVAL scope, savestack=%"           \
 286                 IVdf "..%" IVdf "\n",                           \
 287                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 288             )                                                   \
 289     );                                                          \
 290     regcpblow(cp)
 291
 292 /* set the start and end positions of capture ix */
 293 #define CLOSE_CAPTURE(ix, s, e)                                            \
 294     rex->offs[ix].start = s;                                               \
 295     rex->offs[ix].end = e;                                                 \
 296     if (ix > rex->lastparen)                                               \
 297         rex->lastparen = ix;                                               \
 298     rex->lastcloseparen = ix;                                              \
 299     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 300         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 301         depth,                                                             \
 302         PTR2UV(rex),                                                       \
 303         PTR2UV(rex->offs),                                                 \
 304         (UV)ix,                                                            \
 305         (IV)rex->offs[ix].start,                                           \
 306         (IV)rex->offs[ix].end,                                             \
 307         (UV)rex->lastparen                                                 \
 308     ))
 309
 310 #define UNWIND_PAREN(lp, lcp)               \
 311     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 312         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 313         depth,                              \
 314         PTR2UV(rex),                        \
 315         PTR2UV(rex->offs),                  \
 316         (UV)(lp),                           \
 317         (UV)(rex->lastparen),               \
 318         (UV)(lcp)                           \
 319     ));                                     \
 320     for (n = rex->lastparen; n > lp; n--)   \
 321         rex->offs[n].end = -1;              \
 322     rex->lastparen = n;                     \
 323     rex->lastcloseparen = lcp;
 324
 325
 326 STATIC void
 327 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 328 {
 329     UV i;
 330     U32 paren;
 331     DECLARE_AND_GET_RE_DEBUG_FLAGS;
 332
 333     PERL_ARGS_ASSERT_REGCPPOP;
 334
 335     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 336     i = SSPOPUV;
 337     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 338     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 339     rex->lastcloseparen = SSPOPINT;
 340     rex->lastparen = SSPOPINT;
 341     *maxopenparen_p = SSPOPINT;
 342
 343     i -= REGCP_OTHER_ELEMS;
 344     /* Now restore the parentheses context. */
 345     DEBUG_BUFFERS_r(
 346         if (i || rex->lastparen + 1 <= rex->nparens)
 347             Perl_re_exec_indentf( aTHX_
 348                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 349                 depth,
 350                 PTR2UV(rex),
 351                 PTR2UV(rex->offs)
 352             );
 353     );
 354     paren = *maxopenparen_p;
 355     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 356         SSize_t tmps;
 357         rex->offs[paren].start_tmp = SSPOPINT;
 358         rex->offs[paren].start = SSPOPIV;
 359         tmps = SSPOPIV;
 360         if (paren <= rex->lastparen)
 361             rex->offs[paren].end = tmps;
 362         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 363             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 364             depth,
 365             (UV)paren,
 366             (IV)rex->offs[paren].start,
 367             (IV)rex->offs[paren].start_tmp,
 368             (IV)rex->offs[paren].end,
 369             (paren > rex->lastparen ? "(skipped)" : ""));
 370         );
 371         paren--;
 372     }
 373 #if 1
 374     /* It would seem that the similar code in regtry()
 375      * already takes care of this, and in fact it is in
 376      * a better location to since this code can #if 0-ed out
 377      * but the code in regtry() is needed or otherwise tests
 378      * requiring null fields (pat.t#187 and split.t#{13,14}
 379      * (as of patchlevel 7877)  will fail.  Then again,
 380      * this code seems to be necessary or otherwise
 381      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 382      * --jhi updated by dapm */
 383     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 384         if (i > *maxopenparen_p)
 385             rex->offs[i].start = -1;
 386         rex->offs[i].end = -1;
 387         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 388             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 389             depth,
 390             (UV)i,
 391             (i > *maxopenparen_p) ? "-1" : "  "
 392         ));
 393     }
 394 #endif
 395 }
 396
 397 /* restore the parens and associated vars at savestack position ix,
 398  * but without popping the stack */
 399
 400 STATIC void
 401 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 402 {
 403     I32 tmpix = PL_savestack_ix;
 404     PERL_ARGS_ASSERT_REGCP_RESTORE;
 405
 406     PL_savestack_ix = ix;
 407     regcppop(rex, maxopenparen_p);
 408     PL_savestack_ix = tmpix;
 409 }
 410
 411 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 412
 413 #ifndef PERL_IN_XSUB_RE
 414
 415 bool
 416 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 417 {
 418     /* Returns a boolean as to whether or not 'character' is a member of the
 419      * Posix character class given by 'classnum' that should be equivalent to a
 420      * value in the typedef '_char_class_number'.
 421      *
 422      * Ideally this could be replaced by a just an array of function pointers
 423      * to the C library functions that implement the macros this calls.
 424      * However, to compile, the precise function signatures are required, and
 425      * these may vary from platform to to platform.  To avoid having to figure
 426      * out what those all are on each platform, I (khw) am using this method,
 427      * which adds an extra layer of function call overhead (unless the C
 428      * optimizer strips it away).  But we don't particularly care about
 429      * performance with locales anyway. */
 430
 431     switch ((_char_class_number) classnum) {
 432         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 433         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 434         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 435         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 436         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 437                                         || isUPPER_LC(character);
 438         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 439         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 440         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 441         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 442         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 443         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 444         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 445         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 446         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 447         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 448         default:    /* VERTSPACE should never occur in locales */
 449             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 450     }
 451
 452     NOT_REACHED; /* NOTREACHED */
 453     return FALSE;
 454 }
 455
 456 #endif
 457
 458 PERL_STATIC_INLINE I32
 459 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
 460 {
 461     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
 462      * folded.  Works on all folds representable without UTF-8, except for
 463      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
 464      * check that the strings each have at least 'len' characters.
 465      *
 466      * There is almost an identical API function where s2 need not be folded:
 467      * Perl_foldEQ_latin1() */
 468
 469     const U8 *a = (const U8 *)s1;
 470     const U8 *b = (const U8 *)s2;
 471
 472     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
 473
 474     assert(len >= 0);
 475
 476     while (len--) {
 477         assert(! isUPPER_L1(*b));
 478         if (toLOWER_L1(*a) != *b) {
 479             return 0;
 480         }
 481         a++, b++;
 482     }
 483     return 1;
 484 }
 485
 486 STATIC bool
 487 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 488 {
 489     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 490      * 'character' is a member of the Posix character class given by 'classnum'
 491      * that should be equivalent to a value in the typedef
 492      * '_char_class_number'.
 493      *
 494      * This just calls isFOO_lc on the code point for the character if it is in
 495      * the range 0-255.  Outside that range, all characters use Unicode
 496      * rules, ignoring any locale.  So use the Unicode function if this class
 497      * requires an inversion list, and use the Unicode macro otherwise. */
 498
 499     dVAR;
 500
 501     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 502
 503     if (UTF8_IS_INVARIANT(*character)) {
 504         return isFOO_lc(classnum, *character);
 505     }
 506     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 507         return isFOO_lc(classnum,
 508                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 509     }
 510
 511     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 512
 513     switch ((_char_class_number) classnum) {
 514         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 515         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 516         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 517         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 518         default:
 519             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 520                                         utf8_to_uvchr_buf(character, e, NULL));
 521     }
 522
 523     return FALSE; /* Things like CNTRL are always below 256 */
 524 }
 525
 526 STATIC U8 *
 527 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 528 {
 529     /* Returns the position of the first byte in the sequence between 's' and
 530      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 531      * */
 532
 533     PERL_ARGS_ASSERT_FIND_SPAN_END;
 534
 535     assert(send >= s);
 536
 537     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 538                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 539                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 540     {
 541         PERL_UINTMAX_T span_word;
 542
 543         /* Process per-byte until reach word boundary.  XXX This loop could be
 544          * eliminated if we knew that this platform had fast unaligned reads */
 545         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 546             if (*s != span_byte) {
 547                 return s;
 548             }
 549             s++;
 550         }
 551
 552         /* Create a word filled with the bytes we are spanning */
 553         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 554
 555         /* Process per-word as long as we have at least a full word left */
 556         do {
 557
 558             /* Keep going if the whole word is composed of 'span_byte's */
 559             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 560                 s += PERL_WORDSIZE;
 561                 continue;
 562             }
 563
 564             /* Here, at least one byte in the word isn't 'span_byte'. */
 565
 566 #ifdef EBCDIC
 567
 568             break;
 569
 570 #else
 571
 572             /* This xor leaves 1 bits only in those non-matching bytes */
 573             span_word ^= * (PERL_UINTMAX_T *) s;
 574
 575             /* Make sure the upper bit of each non-matching byte is set.  This
 576              * makes each such byte look like an ASCII platform variant byte */
 577             span_word |= span_word << 1;
 578             span_word |= span_word << 2;
 579             span_word |= span_word << 4;
 580
 581             /* That reduces the problem to what this function solves */
 582             return s + variant_byte_number(span_word);
 583
 584 #endif
 585
 586         } while (s + PERL_WORDSIZE <= send);
 587     }
 588
 589     /* Process the straggler bytes beyond the final word boundary */
 590     while (s < send) {
 591         if (*s != span_byte) {
 592             return s;
 593         }
 594         s++;
 595     }
 596
 597     return s;
 598 }
 599
 600 STATIC U8 *
 601 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 602 {
 603     /* Returns the position of the first byte in the sequence between 's'
 604      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 605      * returns 'send' if none found.  It uses word-level operations instead of
 606      * byte to speed up the process */
 607
 608     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 609
 610     assert(send >= s);
 611     assert((byte & mask) == byte);
 612
 613 #ifndef EBCDIC
 614
 615     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 616                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 617                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 618     {
 619         PERL_UINTMAX_T word, mask_word;
 620
 621         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 622             if (((*s) & mask) == byte) {
 623                 return s;
 624             }
 625             s++;
 626         }
 627
 628         word      = PERL_COUNT_MULTIPLIER * byte;
 629         mask_word = PERL_COUNT_MULTIPLIER * mask;
 630
 631         do {
 632             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 633
 634             /* If 'masked' contains bytes with the bit pattern of 'byte' within
 635              * it, xoring with 'word' will leave each of the 8 bits in such
 636              * bytes be 0, and no byte containing any other bit pattern will be
 637              * 0. */
 638             masked ^= word;
 639
 640             /* This causes the most significant bit to be set to 1 for any
 641              * bytes in the word that aren't completely 0 */
 642             masked |= masked << 1;
 643             masked |= masked << 2;
 644             masked |= masked << 4;
 645
 646             /* The msbits are the same as what marks a byte as variant, so we
 647              * can use this mask.  If all msbits are 1, the word doesn't
 648              * contain 'byte' */
 649             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 650                 s += PERL_WORDSIZE;
 651                 continue;
 652             }
 653
 654             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 655              * and any that are, are 0.  Complement and re-AND to swap that */
 656             masked = ~ masked;
 657             masked &= PERL_VARIANTS_WORD_MASK;
 658
 659             /* This reduces the problem to that solved by this function */
 660             s += variant_byte_number(masked);
 661             return s;
 662
 663         } while (s + PERL_WORDSIZE <= send);
 664     }
 665
 666 #endif
 667
 668     while (s < send) {
 669         if (((*s) & mask) == byte) {
 670             return s;
 671         }
 672         s++;
 673     }
 674
 675     return s;
 676 }
 677
 678 STATIC U8 *
 679 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 680 {
 681     /* Returns the position of the first byte in the sequence between 's' and
 682      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 683      * 'span_byte' should have been ANDed with 'mask' in the call of this
 684      * function.  Returns 'send' if none found.  Works like find_span_end(),
 685      * except for the AND */
 686
 687     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 688
 689     assert(send >= s);
 690     assert((span_byte & mask) == span_byte);
 691
 692     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 693                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 694                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 695     {
 696         PERL_UINTMAX_T span_word, mask_word;
 697
 698         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 699             if (((*s) & mask) != span_byte) {
 700                 return s;
 701             }
 702             s++;
 703         }
 704
 705         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 706         mask_word = PERL_COUNT_MULTIPLIER * mask;
 707
 708         do {
 709             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 710
 711             if (masked == span_word) {
 712                 s += PERL_WORDSIZE;
 713                 continue;
 714             }
 715
 716 #ifdef EBCDIC
 717
 718             break;
 719
 720 #else
 721
 722             masked ^= span_word;
 723             masked |= masked << 1;
 724             masked |= masked << 2;
 725             masked |= masked << 4;
 726             return s + variant_byte_number(masked);
 727
 728 #endif
 729
 730         } while (s + PERL_WORDSIZE <= send);
 731     }
 732
 733     while (s < send) {
 734         if (((*s) & mask) != span_byte) {
 735             return s;
 736         }
 737         s++;
 738     }
 739
 740     return s;
 741 }
 742
 743 /*
 744  * pregexec and friends
 745  */
 746
 747 #ifndef PERL_IN_XSUB_RE
 748 /*
 749  - pregexec - match a regexp against a string
 750  */
 751 I32
 752 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 753          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 754 /* stringarg: the point in the string at which to begin matching */
 755 /* strend:    pointer to null at end of string */
 756 /* strbeg:    real beginning of string */
 757 /* minend:    end of match must be >= minend bytes after stringarg. */
 758 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 759  *            itself is accessed via the pointers above */
 760 /* nosave:    For optimizations. */
 761 {
 762     PERL_ARGS_ASSERT_PREGEXEC;
 763
 764     return
 765         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 766                       nosave ? 0 : REXEC_COPY_STR);
 767 }
 768 #endif
 769
 770
 771
 772 /* re_intuit_start():
 773  *
 774  * Based on some optimiser hints, try to find the earliest position in the
 775  * string where the regex could match.
 776  *
 777  *   rx:     the regex to match against
 778  *   sv:     the SV being matched: only used for utf8 flag; the string
 779  *           itself is accessed via the pointers below. Note that on
 780  *           something like an overloaded SV, SvPOK(sv) may be false
 781  *           and the string pointers may point to something unrelated to
 782  *           the SV itself.
 783  *   strbeg: real beginning of string
 784  *   strpos: the point in the string at which to begin matching
 785  *   strend: pointer to the byte following the last char of the string
 786  *   flags   currently unused; set to 0
 787  *   data:   currently unused; set to NULL
 788  *
 789  * The basic idea of re_intuit_start() is to use some known information
 790  * about the pattern, namely:
 791  *
 792  *   a) the longest known anchored substring (i.e. one that's at a
 793  *      constant offset from the beginning of the pattern; but not
 794  *      necessarily at a fixed offset from the beginning of the
 795  *      string);
 796  *   b) the longest floating substring (i.e. one that's not at a constant
 797  *      offset from the beginning of the pattern);
 798  *   c) Whether the pattern is anchored to the string; either
 799  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 800  *      or anchored to pos(): /\G/;
 801  *   d) A start class: a real or synthetic character class which
 802  *      represents which characters are legal at the start of the pattern;
 803  *
 804  * to either quickly reject the match, or to find the earliest position
 805  * within the string at which the pattern might match, thus avoiding
 806  * running the full NFA engine at those earlier locations, only to
 807  * eventually fail and retry further along.
 808  *
 809  * Returns NULL if the pattern can't match, or returns the address within
 810  * the string which is the earliest place the match could occur.
 811  *
 812  * The longest of the anchored and floating substrings is called 'check'
 813  * and is checked first. The other is called 'other' and is checked
 814  * second. The 'other' substring may not be present.  For example,
 815  *
 816  *    /(abc|xyz)ABC\d{0,3}DEFG/
 817  *
 818  * will have
 819  *
 820  *   check substr (float)    = "DEFG", offset 6..9 chars
 821  *   other substr (anchored) = "ABC",  offset 3..3 chars
 822  *   stclass = [ax]
 823  *
 824  * Be aware that during the course of this function, sometimes 'anchored'
 825  * refers to a substring being anchored relative to the start of the
 826  * pattern, and sometimes to the pattern itself being anchored relative to
 827  * the string. For example:
 828  *
 829  *   /\dabc/:   "abc" is anchored to the pattern;
 830  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 831  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 832  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 833  *                    but the pattern is anchored to the string.
 834  */
 835
 836 char *
 837 Perl_re_intuit_start(pTHX_
 838                     REGEXP * const rx,
 839                     SV *sv,
 840                     const char * const strbeg,
 841                     char *strpos,
 842                     char *strend,
 843                     const U32 flags,
 844                     re_scream_pos_data *data)
 845 {
 846     struct regexp *const prog = ReANY(rx);
 847     SSize_t start_shift = prog->check_offset_min;
 848     /* Should be nonnegative! */
 849     SSize_t end_shift   = 0;
 850     /* current lowest pos in string where the regex can start matching */
 851     char *rx_origin = strpos;
 852     SV *check;
 853     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 854     U8   other_ix = 1 - prog->substrs->check_ix;
 855     bool ml_anch = 0;
 856     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 857     char *check_at = NULL;              /* check substr found at this pos */
 858     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 859     RXi_GET_DECL(prog,progi);
 860     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 861     regmatch_info *const reginfo = &reginfo_buf;
 862     DECLARE_AND_GET_RE_DEBUG_FLAGS;
 863
 864     PERL_ARGS_ASSERT_RE_INTUIT_START;
 865     PERL_UNUSED_ARG(flags);
 866     PERL_UNUSED_ARG(data);
 867
 868     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 869                 "Intuit: trying to determine minimum start position...\n"));
 870
 871     /* for now, assume that all substr offsets are positive. If at some point
 872      * in the future someone wants to do clever things with lookbehind and
 873      * -ve offsets, they'll need to fix up any code in this function
 874      * which uses these offsets. See the thread beginning
 875      * <20140113145929.GF27210@iabyn.com>
 876      */
 877     assert(prog->substrs->data[0].min_offset >= 0);
 878     assert(prog->substrs->data[0].max_offset >= 0);
 879     assert(prog->substrs->data[1].min_offset >= 0);
 880     assert(prog->substrs->data[1].max_offset >= 0);
 881     assert(prog->substrs->data[2].min_offset >= 0);
 882     assert(prog->substrs->data[2].max_offset >= 0);
 883
 884     /* for now, assume that if both present, that the floating substring
 885      * doesn't start before the anchored substring.
 886      * If you break this assumption (e.g. doing better optimisations
 887      * with lookahead/behind), then you'll need to audit the code in this
 888      * function carefully first
 889      */
 890     assert(
 891             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 892               && (prog->float_utf8    || prog->float_substr))
 893            || (prog->float_min_offset >= prog->anchored_offset));
 894
 895     /* byte rather than char calculation for efficiency. It fails
 896      * to quickly reject some cases that can't match, but will reject
 897      * them later after doing full char arithmetic */
 898     if (prog->minlen > strend - strpos) {
 899         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 900                               "  String too short...\n"));
 901         goto fail;
 902     }
 903
 904     RXp_MATCH_UTF8_set(prog, utf8_target);
 905     reginfo->is_utf8_target = cBOOL(utf8_target);
 906     reginfo->info_aux = NULL;
 907     reginfo->strbeg = strbeg;
 908     reginfo->strend = strend;
 909     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 910     reginfo->intuit = 1;
 911     /* not actually used within intuit, but zero for safety anyway */
 912     reginfo->poscache_maxiter = 0;
 913
 914     if (utf8_target) {
 915         if ((!prog->anchored_utf8 && prog->anchored_substr)
 916                 || (!prog->float_utf8 && prog->float_substr))
 917             to_utf8_substr(prog);
 918         check = prog->check_utf8;
 919     } else {
 920         if (!prog->check_substr && prog->check_utf8) {
 921             if (! to_byte_substr(prog)) {
 922                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 923             }
 924         }
 925         check = prog->check_substr;
 926     }
 927
 928     /* dump the various substring data */
 929     DEBUG_OPTIMISE_MORE_r({
 930         int i;
 931         for (i=0; i<=2; i++) {
 932             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 933                                   : prog->substrs->data[i].substr);
 934             if (!sv)
 935                 continue;
 936
 937             Perl_re_printf( aTHX_
 938                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 939                 " useful=%" IVdf " utf8=%d [%s]\n",
 940                 i,
 941                 (IV)prog->substrs->data[i].min_offset,
 942                 (IV)prog->substrs->data[i].max_offset,
 943                 (IV)prog->substrs->data[i].end_shift,
 944                 BmUSEFUL(sv),
 945                 utf8_target ? 1 : 0,
 946                 SvPEEK(sv));
 947         }
 948     });
 949
 950     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 951
 952         /* ml_anch: check after \n?
 953          *
 954          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 955          * with /.*.../, these flags will have been added by the
 956          * compiler:
 957          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 958          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 959          */
 960         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 961                    && !(prog->intflags & PREGf_IMPLICIT);
 962
 963         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 964             /* we are only allowed to match at BOS or \G */
 965
 966             /* trivially reject if there's a BOS anchor and we're not at BOS.
 967              *
 968              * Note that we don't try to do a similar quick reject for
 969              * \G, since generally the caller will have calculated strpos
 970              * based on pos() and gofs, so the string is already correctly
 971              * anchored by definition; and handling the exceptions would
 972              * be too fiddly (e.g. REXEC_IGNOREPOS).
 973              */
 974             if (   strpos != strbeg
 975                 && (prog->intflags & PREGf_ANCH_SBOL))
 976             {
 977                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 978                                 "  Not at start...\n"));
 979                 goto fail;
 980             }
 981
 982             /* in the presence of an anchor, the anchored (relative to the
 983              * start of the regex) substr must also be anchored relative
 984              * to strpos. So quickly reject if substr isn't found there.
 985              * This works for \G too, because the caller will already have
 986              * subtracted gofs from pos, and gofs is the offset from the
 987              * \G to the start of the regex. For example, in /.abc\Gdef/,
 988              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 989              * caller will have set strpos=pos()-4; we look for the substr
 990              * at position pos()-4+1, which lines up with the "a" */
 991
 992             if (prog->check_offset_min == prog->check_offset_max) {
 993                 /* Substring at constant offset from beg-of-str... */
 994                 SSize_t slen = SvCUR(check);
 995                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 996
 997                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 998                     "  Looking for check substr at fixed offset %" IVdf "...\n",
 999                     (IV)prog->check_offset_min));
1000
1001                 if (SvTAIL(check)) {
1002                     /* In this case, the regex is anchored at the end too.
1003                      * Unless it's a multiline match, the lengths must match
1004                      * exactly, give or take a \n.  NB: slen >= 1 since
1005                      * the last char of check is \n */
1006                     if (!multiline
1007                         && (   strend - s > slen
1008                             || strend - s < slen - 1
1009                             || (strend - s == slen && strend[-1] != '\n')))
1010                     {
1011                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1012                                             "  String too long...\n"));
1013                         goto fail_finish;
1014                     }
1015                     /* Now should match s[0..slen-2] */
1016                     slen--;
1017                 }
1018                 if (slen && (strend - s < slen
1019                     || *SvPVX_const(check) != *s
1020                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1021                 {
1022                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1023                                     "  String not equal...\n"));
1024                     goto fail_finish;
1025                 }
1026
1027                 check_at = s;
1028                 goto success_at_start;
1029             }
1030         }
1031     }
1032
1033     end_shift = prog->check_end_shift;
1034
1035 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1036     if (end_shift < 0)
1037         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1038                    (IV)end_shift, RX_PRECOMP(rx));
1039 #endif
1040
1041   restart:
1042
1043     /* This is the (re)entry point of the main loop in this function.
1044      * The goal of this loop is to:
1045      * 1) find the "check" substring in the region rx_origin..strend
1046      *    (adjusted by start_shift / end_shift). If not found, reject
1047      *    immediately.
1048      * 2) If it exists, look for the "other" substr too if defined; for
1049      *    example, if the check substr maps to the anchored substr, then
1050      *    check the floating substr, and vice-versa. If not found, go
1051      *    back to (1) with rx_origin suitably incremented.
1052      * 3) If we find an rx_origin position that doesn't contradict
1053      *    either of the substrings, then check the possible additional
1054      *    constraints on rx_origin of /^.../m or a known start class.
1055      *    If these fail, then depending on which constraints fail, jump
1056      *    back to here, or to various other re-entry points further along
1057      *    that skip some of the first steps.
1058      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1059      *    substring. If the start position was determined to be at the
1060      *    beginning of the string  - so, not rejected, but not optimised,
1061      *    since we have to run regmatch from position 0 - decrement the
1062      *    BmUSEFUL() count. Otherwise increment it.
1063      */
1064
1065
1066     /* first, look for the 'check' substring */
1067
1068     {
1069         U8* start_point;
1070         U8* end_point;
1071
1072         DEBUG_OPTIMISE_MORE_r({
1073             Perl_re_printf( aTHX_
1074                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1075                 " Start shift: %" IVdf " End shift %" IVdf
1076                 " Real end Shift: %" IVdf "\n",
1077                 (IV)(rx_origin - strbeg),
1078                 (IV)prog->check_offset_min,
1079                 (IV)start_shift,
1080                 (IV)end_shift,
1081                 (IV)prog->check_end_shift);
1082         });
1083
1084         end_point = HOPBACK3(strend, end_shift, rx_origin);
1085         if (!end_point)
1086             goto fail_finish;
1087         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1088         if (!start_point)
1089             goto fail_finish;
1090
1091
1092         /* If the regex is absolutely anchored to either the start of the
1093          * string (SBOL) or to pos() (ANCH_GPOS), then
1094          * check_offset_max represents an upper bound on the string where
1095          * the substr could start. For the ANCH_GPOS case, we assume that
1096          * the caller of intuit will have already set strpos to
1097          * pos()-gofs, so in this case strpos + offset_max will still be
1098          * an upper bound on the substr.
1099          */
1100         if (!ml_anch
1101             && prog->intflags & PREGf_ANCH
1102             && prog->check_offset_max != SSize_t_MAX)
1103         {
1104             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1105             const char * const anchor =
1106                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1107             SSize_t targ_len = (char*)end_point - anchor;
1108
1109             if (check_len > targ_len) {
1110                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1111                               "Target string too short to match required substring...\n"));
1112                 goto fail_finish;
1113             }
1114
1115             /* do a bytes rather than chars comparison. It's conservative;
1116              * so it skips doing the HOP if the result can't possibly end
1117              * up earlier than the old value of end_point.
1118              */
1119             assert(anchor + check_len <= (char *)end_point);
1120             if (prog->check_offset_max + check_len < targ_len) {
1121                 end_point = HOP3lim((U8*)anchor,
1122                                 prog->check_offset_max,
1123                                 end_point - check_len
1124                             )
1125                             + check_len;
1126                 if (end_point < start_point)
1127                     goto fail_finish;
1128             }
1129         }
1130
1131         check_at = fbm_instr( start_point, end_point,
1132                       check, multiline ? FBMrf_MULTILINE : 0);
1133
1134         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1135             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1136             (IV)((char*)start_point - strbeg),
1137             (IV)((char*)end_point   - strbeg),
1138             (IV)(check_at ? check_at - strbeg : -1)
1139         ));
1140
1141         /* Update the count-of-usability, remove useless subpatterns,
1142             unshift s.  */
1143
1144         DEBUG_EXECUTE_r({
1145             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1146                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1147             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1148                               (check_at ? "Found" : "Did not find"),
1149                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1150                     ? "anchored" : "floating"),
1151                 quoted,
1152                 RE_SV_TAIL(check),
1153                 (check_at ? " at offset " : "...\n") );
1154         });
1155
1156         if (!check_at)
1157             goto fail_finish;
1158         /* set rx_origin to the minimum position where the regex could start
1159          * matching, given the constraint of the just-matched check substring.
1160          * But don't set it lower than previously.
1161          */
1162
1163         if (check_at - rx_origin > prog->check_offset_max)
1164             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1165         /* Finish the diagnostic message */
1166         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1167             "%ld (rx_origin now %" IVdf ")...\n",
1168             (long)(check_at - strbeg),
1169             (IV)(rx_origin - strbeg)
1170         ));
1171     }
1172
1173
1174     /* now look for the 'other' substring if defined */
1175
1176     if (prog->substrs->data[other_ix].utf8_substr
1177         || prog->substrs->data[other_ix].substr)
1178     {
1179         /* Take into account the "other" substring. */
1180         char *last, *last1;
1181         char *s;
1182         SV* must;
1183         struct reg_substr_datum *other;
1184
1185       do_other_substr:
1186         other = &prog->substrs->data[other_ix];
1187         if (!utf8_target && !other->substr) {
1188             if (!to_byte_substr(prog)) {
1189                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1190             }
1191         }
1192
1193         /* if "other" is anchored:
1194          * we've previously found a floating substr starting at check_at.
1195          * This means that the regex origin must lie somewhere
1196          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1197          * and max:                 HOP3(check_at, -check_offset_min)
1198          * (except that min will be >= strpos)
1199          * So the fixed  substr must lie somewhere between
1200          *  HOP3(min, anchored_offset)
1201          *  HOP3(max, anchored_offset) + SvCUR(substr)
1202          */
1203
1204         /* if "other" is floating
1205          * Calculate last1, the absolute latest point where the
1206          * floating substr could start in the string, ignoring any
1207          * constraints from the earlier fixed match. It is calculated
1208          * as follows:
1209          *
1210          * strend - prog->minlen (in chars) is the absolute latest
1211          * position within the string where the origin of the regex
1212          * could appear. The latest start point for the floating
1213          * substr is float_min_offset(*) on from the start of the
1214          * regex.  last1 simply combines thee two offsets.
1215          *
1216          * (*) You might think the latest start point should be
1217          * float_max_offset from the regex origin, and technically
1218          * you'd be correct. However, consider
1219          *    /a\d{2,4}bcd\w/
1220          * Here, float min, max are 3,5 and minlen is 7.
1221          * This can match either
1222          *    /a\d\dbcd\w/
1223          *    /a\d\d\dbcd\w/
1224          *    /a\d\d\d\dbcd\w/
1225          * In the first case, the regex matches minlen chars; in the
1226          * second, minlen+1, in the third, minlen+2.
1227          * In the first case, the floating offset is 3 (which equals
1228          * float_min), in the second, 4, and in the third, 5 (which
1229          * equals float_max). In all cases, the floating string bcd
1230          * can never start more than 4 chars from the end of the
1231          * string, which equals minlen - float_min. As the substring
1232          * starts to match more than float_min from the start of the
1233          * regex, it makes the regex match more than minlen chars,
1234          * and the two cancel each other out. So we can always use
1235          * float_min - minlen, rather than float_max - minlen for the
1236          * latest position in the string.
1237          *
1238          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1239          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1240          */
1241
1242         assert(prog->minlen >= other->min_offset);
1243         last1 = HOP3c(strend,
1244                         other->min_offset - prog->minlen, strbeg);
1245
1246         if (other_ix) {/* i.e. if (other-is-float) */
1247             /* last is the latest point where the floating substr could
1248              * start, *given* any constraints from the earlier fixed
1249              * match. This constraint is that the floating string starts
1250              * <= float_max_offset chars from the regex origin (rx_origin).
1251              * If this value is less than last1, use it instead.
1252              */
1253             assert(rx_origin <= last1);
1254             last =
1255                 /* this condition handles the offset==infinity case, and
1256                  * is a short-cut otherwise. Although it's comparing a
1257                  * byte offset to a char length, it does so in a safe way,
1258                  * since 1 char always occupies 1 or more bytes,
1259                  * so if a string range is  (last1 - rx_origin) bytes,
1260                  * it will be less than or equal to  (last1 - rx_origin)
1261                  * chars; meaning it errs towards doing the accurate HOP3
1262                  * rather than just using last1 as a short-cut */
1263                 (last1 - rx_origin) < other->max_offset
1264                     ? last1
1265                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1266         }
1267         else {
1268             assert(strpos + start_shift <= check_at);
1269             last = HOP4c(check_at, other->min_offset - start_shift,
1270                         strbeg, strend);
1271         }
1272
1273         s = HOP3c(rx_origin, other->min_offset, strend);
1274         if (s < other_last)     /* These positions already checked */
1275             s = other_last;
1276
1277         must = utf8_target ? other->utf8_substr : other->substr;
1278         assert(SvPOK(must));
1279         {
1280             char *from = s;
1281             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1282
1283             if (to > strend)
1284                 to = strend;
1285             if (from > to) {
1286                 s = NULL;
1287                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1288                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1289                     (IV)(from - strbeg),
1290                     (IV)(to   - strbeg)
1291                 ));
1292             }
1293             else {
1294                 s = fbm_instr(
1295                     (unsigned char*)from,
1296                     (unsigned char*)to,
1297                     must,
1298                     multiline ? FBMrf_MULTILINE : 0
1299                 );
1300                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1301                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1302                     (IV)(from - strbeg),
1303                     (IV)(to   - strbeg),
1304                     (IV)(s ? s - strbeg : -1)
1305                 ));
1306             }
1307         }
1308
1309         DEBUG_EXECUTE_r({
1310             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1311                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1312             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1313                 s ? "Found" : "Contradicts",
1314                 other_ix ? "floating" : "anchored",
1315                 quoted, RE_SV_TAIL(must));
1316         });
1317
1318
1319         if (!s) {
1320             /* last1 is latest possible substr location. If we didn't
1321              * find it before there, we never will */
1322             if (last >= last1) {
1323                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1324                                         "; giving up...\n"));
1325                 goto fail_finish;
1326             }
1327
1328             /* try to find the check substr again at a later
1329              * position. Maybe next time we'll find the "other" substr
1330              * in range too */
1331             other_last = HOP3c(last, 1, strend) /* highest failure */;
1332             rx_origin =
1333                 other_ix /* i.e. if other-is-float */
1334                     ? HOP3c(rx_origin, 1, strend)
1335                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1336             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1337                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1338                 (other_ix ? "floating" : "anchored"),
1339                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1340                 (IV)(rx_origin - strbeg)
1341             ));
1342             goto restart;
1343         }
1344         else {
1345             if (other_ix) { /* if (other-is-float) */
1346                 /* other_last is set to s, not s+1, since its possible for
1347                  * a floating substr to fail first time, then succeed
1348                  * second time at the same floating position; e.g.:
1349                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1350                  * The first time round, anchored and float match at
1351                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1352                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1353                  */
1354                 other_last = s;
1355             }
1356             else {
1357                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1358                 other_last = HOP3c(s, 1, strend);
1359             }
1360             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1361                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1362                   (long)(s - strbeg),
1363                 (IV)(rx_origin - strbeg)
1364               ));
1365
1366         }
1367     }
1368     else {
1369         DEBUG_OPTIMISE_MORE_r(
1370             Perl_re_printf( aTHX_
1371                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1372                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1373                 " strend:%" IVdf "\n",
1374                 (IV)prog->check_offset_min,
1375                 (IV)prog->check_offset_max,
1376                 (IV)(check_at-strbeg),
1377                 (IV)(rx_origin-strbeg),
1378                 (IV)(rx_origin-check_at),
1379                 (IV)(strend-strbeg)
1380             )
1381         );
1382     }
1383
1384   postprocess_substr_matches:
1385
1386     /* handle the extra constraint of /^.../m if present */
1387
1388     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1389         char *s;
1390
1391         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1392                         "  looking for /^/m anchor"));
1393
1394         /* we have failed the constraint of a \n before rx_origin.
1395          * Find the next \n, if any, even if it's beyond the current
1396          * anchored and/or floating substrings. Whether we should be
1397          * scanning ahead for the next \n or the next substr is debatable.
1398          * On the one hand you'd expect rare substrings to appear less
1399          * often than \n's. On the other hand, searching for \n means
1400          * we're effectively flipping between check_substr and "\n" on each
1401          * iteration as the current "rarest" string candidate, which
1402          * means for example that we'll quickly reject the whole string if
1403          * hasn't got a \n, rather than trying every substr position
1404          * first
1405          */
1406
1407         s = HOP3c(strend, - prog->minlen, strpos);
1408         if (s <= rx_origin ||
1409             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1410         {
1411             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1412                             "  Did not find /%s^%s/m...\n",
1413                             PL_colors[0], PL_colors[1]));
1414             goto fail_finish;
1415         }
1416
1417         /* earliest possible origin is 1 char after the \n.
1418          * (since *rx_origin == '\n', it's safe to ++ here rather than
1419          * HOP(rx_origin, 1)) */
1420         rx_origin++;
1421
1422         if (prog->substrs->check_ix == 0  /* check is anchored */
1423             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1424         {
1425             /* Position contradicts check-string; either because
1426              * check was anchored (and thus has no wiggle room),
1427              * or check was float and rx_origin is above the float range */
1428             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1429                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1430                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1431             goto restart;
1432         }
1433
1434         /* if we get here, the check substr must have been float,
1435          * is in range, and we may or may not have had an anchored
1436          * "other" substr which still contradicts */
1437         assert(prog->substrs->check_ix); /* check is float */
1438
1439         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1440             /* whoops, the anchored "other" substr exists, so we still
1441              * contradict. On the other hand, the float "check" substr
1442              * didn't contradict, so just retry the anchored "other"
1443              * substr */
1444             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1445                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1446                 PL_colors[0], PL_colors[1],
1447                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1448                 (IV)(rx_origin - strbeg)
1449             ));
1450             goto do_other_substr;
1451         }
1452
1453         /* success: we don't contradict the found floating substring
1454          * (and there's no anchored substr). */
1455         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1456             "  Found /%s^%s/m with rx_origin %ld...\n",
1457             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1458     }
1459     else {
1460         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1461             "  (multiline anchor test skipped)\n"));
1462     }
1463
1464   success_at_start:
1465
1466
1467     /* if we have a starting character class, then test that extra constraint.
1468      * (trie stclasses are too expensive to use here, we are better off to
1469      * leave it to regmatch itself) */
1470
1471     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1472         const U8* const str = (U8*)STRING(progi->regstclass);
1473
1474         /* XXX this value could be pre-computed */
1475         const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1476                     ?  (reginfo->is_utf8_pat
1477                         ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1478                         : (SSize_t)STR_LEN(progi->regstclass))
1479                     : 1);
1480         char * endpos;
1481         char *s;
1482         /* latest pos that a matching float substr constrains rx start to */
1483         char *rx_max_float = NULL;
1484
1485         /* if the current rx_origin is anchored, either by satisfying an
1486          * anchored substring constraint, or a /^.../m constraint, then we
1487          * can reject the current origin if the start class isn't found
1488          * at the current position. If we have a float-only match, then
1489          * rx_origin is constrained to a range; so look for the start class
1490          * in that range. if neither, then look for the start class in the
1491          * whole rest of the string */
1492
1493         /* XXX DAPM it's not clear what the minlen test is for, and why
1494          * it's not used in the floating case. Nothing in the test suite
1495          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1496          * Here are some old comments, which may or may not be correct:
1497          *
1498          *   minlen == 0 is possible if regstclass is \b or \B,
1499          *   and the fixed substr is ''$.
1500          *   Since minlen is already taken into account, rx_origin+1 is
1501          *   before strend; accidentally, minlen >= 1 guaranties no false
1502          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1503          *   0) below assumes that regstclass does not come from lookahead...
1504          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1505          *   This leaves EXACTF-ish only, which are dealt with in
1506          *   find_byclass().
1507          */
1508
1509         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1510             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1511         else if (prog->float_substr || prog->float_utf8) {
1512             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1513             endpos = HOP3clim(rx_max_float, cl_l, strend);
1514         }
1515         else
1516             endpos= strend;
1517
1518         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1519             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1520             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1521               (IV)start_shift, (IV)(check_at - strbeg),
1522               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1523
1524         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1525                             reginfo);
1526         if (!s) {
1527             if (endpos == strend) {
1528                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1529                                 "  Could not match STCLASS...\n") );
1530                 goto fail;
1531             }
1532             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1533                                "  This position contradicts STCLASS...\n") );
1534             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1535                         && !(prog->intflags & PREGf_IMPLICIT))
1536                 goto fail;
1537
1538             /* Contradict one of substrings */
1539             if (prog->anchored_substr || prog->anchored_utf8) {
1540                 if (prog->substrs->check_ix == 1) { /* check is float */
1541                     /* Have both, check_string is floating */
1542                     assert(rx_origin + start_shift <= check_at);
1543                     if (rx_origin + start_shift != check_at) {
1544                         /* not at latest position float substr could match:
1545                          * Recheck anchored substring, but not floating.
1546                          * The condition above is in bytes rather than
1547                          * chars for efficiency. It's conservative, in
1548                          * that it errs on the side of doing 'goto
1549                          * do_other_substr'. In this case, at worst,
1550                          * an extra anchored search may get done, but in
1551                          * practice the extra fbm_instr() is likely to
1552                          * get skipped anyway. */
1553                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1554                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1555                             (long)(other_last - strbeg),
1556                             (IV)(rx_origin - strbeg)
1557                         ));
1558                         goto do_other_substr;
1559                     }
1560                 }
1561             }
1562             else {
1563                 /* float-only */
1564
1565                 if (ml_anch) {
1566                     /* In the presence of ml_anch, we might be able to
1567                      * find another \n without breaking the current float
1568                      * constraint. */
1569
1570                     /* strictly speaking this should be HOP3c(..., 1, ...),
1571                      * but since we goto a block of code that's going to
1572                      * search for the next \n if any, its safe here */
1573                     rx_origin++;
1574                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1575                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1576                               PL_colors[0], PL_colors[1],
1577                               (long)(rx_origin - strbeg)) );
1578                     goto postprocess_substr_matches;
1579                 }
1580
1581                 /* strictly speaking this can never be true; but might
1582                  * be if we ever allow intuit without substrings */
1583                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1584                     goto fail;
1585
1586                 rx_origin = rx_max_float;
1587             }
1588
1589             /* at this point, any matching substrings have been
1590              * contradicted. Start again... */
1591
1592             rx_origin = HOP3c(rx_origin, 1, strend);
1593
1594             /* uses bytes rather than char calculations for efficiency.
1595              * It's conservative: it errs on the side of doing 'goto restart',
1596              * where there is code that does a proper char-based test */
1597             if (rx_origin + start_shift + end_shift > strend) {
1598                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1599                                        "  Could not match STCLASS...\n") );
1600                 goto fail;
1601             }
1602             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1603                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1604                 (prog->substrs->check_ix ? "floating" : "anchored"),
1605                 (long)(rx_origin + start_shift - strbeg),
1606                 (IV)(rx_origin - strbeg)
1607             ));
1608             goto restart;
1609         }
1610
1611         /* Success !!! */
1612
1613         if (rx_origin != s) {
1614             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1615                         "  By STCLASS: moving %ld --> %ld\n",
1616                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1617                    );
1618         }
1619         else {
1620             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1621                                   "  Does not contradict STCLASS...\n");
1622                    );
1623         }
1624     }
1625
1626     /* Decide whether using the substrings helped */
1627
1628     if (rx_origin != strpos) {
1629         /* Fixed substring is found far enough so that the match
1630            cannot start at strpos. */
1631
1632         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1633         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1634     }
1635     else {
1636         /* The found rx_origin position does not prohibit matching at
1637          * strpos, so calling intuit didn't gain us anything. Decrement
1638          * the BmUSEFUL() count on the check substring, and if we reach
1639          * zero, free it.  */
1640         if (!(prog->intflags & PREGf_NAUGHTY)
1641             && (utf8_target ? (
1642                 prog->check_utf8                /* Could be deleted already */
1643                 && --BmUSEFUL(prog->check_utf8) < 0
1644                 && (prog->check_utf8 == prog->float_utf8)
1645             ) : (
1646                 prog->check_substr              /* Could be deleted already */
1647                 && --BmUSEFUL(prog->check_substr) < 0
1648                 && (prog->check_substr == prog->float_substr)
1649             )))
1650         {
1651             /* If flags & SOMETHING - do not do it many times on the same match */
1652             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1653             /* XXX Does the destruction order has to change with utf8_target? */
1654             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1655             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1656             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1657             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1658             check = NULL;                       /* abort */
1659             /* XXXX This is a remnant of the old implementation.  It
1660                     looks wasteful, since now INTUIT can use many
1661                     other heuristics. */
1662             prog->extflags &= ~RXf_USE_INTUIT;
1663         }
1664     }
1665
1666     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1667             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1668              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1669
1670     return rx_origin;
1671
1672   fail_finish:                          /* Substring not found */
1673     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1674         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1675   fail:
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1677                           PL_colors[4], PL_colors[5]));
1678     return NULL;
1679 }
1680
1681
1682 #define DECL_TRIE_TYPE(scan) \
1683     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1684                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1685                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1686                     trie_type = ((scan->flags == EXACT)                             \
1687                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1688                                  : (scan->flags == EXACTL)                          \
1689                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1690                                     : (scan->flags == EXACTFAA)                     \
1691                                       ? (utf8_target                                \
1692                                          ? trie_utf8_exactfa_fold                   \
1693                                          : trie_latin_utf8_exactfa_fold)            \
1694                                       : (scan->flags == EXACTFLU8                   \
1695                                          ? (utf8_target                             \
1696                                            ? trie_flu8                              \
1697                                            : trie_flu8_latin)                       \
1698                                          : (utf8_target                             \
1699                                            ? trie_utf8_fold                         \
1700                                            : trie_latin_utf8_fold)))
1701
1702 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1703  * 'foldbuf+sizeof(foldbuf)' */
1704 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1705 STMT_START {                                                                        \
1706     STRLEN skiplen;                                                                 \
1707     U8 flags = FOLD_FLAGS_FULL;                                                     \
1708     switch (trie_type) {                                                            \
1709     case trie_flu8:                                                                 \
1710         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1711         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1712             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1713         }                                                                           \
1714         goto do_trie_utf8_fold;                                                     \
1715     case trie_utf8_exactfa_fold:                                                    \
1716         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1717         /* FALLTHROUGH */                                                           \
1718     case trie_utf8_fold:                                                            \
1719       do_trie_utf8_fold:                                                            \
1720         if ( foldlen>0 ) {                                                          \
1721             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1722             foldlen -= len;                                                         \
1723             uscan += len;                                                           \
1724             len=0;                                                                  \
1725         } else {                                                                    \
1726             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1727                                                                             flags); \
1728             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1729             skiplen = UVCHR_SKIP( uvc );                                            \
1730             foldlen -= skiplen;                                                     \
1731             uscan = foldbuf + skiplen;                                              \
1732         }                                                                           \
1733         break;                                                                      \
1734     case trie_flu8_latin:                                                           \
1735         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1736         goto do_trie_latin_utf8_fold;                                               \
1737     case trie_latin_utf8_exactfa_fold:                                              \
1738         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1739         /* FALLTHROUGH */                                                           \
1740     case trie_latin_utf8_fold:                                                      \
1741       do_trie_latin_utf8_fold:                                                      \
1742         if ( foldlen>0 ) {                                                          \
1743             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1744             foldlen -= len;                                                         \
1745             uscan += len;                                                           \
1746             len=0;                                                                  \
1747         } else {                                                                    \
1748             len = 1;                                                                \
1749             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1750             skiplen = UVCHR_SKIP( uvc );                                            \
1751             foldlen -= skiplen;                                                     \
1752             uscan = foldbuf + skiplen;                                              \
1753         }                                                                           \
1754         break;                                                                      \
1755     case trie_utf8l:                                                                \
1756         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1757         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1758             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1759         }                                                                           \
1760         /* FALLTHROUGH */                                                           \
1761     case trie_utf8:                                                                 \
1762         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1763         break;                                                                      \
1764     case trie_plain:                                                                \
1765         uvc = (UV)*uc;                                                              \
1766         len = 1;                                                                    \
1767     }                                                                               \
1768     if (uvc < 256) {                                                                \
1769         charid = trie->charmap[ uvc ];                                              \
1770     }                                                                               \
1771     else {                                                                          \
1772         charid = 0;                                                                 \
1773         if (widecharmap) {                                                          \
1774             SV** const svpp = hv_fetch(widecharmap,                                 \
1775                         (char*)&uvc, sizeof(UV), 0);                                \
1776             if (svpp)                                                               \
1777                 charid = (U16)SvIV(*svpp);                                          \
1778         }                                                                           \
1779     }                                                                               \
1780 } STMT_END
1781
1782 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1783     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1784                 startpos, doutf8, depth)
1785
1786 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1787     STMT_START {                                            \
1788         while (s < strend) {                                \
1789             CODE                                            \
1790             s += ((UTF8)                                    \
1791                   ? UTF8_SAFE_SKIP(s, reginfo->strend)      \
1792                   : 1);                                     \
1793         }                                                   \
1794     } STMT_END
1795
1796 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1797     STMT_START {                                            \
1798         while (s < strend) {                                \
1799             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1800         }                                                   \
1801     } STMT_END
1802
1803 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1804     if (COND) {                                                \
1805         FBC_CHECK_AND_TRY                                      \
1806         s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
1807         previous_occurrence_end = s;                           \
1808     }                                                          \
1809     else {                                                     \
1810         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1811     }
1812
1813 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1814     if (utf8_target) {                                         \
1815         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1816     }                                                          \
1817     else {                                                     \
1818         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1819     }
1820
1821 /* We keep track of where the next character should start after an occurrence
1822  * of the one we're looking for.  Knowing that, we can see right away if the
1823  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1824  * don't accept the 2nd and succeeding adjacent occurrences */
1825 #define FBC_CHECK_AND_TRY                                           \
1826         if (   (   doevery                                          \
1827                 || s != previous_occurrence_end)                    \
1828             && (   reginfo->intuit                                  \
1829                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1830         {                                                           \
1831             goto got_it;                                            \
1832         }
1833
1834
1835 /* This differs from the above macros in that it calls a function which returns
1836  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1837  * there is no such occurrence. */
1838 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1839     while (s < strend) {                                    \
1840         s = (f);                                            \
1841         if (s >= strend) {                                  \
1842             break;                                          \
1843         }                                                   \
1844                                                             \
1845         FBC_CHECK_AND_TRY                                   \
1846         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1847         previous_occurrence_end = s;                        \
1848     }
1849
1850 /* This differs from the above macros in that it is passed a single byte that
1851  * is known to begin the next occurrence of the thing being looked for in 's'.
1852  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1853  * at that position. */
1854 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
1855     while (s < strend) {                                    \
1856         s = (char *) memchr(s, byte, strend -s);            \
1857         if (s == NULL) {                                    \
1858             s = (char *) strend;                            \
1859             break;                                          \
1860         }                                                   \
1861                                                             \
1862         if (COND) {                                         \
1863             FBC_CHECK_AND_TRY                               \
1864             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1865             previous_occurrence_end = s;                    \
1866         }                                                   \
1867         else {                                              \
1868             s += UTF8SKIP(s);                               \
1869         }                                                   \
1870     }
1871
1872 /* The three macros below are slightly different versions of the same logic.
1873  *
1874  * The first is for /a and /aa when the target string is UTF-8.  This can only
1875  * match ascii, but it must advance based on UTF-8.   The other two handle the
1876  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1877  * for the boundary (or non-boundary) between a word and non-word character.
1878  * The utf8 and non-utf8 cases have the same logic, but the details must be
1879  * different.  Find the "wordness" of the character just prior to this one, and
1880  * compare it with the wordness of this one.  If they differ, we have a
1881  * boundary.  At the beginning of the string, pretend that the previous
1882  * character was a new-line.
1883  *
1884  * All these macros uncleanly have side-effects with each other and outside
1885  * variables.  So far it's been too much trouble to clean-up
1886  *
1887  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1888  *               a word character or not.
1889  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1890  *               word/non-word
1891  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1892  *
1893  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1894  * are looking for a boundary or for a non-boundary.  If we are looking for a
1895  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1896  * see if this tentative match actually works, and if so, to quit the loop
1897  * here.  And vice-versa if we are looking for a non-boundary.
1898  *
1899  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1900  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1901  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1902  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1903  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1904  * complement.  But in that branch we complement tmp, meaning that at the
1905  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1906  * which means at the top of the loop in the next iteration, it is
1907  * TEST_NON_UTF8(s-1) */
1908 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1909     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1910     tmp = TEST_NON_UTF8(tmp);                                                  \
1911     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1912         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1913             tmp = !tmp;                                                        \
1914             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1915         }                                                                      \
1916         else {                                                                 \
1917             IF_FAIL;                                                           \
1918         }                                                                      \
1919     );                                                                         \
1920
1921 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1922  * TEST_UTF8 is a macro that for the same input code points returns identically
1923  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1924 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1925     if (s == reginfo->strbeg) {                                                \
1926         tmp = '\n';                                                            \
1927     }                                                                          \
1928     else { /* Back-up to the start of the previous character */                \
1929         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1930         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1931                                                        0, UTF8_ALLOW_DEFAULT); \
1932     }                                                                          \
1933     tmp = TEST_UV(tmp);                                                        \
1934     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1935         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1936             tmp = !tmp;                                                        \
1937             IF_SUCCESS;                                                        \
1938         }                                                                      \
1939         else {                                                                 \
1940             IF_FAIL;                                                           \
1941         }                                                                      \
1942     );
1943
1944 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1945  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1946  * macros below */
1947 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1948     if (utf8_target) {                                                         \
1949         UTF8_CODE                                                              \
1950     }                                                                          \
1951     else {  /* Not utf8 */                                                     \
1952         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1953         tmp = TEST_NON_UTF8(tmp);                                              \
1954         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
1955             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1956                 IF_SUCCESS;                                                    \
1957                 tmp = !tmp;                                                    \
1958             }                                                                  \
1959             else {                                                             \
1960                 IF_FAIL;                                                       \
1961             }                                                                  \
1962         );                                                                     \
1963     }                                                                          \
1964     /* Here, things have been set up by the previous code so that tmp is the   \
1965      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1966      * utf8ness of the target).  We also have to check if this matches against \
1967      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1968      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1969      * string */                                                               \
1970     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1971         IF_SUCCESS;                                                            \
1972     }                                                                          \
1973     else {                                                                     \
1974         IF_FAIL;                                                               \
1975     }
1976
1977 /* This is the macro to use when we want to see if something that looks like it
1978  * could match, actually does, and if so exits the loop.  It needs to be used
1979  * only for bounds checking macros, as it allows for matching beyond the end of
1980  * string (which should be zero length without having to look at the string
1981  * contents) */
1982 #define REXEC_FBC_TRYIT                                                     \
1983     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
1984         goto got_it
1985
1986 /* The only difference between the BOUND and NBOUND cases is that
1987  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1988  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1989  * with the other one being empty (PLACEHOLDER is defined as empty).
1990  *
1991  * The TEST_FOO parameters are for operating on different forms of input, but
1992  * all should be ones that return identically for the same underlying code
1993  * points */
1994 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1995     FBC_BOUND_COMMON(                                                          \
1996           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1997           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1998
1999 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2000     FBC_BOUND_COMMON(                                                          \
2001             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2002             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2003
2004 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2005     FBC_BOUND_COMMON(                                                          \
2006           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2007           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2008
2009 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2010     FBC_BOUND_COMMON(                                                          \
2011             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2012             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2013
2014 #ifdef DEBUGGING
2015 static IV
2016 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2017   IV cp_out = _invlist_search(invlist, cp_in);
2018   assert(cp_out >= 0);
2019   return cp_out;
2020 }
2021 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2022         invmap[S_get_break_val_cp_checked(invlist, cp)]
2023 #else
2024 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2025         invmap[_invlist_search(invlist, cp)]
2026 #endif
2027
2028 /* Takes a pointer to an inversion list, a pointer to its corresponding
2029  * inversion map, and a code point, and returns the code point's value
2030  * according to the two arrays.  It assumes that all code points have a value.
2031  * This is used as the base macro for macros for particular properties */
2032 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2033         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2034
2035 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2036  * of a code point, returning the value for the first code point in the string.
2037  * And it takes the particular macro name that finds the desired value given a
2038  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2039 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2040              (__ASSERT_(pos < strend)                                          \
2041                  /* Note assumes is valid UTF-8 */                             \
2042              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2043
2044 /* Returns the GCB value for the input code point */
2045 #define getGCB_VAL_CP(cp)                                                      \
2046           _generic_GET_BREAK_VAL_CP(                                           \
2047                                     PL_GCB_invlist,                            \
2048                                     _Perl_GCB_invmap,                          \
2049                                     (cp))
2050
2051 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2052  * bounded by pos and strend */
2053 #define getGCB_VAL_UTF8(pos, strend)                                           \
2054     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2055
2056 /* Returns the LB value for the input code point */
2057 #define getLB_VAL_CP(cp)                                                       \
2058           _generic_GET_BREAK_VAL_CP(                                           \
2059                                     PL_LB_invlist,                             \
2060                                     _Perl_LB_invmap,                           \
2061                                     (cp))
2062
2063 /* Returns the LB value for the first code point in the UTF-8 encoded string
2064  * bounded by pos and strend */
2065 #define getLB_VAL_UTF8(pos, strend)                                            \
2066     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2067
2068
2069 /* Returns the SB value for the input code point */
2070 #define getSB_VAL_CP(cp)                                                       \
2071           _generic_GET_BREAK_VAL_CP(                                           \
2072                                     PL_SB_invlist,                             \
2073                                     _Perl_SB_invmap,                     \
2074                                     (cp))
2075
2076 /* Returns the SB value for the first code point in the UTF-8 encoded string
2077  * bounded by pos and strend */
2078 #define getSB_VAL_UTF8(pos, strend)                                            \
2079     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2080
2081 /* Returns the WB value for the input code point */
2082 #define getWB_VAL_CP(cp)                                                       \
2083           _generic_GET_BREAK_VAL_CP(                                           \
2084                                     PL_WB_invlist,                             \
2085                                     _Perl_WB_invmap,                         \
2086                                     (cp))
2087
2088 /* Returns the WB value for the first code point in the UTF-8 encoded string
2089  * bounded by pos and strend */
2090 #define getWB_VAL_UTF8(pos, strend)                                            \
2091     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2092
2093 /* We know what class REx starts with.  Try to find this position... */
2094 /* if reginfo->intuit, its a dryrun */
2095 /* annoyingly all the vars in this routine have different names from their counterparts
2096    in regmatch. /grrr */
2097 STATIC char *
2098 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2099     const char *strend, regmatch_info *reginfo)
2100 {
2101     dVAR;
2102
2103     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2104     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2105
2106     char *pat_string;   /* The pattern's exactish string */
2107     char *pat_end;          /* ptr to end char of pat_string */
2108     re_fold_t folder;   /* Function for computing non-utf8 folds */
2109     const U8 *fold_array;   /* array for folding ords < 256 */
2110     STRLEN ln;
2111     STRLEN lnc;
2112     U8 c1;
2113     U8 c2;
2114     char *e = NULL;
2115
2116     /* In some cases we accept only the first occurence of 'x' in a sequence of
2117      * them.  This variable points to just beyond the end of the previous
2118      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2119      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2120      * hop back.) */
2121     char * previous_occurrence_end = 0;
2122
2123     I32 tmp;            /* Scratch variable */
2124     const bool utf8_target = reginfo->is_utf8_target;
2125     UV utf8_fold_flags = 0;
2126     const bool is_utf8_pat = reginfo->is_utf8_pat;
2127     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2128                                    with a result inverts that result, as 0^1 =
2129                                    1 and 1^1 = 0 */
2130     _char_class_number classnum;
2131
2132     RXi_GET_DECL(prog,progi);
2133
2134     PERL_ARGS_ASSERT_FIND_BYCLASS;
2135
2136     /* We know what class it must start with. */
2137     switch (OP(c)) {
2138     case ANYOFPOSIXL:
2139     case ANYOFL:
2140         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2141
2142         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2143             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2144         }
2145
2146         /* FALLTHROUGH */
2147     case ANYOFD:
2148     case ANYOF:
2149         if (utf8_target) {
2150             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2151                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2152         }
2153         else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2154             /* We know that s is in the bitmap range since the target isn't
2155              * UTF-8, so what happens for out-of-range values is not relevant,
2156              * so exclude that from the flags */
2157             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2158         }
2159         else {
2160             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2161         }
2162         break;
2163
2164     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2165         /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
2166          * use 0 */
2167         REXEC_FBC_FIND_NEXT_SCAN(0,
2168          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2169                                    (U8) ARG(c), FLAGS(c)));
2170         break;
2171
2172     case NANYOFM:   /* UTF-8ness does matter because can match UTF-8 variants.
2173                      */
2174         REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
2175          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2176                                    (U8) ARG(c), FLAGS(c)));
2177         break;
2178
2179     case ANYOFH:
2180         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2181             REXEC_FBC_CLASS_SCAN(TRUE,
2182                   (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2183                    && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2184         }
2185         break;
2186
2187     case ANYOFHb:
2188         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2189
2190             /* We know what the first byte of any matched string should be */
2191             U8 first_byte = FLAGS(c);
2192
2193             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2194                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2195         }
2196         break;
2197
2198     case ANYOFHr:
2199         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2200             REXEC_FBC_CLASS_SCAN(TRUE,
2201                   (   inRANGE(NATIVE_UTF8_TO_I8(*s),
2202                               LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2203                               HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2204                    && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2205         }
2206         break;
2207
2208     case ANYOFHs:
2209         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2210             REXEC_FBC_CLASS_SCAN(TRUE,
2211                   (   strend -s >= FLAGS(c)
2212                    && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
2213                    && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2214         }
2215         break;
2216
2217     case ANYOFR:
2218         if (utf8_target) {
2219             REXEC_FBC_CLASS_SCAN(TRUE,
2220                   (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2221                    && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2222                                                     (U8 *) strend,
2223                                                     NULL),
2224                                   ANYOFRbase(c), ANYOFRdelta(c))));
2225         }
2226         else {
2227             REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
2228                                                ANYOFRbase(c), ANYOFRdelta(c)));
2229         }
2230         break;
2231
2232     case ANYOFRb:
2233         if (utf8_target) {
2234
2235             /* We know what the first byte of any matched string should be */
2236             U8 first_byte = FLAGS(c);
2237
2238             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2239                       withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2240                                                     (U8 *) strend,
2241                                                     NULL),
2242                                   ANYOFRbase(c), ANYOFRdelta(c)));
2243         }
2244         else {
2245             REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
2246                                                ANYOFRbase(c), ANYOFRdelta(c)));
2247         }
2248         break;
2249
2250     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2251         assert(! is_utf8_pat);
2252         /* FALLTHROUGH */
2253     case EXACTFAA:
2254         if (is_utf8_pat) {
2255             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2256                              |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
2257             goto do_exactf_utf8;
2258         }
2259         else if (utf8_target) {
2260
2261             /* Here, and elsewhere in this file, the reason we can't consider a
2262              * non-UTF-8 pattern already folded in the presence of a UTF-8
2263              * target is because any MICRO SIGN in the pattern won't be folded.
2264              * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
2265              * can consider a non-UTF-8 pattern folded when matching a
2266              * non-UTF-8 target */
2267             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2268             goto do_exactf_utf8;
2269         }
2270
2271         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2272          * which these functions don't handle anyway */
2273         fold_array = PL_fold_latin1;
2274         folder = foldEQ_latin1_s2_folded;
2275         goto do_exactf_non_utf8;
2276
2277     case EXACTF:   /* This node only generated for non-utf8 patterns */
2278         assert(! is_utf8_pat);
2279         if (utf8_target) {
2280             goto do_exactf_utf8;
2281         }
2282         fold_array = PL_fold;
2283         folder = foldEQ;
2284         goto do_exactf_non_utf8;
2285
2286     case EXACTFL:
2287         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2288         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2289             utf8_fold_flags = FOLDEQ_LOCALE;
2290             goto do_exactf_utf8;
2291         }
2292         fold_array = PL_fold_locale;
2293         folder = foldEQ_locale;
2294         goto do_exactf_non_utf8;
2295
2296     case EXACTFUP:      /* Problematic even though pattern isn't UTF-8.  Use
2297                            full functionality normally not done except for
2298                            UTF-8 */
2299         assert(! is_utf8_pat);
2300         goto do_exactf_utf8;
2301
2302     case EXACTFLU8:
2303             if (! utf8_target) {    /* All code points in this node require
2304                                        UTF-8 to express.  */
2305                 break;
2306             }
2307             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2308                                              | FOLDEQ_S2_FOLDS_SANE;
2309             goto do_exactf_utf8;
2310
2311     case EXACTFU_REQ8:
2312         if (! utf8_target) {
2313             break;
2314         }
2315         assert(is_utf8_pat);
2316         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2317         goto do_exactf_utf8;
2318
2319     case EXACTFU:
2320         if (is_utf8_pat || utf8_target) {
2321             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2322             goto do_exactf_utf8;
2323         }
2324
2325         /* Any 'ss' in the pattern should have been replaced by regcomp,
2326          * so we don't have to worry here about this single special case
2327          * in the Latin1 range */
2328         fold_array = PL_fold_latin1;
2329         folder = foldEQ_latin1_s2_folded;
2330
2331         /* FALLTHROUGH */
2332
2333       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2334                            are no glitches with fold-length differences
2335                            between the target string and pattern */
2336
2337         /* The idea in the non-utf8 EXACTF* cases is to first find the
2338          * first character of the EXACTF* node and then, if necessary,
2339          * case-insensitively compare the full text of the node.  c1 is the
2340          * first character.  c2 is its fold.  This logic will not work for
2341          * Unicode semantics and the german sharp ss, which hence should
2342          * not be compiled into a node that gets here. */
2343         pat_string = STRINGs(c);
2344         ln  = STR_LENs(c);      /* length to match in octets/bytes */
2345
2346         /* We know that we have to match at least 'ln' bytes (which is the
2347          * same as characters, since not utf8).  If we have to match 3
2348          * characters, and there are only 2 availabe, we know without
2349          * trying that it will fail; so don't start a match past the
2350          * required minimum number from the far end */
2351         e = HOP3c(strend, -((SSize_t)ln), s);
2352         if (e < s)
2353             break;
2354
2355         c1 = *pat_string;
2356         c2 = fold_array[c1];
2357         if (c1 == c2) { /* If char and fold are the same */
2358             while (s <= e) {
2359                 s = (char *) memchr(s, c1, e + 1 - s);
2360                 if (s == NULL) {
2361                     break;
2362                 }
2363
2364                 /* Check that the rest of the node matches */
2365                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2366                     && (reginfo->intuit || regtry(reginfo, &s)) )
2367                 {
2368                     goto got_it;
2369                 }
2370                 s++;
2371             }
2372         }
2373         else {
2374             U8 bits_differing = c1 ^ c2;
2375
2376             /* If the folds differ in one bit position only, we can mask to
2377              * match either of them, and can use this faster find method.  Both
2378              * ASCII and EBCDIC tend to have their case folds differ in only
2379              * one position, so this is very likely */
2380             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2381                 bits_differing = ~ bits_differing;
2382                 while (s <= e) {
2383                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2384                                         (c1 & bits_differing), bits_differing);
2385                     if (s > e) {
2386                         break;
2387                     }
2388
2389                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2390                         && (reginfo->intuit || regtry(reginfo, &s)) )
2391                     {
2392                         goto got_it;
2393                     }
2394                     s++;
2395                 }
2396             }
2397             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2398                        should actually happen only in EXACTFL nodes */
2399                 while (s <= e) {
2400                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2401                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2402                         && (reginfo->intuit || regtry(reginfo, &s)) )
2403                     {
2404                         goto got_it;
2405                     }
2406                     s++;
2407                 }
2408             }
2409         }
2410         break;
2411
2412       do_exactf_utf8:
2413       {
2414         unsigned expansion;
2415
2416         /* If one of the operands is in utf8, we can't use the simpler folding
2417          * above, due to the fact that many different characters can have the
2418          * same fold, or portion of a fold, or different- length fold */
2419         pat_string = STRINGs(c);
2420         ln  = STR_LENs(c);      /* length to match in octets/bytes */
2421         pat_end = pat_string + ln;
2422         lnc = is_utf8_pat       /* length to match in characters */
2423                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2424                 : ln;
2425
2426         /* We have 'lnc' characters to match in the pattern, but because of
2427          * multi-character folding, each character in the target can match
2428          * up to 3 characters (Unicode guarantees it will never exceed
2429          * this) if it is utf8-encoded; and up to 2 if not (based on the
2430          * fact that the Latin 1 folds are already determined, and the
2431          * only multi-char fold in that range is the sharp-s folding to
2432          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2433          * string character.  Adjust lnc accordingly, rounding up, so that
2434          * if we need to match at least 4+1/3 chars, that really is 5. */
2435         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2436         lnc = (lnc + expansion - 1) / expansion;
2437
2438         /* As in the non-UTF8 case, if we have to match 3 characters, and
2439          * only 2 are left, it's guaranteed to fail, so don't start a
2440          * match that would require us to go beyond the end of the string
2441          */
2442         e = HOP3c(strend, -((SSize_t)lnc), s);
2443
2444         /* XXX Note that we could recalculate e to stop the loop earlier,
2445          * as the worst case expansion above will rarely be met, and as we
2446          * go along we would usually find that e moves further to the left.
2447          * This would happen only after we reached the point in the loop
2448          * where if there were no expansion we should fail.  Unclear if
2449          * worth the expense */
2450
2451         while (s <= e) {
2452             char *my_strend= (char *)strend;
2453             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2454                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2455                 && (reginfo->intuit || regtry(reginfo, &s)) )
2456             {
2457                 goto got_it;
2458             }
2459             s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2460         }
2461         break;
2462     }
2463
2464     case BOUNDL:
2465         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2466         if (FLAGS(c) != TRADITIONAL_BOUND) {
2467             if (! IN_UTF8_CTYPE_LOCALE) {
2468                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2469                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2470             }
2471             goto do_boundu;
2472         }
2473
2474         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2475         break;
2476
2477     case NBOUNDL:
2478         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2479         if (FLAGS(c) != TRADITIONAL_BOUND) {
2480             if (! IN_UTF8_CTYPE_LOCALE) {
2481                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2482                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2483             }
2484             goto do_nboundu;
2485         }
2486
2487         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2488         break;
2489
2490     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2491                    meaning */
2492         assert(FLAGS(c) == TRADITIONAL_BOUND);
2493
2494         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2495         break;
2496
2497     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2498                    meaning */
2499         assert(FLAGS(c) == TRADITIONAL_BOUND);
2500
2501         FBC_BOUND_A(isWORDCHAR_A);
2502         break;
2503
2504     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2505                    meaning */
2506         assert(FLAGS(c) == TRADITIONAL_BOUND);
2507
2508         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2509         break;
2510
2511     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2512                    meaning */
2513         assert(FLAGS(c) == TRADITIONAL_BOUND);
2514
2515         FBC_NBOUND_A(isWORDCHAR_A);
2516         break;
2517
2518     case NBOUNDU:
2519         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2520             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2521             break;
2522         }
2523
2524       do_nboundu:
2525
2526         to_complement = 1;
2527         /* FALLTHROUGH */
2528
2529     case BOUNDU:
2530       do_boundu:
2531         switch((bound_type) FLAGS(c)) {
2532             case TRADITIONAL_BOUND:
2533                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2534                 break;
2535             case GCB_BOUND:
2536                 if (s == reginfo->strbeg) {
2537                     if (reginfo->intuit || regtry(reginfo, &s))
2538                     {
2539                         goto got_it;
2540                     }
2541
2542                     /* Didn't match.  Try at the next position (if there is one) */
2543                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2544                     if (UNLIKELY(s >= reginfo->strend)) {
2545                         break;
2546                     }
2547                 }
2548
2549                 if (utf8_target) {
2550                     GCB_enum before = getGCB_VAL_UTF8(
2551                                                reghop3((U8*)s, -1,
2552                                                        (U8*)(reginfo->strbeg)),
2553                                                (U8*) reginfo->strend);
2554                     while (s < strend) {
2555                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2556                                                         (U8*) reginfo->strend);
2557                         if (   (to_complement ^ isGCB(before,
2558                                                       after,
2559                                                       (U8*) reginfo->strbeg,
2560                                                       (U8*) s,
2561                                                       utf8_target))
2562                             && (reginfo->intuit || regtry(reginfo, &s)))
2563                         {
2564                             goto got_it;
2565                         }
2566                         before = after;
2567                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2568                     }
2569                 }
2570                 else {  /* Not utf8.  Everything is a GCB except between CR and
2571                            LF */
2572                     while (s < strend) {
2573                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2574                                               || UCHARAT(s) != '\n'))
2575                             && (reginfo->intuit || regtry(reginfo, &s)))
2576                         {
2577                             goto got_it;
2578                         }
2579                         s++;
2580                     }
2581                 }
2582
2583                 /* And, since this is a bound, it can match after the final
2584                  * character in the string */
2585                 if (   reginfo->intuit
2586                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2587                 {
2588                     goto got_it;
2589                 }
2590                 break;
2591
2592             case LB_BOUND:
2593                 if (s == reginfo->strbeg) {
2594                     if (reginfo->intuit || regtry(reginfo, &s)) {
2595                         goto got_it;
2596                     }
2597                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2598                     if (UNLIKELY(s >= reginfo->strend)) {
2599                         break;
2600                     }
2601                 }
2602
2603                 if (utf8_target) {
2604                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2605                                                                -1,
2606                                                                (U8*)(reginfo->strbeg)),
2607                                                        (U8*) reginfo->strend);
2608                     while (s < strend) {
2609                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2610                         if (to_complement ^ isLB(before,
2611                                                  after,
2612                                                  (U8*) reginfo->strbeg,
2613                                                  (U8*) s,
2614                                                  (U8*) reginfo->strend,
2615                                                  utf8_target)
2616                             && (reginfo->intuit || regtry(reginfo, &s)))
2617                         {
2618                             goto got_it;
2619                         }
2620                         before = after;
2621                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2622                     }
2623                 }
2624                 else {  /* Not utf8. */
2625                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2626                     while (s < strend) {
2627                         LB_enum after = getLB_VAL_CP((U8) *s);
2628                         if (to_complement ^ isLB(before,
2629                                                  after,
2630                                                  (U8*) reginfo->strbeg,
2631                                                  (U8*) s,
2632                                                  (U8*) reginfo->strend,
2633                                                  utf8_target)
2634                             && (reginfo->intuit || regtry(reginfo, &s)))
2635                         {
2636                             goto got_it;
2637                         }
2638                         before = after;
2639                         s++;
2640                     }
2641                 }
2642
2643                 if (   reginfo->intuit
2644                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2645                 {
2646                     goto got_it;
2647                 }
2648
2649                 break;
2650
2651             case SB_BOUND:
2652                 if (s == reginfo->strbeg) {
2653                     if (reginfo->intuit || regtry(reginfo, &s)) {
2654                         goto got_it;
2655                     }
2656                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2657                     if (UNLIKELY(s >= reginfo->strend)) {
2658                         break;
2659                     }
2660                 }
2661
2662                 if (utf8_target) {
2663                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2664                                                         -1,
2665                                                         (U8*)(reginfo->strbeg)),
2666                                                       (U8*) reginfo->strend);
2667                     while (s < strend) {
2668                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2669                                                          (U8*) reginfo->strend);
2670                         if ((to_complement ^ isSB(before,
2671                                                   after,
2672                                                   (U8*) reginfo->strbeg,
2673                                                   (U8*) s,
2674                                                   (U8*) reginfo->strend,
2675                                                   utf8_target))
2676                             && (reginfo->intuit || regtry(reginfo, &s)))
2677                         {
2678                             goto got_it;
2679                         }
2680                         before = after;
2681                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2682                     }
2683                 }
2684                 else {  /* Not utf8. */
2685                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2686                     while (s < strend) {
2687                         SB_enum after = getSB_VAL_CP((U8) *s);
2688                         if ((to_complement ^ isSB(before,
2689                                                   after,
2690                                                   (U8*) reginfo->strbeg,
2691                                                   (U8*) s,
2692                                                   (U8*) reginfo->strend,
2693                                                   utf8_target))
2694                             && (reginfo->intuit || regtry(reginfo, &s)))
2695                         {
2696                             goto got_it;
2697                         }
2698                         before = after;
2699                         s++;
2700                     }
2701                 }
2702
2703                 /* Here are at the final position in the target string.  The SB
2704                  * value is always true here, so matches, depending on other
2705                  * constraints */
2706                 if (   reginfo->intuit
2707                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2708                 {
2709                     goto got_it;
2710                 }
2711
2712                 break;
2713
2714             case WB_BOUND:
2715                 if (s == reginfo->strbeg) {
2716                     if (reginfo->intuit || regtry(reginfo, &s)) {
2717                         goto got_it;
2718                     }
2719                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2720                     if (UNLIKELY(s >= reginfo->strend)) {
2721                         break;
2722                     }
2723                 }
2724
2725                 if (utf8_target) {
2726                     /* We are at a boundary between char_sub_0 and char_sub_1.
2727                      * We also keep track of the value for char_sub_-1 as we
2728                      * loop through the line.   Context may be needed to make a
2729                      * determination, and if so, this can save having to
2730                      * recalculate it */
2731                     WB_enum previous = WB_UNKNOWN;
2732                     WB_enum before = getWB_VAL_UTF8(
2733                                               reghop3((U8*)s,
2734                                                       -1,
2735                                                       (U8*)(reginfo->strbeg)),
2736                                               (U8*) reginfo->strend);
2737                     while (s < strend) {
2738                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2739                                                         (U8*) reginfo->strend);
2740                         if ((to_complement ^ isWB(previous,
2741                                                   before,
2742                                                   after,
2743                                                   (U8*) reginfo->strbeg,
2744                                                   (U8*) s,
2745                                                   (U8*) reginfo->strend,
2746                                                   utf8_target))
2747                             && (reginfo->intuit || regtry(reginfo, &s)))
2748                         {
2749                             goto got_it;
2750                         }
2751                         previous = before;
2752                         before = after;
2753                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2754                     }
2755                 }
2756                 else {  /* Not utf8. */
2757                     WB_enum previous = WB_UNKNOWN;
2758                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2759                     while (s < strend) {
2760                         WB_enum after = getWB_VAL_CP((U8) *s);
2761                         if ((to_complement ^ isWB(previous,
2762                                                   before,
2763                                                   after,
2764                                                   (U8*) reginfo->strbeg,
2765                                                   (U8*) s,
2766                                                   (U8*) reginfo->strend,
2767                                                   utf8_target))
2768                             && (reginfo->intuit || regtry(reginfo, &s)))
2769                         {
2770                             goto got_it;
2771                         }
2772                         previous = before;
2773                         before = after;
2774                         s++;
2775                     }
2776                 }
2777
2778                 if (   reginfo->intuit
2779                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2780                 {
2781                     goto got_it;
2782                 }
2783         }
2784         break;
2785
2786     case LNBREAK:
2787         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2788                         is_LNBREAK_latin1_safe(s, strend)
2789         );
2790         break;
2791
2792     /* The argument to all the POSIX node types is the class number to pass to
2793      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2794
2795     case NPOSIXL:
2796         to_complement = 1;
2797         /* FALLTHROUGH */
2798
2799     case POSIXL:
2800         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2801         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2802                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2803         break;
2804
2805     case NPOSIXD:
2806         to_complement = 1;
2807         /* FALLTHROUGH */
2808
2809     case POSIXD:
2810         if (utf8_target) {
2811             goto posix_utf8;
2812         }
2813         goto posixa;
2814
2815     case NPOSIXA:
2816         if (utf8_target) {
2817             /* The complement of something that matches only ASCII matches all
2818              * non-ASCII, plus everything in ASCII that isn't in the class. */
2819             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2820                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2821             break;
2822         }
2823
2824         to_complement = 1;
2825         goto posixa;
2826
2827     case POSIXA:
2828         /* Don't need to worry about utf8, as it can match only a single
2829          * byte invariant character.  But we do anyway for performance reasons,
2830          * as otherwise we would have to examine all the continuation
2831          * characters */
2832         if (utf8_target) {
2833             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2834             break;
2835         }
2836
2837       posixa:
2838         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2839                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2840         break;
2841
2842     case NPOSIXU:
2843         to_complement = 1;
2844         /* FALLTHROUGH */
2845
2846     case POSIXU:
2847         if (! utf8_target) {
2848             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2849                                  to_complement ^ cBOOL(_generic_isCC(*s,
2850                                                                     FLAGS(c))));
2851         }
2852         else {
2853
2854           posix_utf8:
2855             classnum = (_char_class_number) FLAGS(c);
2856             switch (classnum) {
2857                 default:
2858                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2859                         to_complement ^ cBOOL(_invlist_contains_cp(
2860                                               PL_XPosix_ptrs[classnum],
2861                                               utf8_to_uvchr_buf((U8 *) s,
2862                                                                 (U8 *) strend,
2863                                                                 NULL))));
2864                     break;
2865                 case _CC_ENUM_SPACE:
2866                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2867                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2868                     break;
2869
2870                 case _CC_ENUM_BLANK:
2871                     REXEC_FBC_CLASS_SCAN(1,
2872                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2873                     break;
2874
2875                 case _CC_ENUM_XDIGIT:
2876                     REXEC_FBC_CLASS_SCAN(1,
2877                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2878                     break;
2879
2880                 case _CC_ENUM_VERTSPACE:
2881                     REXEC_FBC_CLASS_SCAN(1,
2882                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2883                     break;
2884
2885                 case _CC_ENUM_CNTRL:
2886                     REXEC_FBC_CLASS_SCAN(1,
2887                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2888                     break;
2889             }
2890         }
2891         break;
2892
2893     case AHOCORASICKC:
2894     case AHOCORASICK:
2895         {
2896             DECL_TRIE_TYPE(c);
2897             /* what trie are we using right now */
2898             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2899             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2900             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2901
2902             const char *last_start = strend - trie->minlen;
2903 #ifdef DEBUGGING
2904             const char *real_start = s;
2905 #endif
2906             STRLEN maxlen = trie->maxlen;
2907             SV *sv_points;
2908             U8 **points; /* map of where we were in the input string
2909                             when reading a given char. For ASCII this
2910                             is unnecessary overhead as the relationship
2911                             is always 1:1, but for Unicode, especially
2912                             case folded Unicode this is not true. */
2913             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2914             U8 *bitmap=NULL;
2915
2916
2917             DECLARE_AND_GET_RE_DEBUG_FLAGS;
2918
2919             /* We can't just allocate points here. We need to wrap it in
2920              * an SV so it gets freed properly if there is a croak while
2921              * running the match */
2922             ENTER;
2923             SAVETMPS;
2924             sv_points=newSV(maxlen * sizeof(U8 *));
2925             SvCUR_set(sv_points,
2926                 maxlen * sizeof(U8 *));
2927             SvPOK_on(sv_points);
2928             sv_2mortal(sv_points);
2929             points=(U8**)SvPV_nolen(sv_points );
2930             if ( trie_type != trie_utf8_fold
2931                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2932             {
2933                 if (trie->bitmap)
2934                     bitmap=(U8*)trie->bitmap;
2935                 else
2936                     bitmap=(U8*)ANYOF_BITMAP(c);
2937             }
2938             /* this is the Aho-Corasick algorithm modified a touch
2939                to include special handling for long "unknown char" sequences.
2940                The basic idea being that we use AC as long as we are dealing
2941                with a possible matching char, when we encounter an unknown char
2942                (and we have not encountered an accepting state) we scan forward
2943                until we find a legal starting char.
2944                AC matching is basically that of trie matching, except that when
2945                we encounter a failing transition, we fall back to the current
2946                states "fail state", and try the current char again, a process
2947                we repeat until we reach the root state, state 1, or a legal
2948                transition. If we fail on the root state then we can either
2949                terminate if we have reached an accepting state previously, or
2950                restart the entire process from the beginning if we have not.
2951
2952              */
2953             while (s <= last_start) {
2954                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2955                 U8 *uc = (U8*)s;
2956                 U16 charid = 0;
2957                 U32 base = 1;
2958                 U32 state = 1;
2959                 UV uvc = 0;
2960                 STRLEN len = 0;
2961                 STRLEN foldlen = 0;
2962                 U8 *uscan = (U8*)NULL;
2963                 U8 *leftmost = NULL;
2964 #ifdef DEBUGGING
2965                 U32 accepted_word= 0;
2966 #endif
2967                 U32 pointpos = 0;
2968
2969                 while ( state && uc <= (U8*)strend ) {
2970                     int failed=0;
2971                     U32 word = aho->states[ state ].wordnum;
2972
2973                     if( state==1 ) {
2974                         if ( bitmap ) {
2975                             DEBUG_TRIE_EXECUTE_r(
2976                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2977                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2978                                         (char *)uc, utf8_target, 0 );
2979                                     Perl_re_printf( aTHX_
2980                                         " Scanning for legal start char...\n");
2981                                 }
2982                             );
2983                             if (utf8_target) {
2984                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2985                                     uc += UTF8SKIP(uc);
2986                                 }
2987                             } else {
2988                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2989                                     uc++;
2990                                 }
2991                             }
2992                             s= (char *)uc;
2993                         }
2994                         if (uc >(U8*)last_start) break;
2995                     }
2996
2997                     if ( word ) {
2998                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2999                         if (!leftmost || lpos < leftmost) {
3000                             DEBUG_r(accepted_word=word);
3001                             leftmost= lpos;
3002                         }
3003                         if (base==0) break;
3004
3005                     }
3006                     points[pointpos++ % maxlen]= uc;
3007                     if (foldlen || uc < (U8*)strend) {
3008                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3009                                              (U8 *) strend, uscan, len, uvc,
3010                                              charid, foldlen, foldbuf,
3011                                              uniflags);
3012                         DEBUG_TRIE_EXECUTE_r({
3013                             dump_exec_pos( (char *)uc, c, strend,
3014                                         real_start, s, utf8_target, 0);
3015                             Perl_re_printf( aTHX_
3016                                 " Charid:%3u CP:%4" UVxf " ",
3017                                  charid, uvc);
3018                         });
3019                     }
3020                     else {
3021                         len = 0;
3022                         charid = 0;
3023                     }
3024
3025
3026                     do {
3027 #ifdef DEBUGGING
3028                         word = aho->states[ state ].wordnum;
3029 #endif
3030                         base = aho->states[ state ].trans.base;
3031
3032                         DEBUG_TRIE_EXECUTE_r({
3033                             if (failed)
3034                                 dump_exec_pos( (char *)uc, c, strend, real_start,
3035                                     s,   utf8_target, 0 );
3036                             Perl_re_printf( aTHX_
3037                                 "%sState: %4" UVxf ", word=%" UVxf,
3038                                 failed ? " Fail transition to " : "",
3039                                 (UV)state, (UV)word);
3040                         });
3041                         if ( base ) {
3042                             U32 tmp;
3043                             I32 offset;
3044                             if (charid &&
3045                                  ( ((offset = base + charid
3046                                     - 1 - trie->uniquecharcount)) >= 0)
3047                                  && ((U32)offset < trie->lasttrans)
3048                                  && trie->trans[offset].check == state
3049                                  && (tmp=trie->trans[offset].next))
3050                             {
3051                                 DEBUG_TRIE_EXECUTE_r(
3052                                     Perl_re_printf( aTHX_ " - legal\n"));
3053                                 state = tmp;
3054                                 break;
3055                             }
3056                             else {
3057                                 DEBUG_TRIE_EXECUTE_r(
3058                                     Perl_re_printf( aTHX_ " - fail\n"));
3059                                 failed = 1;
3060                                 state = aho->fail[state];
3061                             }
3062                         }
3063                         else {
3064                             /* we must be accepting here */
3065                             DEBUG_TRIE_EXECUTE_r(
3066                                     Perl_re_printf( aTHX_ " - accepting\n"));
3067                             failed = 1;
3068                             break;
3069                         }
3070                     } while(state);
3071                     uc += len;
3072                     if (failed) {
3073                         if (leftmost)
3074                             break;
3075                         if (!state) state = 1;
3076                     }
3077                 }
3078                 if ( aho->states[ state ].wordnum ) {
3079                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3080                     if (!leftmost || lpos < leftmost) {
3081                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3082                         leftmost = lpos;
3083                     }
3084                 }
3085                 if (leftmost) {
3086                     s = (char*)leftmost;
3087                     DEBUG_TRIE_EXECUTE_r({
3088                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3089                             (UV)accepted_word, (IV)(s - real_start)
3090                         );
3091                     });
3092                     if (reginfo->intuit || regtry(reginfo, &s)) {
3093                         FREETMPS;
3094                         LEAVE;
3095                         goto got_it;
3096                     }
3097                     if (s < reginfo->strend) {
3098                         s = HOPc(s,1);
3099                     }
3100                     DEBUG_TRIE_EXECUTE_r({
3101                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3102                     });
3103                 } else {
3104                     DEBUG_TRIE_EXECUTE_r(
3105                         Perl_re_printf( aTHX_ "No match.\n"));
3106                     break;
3107                 }
3108             }
3109             FREETMPS;
3110             LEAVE;
3111         }
3112         break;
3113     default:
3114         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3115     }
3116     return 0;
3117   got_it:
3118     return s;
3119 }
3120
3121 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3122  * flags have same meanings as with regexec_flags() */
3123
3124 static void
3125 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3126                             char *strbeg,
3127                             char *strend,
3128                             SV *sv,
3129                             U32 flags,
3130                             bool utf8_target)
3131 {
3132     struct regexp *const prog = ReANY(rx);
3133
3134     if (flags & REXEC_COPY_STR) {
3135 #ifdef PERL_ANY_COW
3136         if (SvCANCOW(sv)) {
3137             DEBUG_C(Perl_re_printf( aTHX_
3138                               "Copy on write: regexp capture, type %d\n",
3139                                     (int) SvTYPE(sv)));
3140             /* Create a new COW SV to share the match string and store
3141              * in saved_copy, unless the current COW SV in saved_copy
3142              * is valid and suitable for our purpose */
3143             if ((   prog->saved_copy
3144                  && SvIsCOW(prog->saved_copy)
3145                  && SvPOKp(prog->saved_copy)
3146                  && SvIsCOW(sv)
3147                  && SvPOKp(sv)
3148                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3149             {
3150                 /* just reuse saved_copy SV */
3151                 if (RXp_MATCH_COPIED(prog)) {
3152                     Safefree(prog->subbeg);
3153                     RXp_MATCH_COPIED_off(prog);
3154                 }
3155             }
3156             else {
3157                 /* create new COW SV to share string */
3158                 RXp_MATCH_COPY_FREE(prog);
3159                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3160             }
3161             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3162             assert (SvPOKp(prog->saved_copy));
3163             prog->sublen  = strend - strbeg;
3164             prog->suboffset = 0;
3165             prog->subcoffset = 0;
3166         } else
3167 #endif
3168         {
3169             SSize_t min = 0;
3170             SSize_t max = strend - strbeg;
3171             SSize_t sublen;
3172
3173             if (    (flags & REXEC_COPY_SKIP_POST)
3174                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3175                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3176             ) { /* don't copy $' part of string */
3177                 U32 n = 0;
3178                 max = -1;
3179                 /* calculate the right-most part of the string covered
3180                  * by a capture. Due to lookahead, this may be to
3181                  * the right of $&, so we have to scan all captures */
3182                 while (n <= prog->lastparen) {
3183                     if (prog->offs[n].end > max)
3184                         max = prog->offs[n].end;
3185                     n++;
3186                 }
3187                 if (max == -1)
3188                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3189                             ? prog->offs[0].start
3190                             : 0;
3191                 assert(max >= 0 && max <= strend - strbeg);
3192             }
3193
3194             if (    (flags & REXEC_COPY_SKIP_PRE)
3195                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3196                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3197             ) { /* don't copy $` part of string */
3198                 U32 n = 0;
3199                 min = max;
3200                 /* calculate the left-most part of the string covered
3201                  * by a capture. Due to lookbehind, this may be to
3202                  * the left of $&, so we have to scan all captures */
3203                 while (min && n <= prog->lastparen) {
3204                     if (   prog->offs[n].start != -1
3205                         && prog->offs[n].start < min)
3206                     {
3207                         min = prog->offs[n].start;
3208                     }
3209                     n++;
3210                 }
3211                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3212                     && min >  prog->offs[0].end
3213                 )
3214                     min = prog->offs[0].end;
3215
3216             }
3217
3218             assert(min >= 0 && min <= max && min <= strend - strbeg);
3219             sublen = max - min;
3220
3221             if (RXp_MATCH_COPIED(prog)) {
3222                 if (sublen > prog->sublen)
3223                     prog->subbeg =
3224                             (char*)saferealloc(prog->subbeg, sublen+1);
3225             }
3226             else
3227                 prog->subbeg = (char*)safemalloc(sublen+1);
3228             Copy(strbeg + min, prog->subbeg, sublen, char);
3229             prog->subbeg[sublen] = '\0';
3230             prog->suboffset = min;
3231             prog->sublen = sublen;
3232             RXp_MATCH_COPIED_on(prog);
3233         }
3234         prog->subcoffset = prog->suboffset;
3235         if (prog->suboffset && utf8_target) {
3236             /* Convert byte offset to chars.
3237              * XXX ideally should only compute this if @-/@+
3238              * has been seen, a la PL_sawampersand ??? */
3239
3240             /* If there's a direct correspondence between the
3241              * string which we're matching and the original SV,
3242              * then we can use the utf8 len cache associated with
3243              * the SV. In particular, it means that under //g,
3244              * sv_pos_b2u() will use the previously cached
3245              * position to speed up working out the new length of
3246              * subcoffset, rather than counting from the start of
3247              * the string each time. This stops
3248              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3249              * from going quadratic */
3250             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3251                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3252                                                 SV_GMAGIC|SV_CONST_RETURN);
3253             else
3254                 prog->subcoffset = utf8_length((U8*)strbeg,
3255                                     (U8*)(strbeg+prog->suboffset));
3256         }
3257     }
3258     else {
3259         RXp_MATCH_COPY_FREE(prog);
3260         prog->subbeg = strbeg;
3261         prog->suboffset = 0;
3262         prog->subcoffset = 0;
3263         prog->sublen = strend - strbeg;
3264     }
3265 }
3266
3267
3268
3269
3270 /*
3271  - regexec_flags - match a regexp against a string
3272  */
3273 I32
3274 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3275               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3276 /* stringarg: the point in the string at which to begin matching */
3277 /* strend:    pointer to null at end of string */
3278 /* strbeg:    real beginning of string */
3279 /* minend:    end of match must be >= minend bytes after stringarg. */
3280 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3281  *            itself is accessed via the pointers above */
3282 /* data:      May be used for some additional optimizations.
3283               Currently unused. */
3284 /* flags:     For optimizations. See REXEC_* in regexp.h */
3285
3286 {
3287     struct regexp *const prog = ReANY(rx);
3288     char *s;
3289     regnode *c;
3290     char *startpos;
3291     SSize_t minlen;             /* must match at least this many chars */
3292     SSize_t dontbother = 0;     /* how many characters not to try at end */
3293     const bool utf8_target = cBOOL(DO_UTF8(sv));
3294     I32 multiline;
3295     RXi_GET_DECL(prog,progi);
3296     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3297     regmatch_info *const reginfo = &reginfo_buf;
3298     regexp_paren_pair *swap = NULL;
3299     I32 oldsave;
3300     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3301
3302     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3303     PERL_UNUSED_ARG(data);
3304
3305     /* Be paranoid... */
3306     if (prog == NULL) {
3307         Perl_croak(aTHX_ "NULL regexp parameter");
3308     }
3309
3310     DEBUG_EXECUTE_r(
3311         debug_start_match(rx, utf8_target, stringarg, strend,
3312         "Matching");
3313     );
3314
3315     startpos = stringarg;
3316
3317     /* set these early as they may be used by the HOP macros below */
3318     reginfo->strbeg = strbeg;
3319     reginfo->strend = strend;
3320     reginfo->is_utf8_target = cBOOL(utf8_target);
3321
3322     if (prog->intflags & PREGf_GPOS_SEEN) {
3323         MAGIC *mg;
3324
3325         /* set reginfo->ganch, the position where \G can match */
3326
3327         reginfo->ganch =
3328             (flags & REXEC_IGNOREPOS)
3329             ? stringarg /* use start pos rather than pos() */
3330             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3331               /* Defined pos(): */
3332             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3333             : strbeg; /* pos() not defined; use start of string */
3334
3335         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3336             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3337
3338         /* in the presence of \G, we may need to start looking earlier in
3339          * the string than the suggested start point of stringarg:
3340          * if prog->gofs is set, then that's a known, fixed minimum
3341          * offset, such as
3342          * /..\G/:   gofs = 2
3343          * /ab|c\G/: gofs = 1
3344          * or if the minimum offset isn't known, then we have to go back
3345          * to the start of the string, e.g. /w+\G/
3346          */
3347
3348         if (prog->intflags & PREGf_ANCH_GPOS) {
3349             if (prog->gofs) {
3350                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3351                 if (!startpos ||
3352                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3353                 {
3354                     DEBUG_GPOS_r(Perl_re_printf( aTHX_
3355                             "fail: ganch-gofs before earliest possible start\n"));
3356                     return 0;
3357                 }
3358             }
3359             else
3360                 startpos = reginfo->ganch;
3361         }
3362         else if (prog->gofs) {
3363             startpos = HOPBACKc(startpos, prog->gofs);
3364             if (!startpos)
3365                 startpos = strbeg;
3366         }
3367         else if (prog->intflags & PREGf_GPOS_FLOAT)
3368             startpos = strbeg;
3369     }
3370
3371     minlen = prog->minlen;
3372     if ((startpos + minlen) > strend || startpos < strbeg) {
3373         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3374                         "Regex match can't succeed, so not even tried\n"));
3375         return 0;
3376     }
3377
3378     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3379      * which will call destuctors to reset PL_regmatch_state, free higher
3380      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3381      * regmatch_info_aux_eval */
3382
3383     oldsave = PL_savestack_ix;
3384
3385     s = startpos;
3386
3387     if ((prog->extflags & RXf_USE_INTUIT)
3388         && !(flags & REXEC_CHECKED))
3389     {
3390         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3391                                     flags, NULL);
3392         if (!s)
3393             return 0;
3394
3395         if (prog->extflags & RXf_CHECK_ALL) {
3396             /* we can match based purely on the result of INTUIT.
3397              * Set up captures etc just for $& and $-[0]
3398              * (an intuit-only match wont have $1,$2,..) */
3399             assert(!prog->nparens);
3400
3401             /* s/// doesn't like it if $& is earlier than where we asked it to
3402              * start searching (which can happen on something like /.\G/) */
3403             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3404                     && (s < stringarg))
3405             {
3406                 /* this should only be possible under \G */
3407                 assert(prog->intflags & PREGf_GPOS_SEEN);
3408                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3409                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3410                 goto phooey;
3411             }
3412
3413             /* match via INTUIT shouldn't have any captures.
3414              * Let @-, @+, $^N know */
3415             prog->lastparen = prog->lastcloseparen = 0;
3416             RXp_MATCH_UTF8_set(prog, utf8_target);
3417             prog->offs[0].start = s - strbeg;
3418             prog->offs[0].end = utf8_target
3419                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3420                 : s - strbeg + prog->minlenret;
3421             if ( !(flags & REXEC_NOT_FIRST) )
3422                 S_reg_set_capture_string(aTHX_ rx,
3423                                         strbeg, strend,
3424                                         sv, flags, utf8_target);
3425
3426             return 1;
3427         }
3428     }
3429
3430     multiline = prog->extflags & RXf_PMf_MULTILINE;
3431
3432     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3433         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3434                               "String too short [regexec_flags]...\n"));
3435         goto phooey;
3436     }
3437
3438     /* Check validity of program. */
3439     if (UCHARAT(progi->program) != REG_MAGIC) {
3440         Perl_croak(aTHX_ "corrupted regexp program");
3441     }
3442
3443     RXp_MATCH_TAINTED_off(prog);
3444     RXp_MATCH_UTF8_set(prog, utf8_target);
3445
3446     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3447     reginfo->intuit = 0;
3448     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3449     reginfo->warned = FALSE;
3450     reginfo->sv = sv;
3451     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3452     /* see how far we have to get to not match where we matched before */
3453     reginfo->till = stringarg + minend;
3454
3455     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3456         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3457            S_cleanup_regmatch_info_aux has executed (registered by
3458            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3459            magic belonging to this SV.
3460            Not newSVsv, either, as it does not COW.
3461         */
3462         reginfo->sv = newSV(0);
3463         SvSetSV_nosteal(reginfo->sv, sv);
3464         SAVEFREESV(reginfo->sv);
3465     }
3466
3467     /* reserve next 2 or 3 slots in PL_regmatch_state:
3468      * slot N+0: may currently be in use: skip it
3469      * slot N+1: use for regmatch_info_aux struct
3470      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3471      * slot N+3: ready for use by regmatch()
3472      */
3473
3474     {
3475         regmatch_state *old_regmatch_state;
3476         regmatch_slab  *old_regmatch_slab;
3477         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3478
3479         /* on first ever match, allocate first slab */
3480         if (!PL_regmatch_slab) {
3481             Newx(PL_regmatch_slab, 1, regmatch_slab);
3482             PL_regmatch_slab->prev = NULL;
3483             PL_regmatch_slab->next = NULL;
3484             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3485         }
3486
3487         old_regmatch_state = PL_regmatch_state;
3488         old_regmatch_slab  = PL_regmatch_slab;
3489
3490         for (i=0; i <= max; i++) {
3491             if (i == 1)
3492                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3493             else if (i ==2)
3494                 reginfo->info_aux_eval =
3495                 reginfo->info_aux->info_aux_eval =
3496                             &(PL_regmatch_state->u.info_aux_eval);
3497
3498             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3499                 PL_regmatch_state = S_push_slab(aTHX);
3500         }
3501
3502         /* note initial PL_regmatch_state position; at end of match we'll
3503          * pop back to there and free any higher slabs */
3504
3505         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3506         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3507         reginfo->info_aux->poscache = NULL;
3508
3509         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3510
3511         if ((prog->extflags & RXf_EVAL_SEEN))
3512             S_setup_eval_state(aTHX_ reginfo);
3513         else
3514             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3515     }
3516
3517     /* If there is a "must appear" string, look for it. */
3518
3519     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3520         /* We have to be careful. If the previous successful match
3521            was from this regex we don't want a subsequent partially
3522            successful match to clobber the old results.
3523            So when we detect this possibility we add a swap buffer
3524            to the re, and switch the buffer each match. If we fail,
3525            we switch it back; otherwise we leave it swapped.
3526         */
3527         swap = prog->offs;
3528         /* avoid leak if we die, or clean up anyway if match completes */
3529         SAVEFREEPV(swap);
3530         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3531         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3532             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3533             0,
3534             PTR2UV(prog),
3535             PTR2UV(swap),
3536             PTR2UV(prog->offs)
3537         ));
3538     }
3539
3540     if (prog->recurse_locinput)
3541         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3542
3543     /* Simplest case: anchored match need be tried only once, or with
3544      * MBOL, only at the beginning of each line.
3545      *
3546      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3547      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3548      * match at the start of the string then it won't match anywhere else
3549      * either; while with /.*.../, if it doesn't match at the beginning,
3550      * the earliest it could match is at the start of the next line */
3551
3552     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3553         char *end;
3554
3555         if (regtry(reginfo, &s))
3556             goto got_it;
3557
3558         if (!(prog->intflags & PREGf_ANCH_MBOL))
3559             goto phooey;
3560
3561         /* didn't match at start, try at other newline positions */
3562
3563         if (minlen)
3564             dontbother = minlen - 1;
3565         end = HOP3c(strend, -dontbother, strbeg) - 1;
3566
3567         /* skip to next newline */
3568
3569         while (s <= end) { /* note it could be possible to match at the end of the string */
3570             /* NB: newlines are the same in unicode as they are in latin */
3571             if (*s++ != '\n')
3572                 continue;
3573             if (prog->check_substr || prog->check_utf8) {
3574             /* note that with PREGf_IMPLICIT, intuit can only fail
3575              * or return the start position, so it's of limited utility.
3576              * Nevertheless, I made the decision that the potential for
3577              * quick fail was still worth it - DAPM */
3578                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3579                 if (!s)
3580                     goto phooey;
3581             }
3582             if (regtry(reginfo, &s))
3583                 goto got_it;
3584         }
3585         goto phooey;
3586     } /* end anchored search */
3587
3588     if (prog->intflags & PREGf_ANCH_GPOS)
3589     {
3590         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3591         assert(prog->intflags & PREGf_GPOS_SEEN);
3592         /* For anchored \G, the only position it can match from is
3593          * (ganch-gofs); we already set startpos to this above; if intuit
3594          * moved us on from there, we can't possibly succeed */
3595         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3596         if (s == startpos && regtry(reginfo, &s))
3597             goto got_it;
3598         goto phooey;
3599     }
3600
3601     /* Messy cases:  unanchored match. */
3602     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3603         /* we have /x+whatever/ */
3604         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3605         char ch;
3606 #ifdef DEBUGGING
3607         int did_match = 0;
3608 #endif
3609         if (utf8_target) {
3610             if (! prog->anchored_utf8) {
3611                 to_utf8_substr(prog);
3612             }
3613             ch = SvPVX_const(prog->anchored_utf8)[0];
3614             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3615                 if (*s == ch) {
3616                     DEBUG_EXECUTE_r( did_match = 1 );
3617                     if (regtry(reginfo, &s)) goto got_it;
3618                     s += UTF8_SAFE_SKIP(s, strend);
3619                     while (s < strend && *s == ch)
3620                         s += UTF8SKIP(s);
3621                 }
3622             );
3623
3624         }
3625         else {
3626             if (! prog->anchored_substr) {
3627                 if (! to_byte_substr(prog)) {
3628                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3629                 }
3630             }
3631             ch = SvPVX_const(prog->anchored_substr)[0];
3632             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3633                 if (*s == ch) {
3634                     DEBUG_EXECUTE_r( did_match = 1 );
3635                     if (regtry(reginfo, &s)) goto got_it;
3636                     s++;
3637                     while (s < strend && *s == ch)
3638                         s++;
3639                 }
3640             );
3641         }
3642         DEBUG_EXECUTE_r(if (!did_match)
3643                 Perl_re_printf( aTHX_
3644                                   "Did not find anchored character...\n")
3645                );
3646     }
3647     else if (prog->anchored_substr != NULL
3648               || prog->anchored_utf8 != NULL
3649               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3650                   && prog->float_max_offset < strend - s)) {
3651         SV *must;
3652         SSize_t back_max;
3653         SSize_t back_min;
3654         char *last;
3655         char *last1;            /* Last position checked before */
3656 #ifdef DEBUGGING
3657         int did_match = 0;
3658 #endif
3659         if (prog->anchored_substr || prog->anchored_utf8) {
3660             if (utf8_target) {
3661                 if (! prog->anchored_utf8) {
3662                     to_utf8_substr(prog);
3663                 }
3664                 must = prog->anchored_utf8;
3665             }
3666             else {
3667                 if (! prog->anchored_substr) {
3668                     if (! to_byte_substr(prog)) {
3669                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3670                     }
3671                 }
3672                 must = prog->anchored_substr;
3673             }
3674             back_max = back_min = prog->anchored_offset;
3675         } else {
3676             if (utf8_target) {
3677                 if (! prog->float_utf8) {
3678                     to_utf8_substr(prog);
3679                 }
3680                 must = prog->float_utf8;
3681             }
3682             else {
3683                 if (! prog->float_substr) {
3684                     if (! to_byte_substr(prog)) {
3685                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3686                     }
3687                 }
3688                 must = prog->float_substr;
3689             }
3690             back_max = prog->float_max_offset;
3691             back_min = prog->float_min_offset;
3692         }
3693
3694         if (back_min<0) {
3695             last = strend;
3696         } else {
3697             last = HOP3c(strend,        /* Cannot start after this */
3698                   -(SSize_t)(CHR_SVLEN(must)
3699                          - (SvTAIL(must) != 0) + back_min), strbeg);
3700         }
3701         if (s > reginfo->strbeg)
3702             last1 = HOPc(s, -1);
3703         else
3704             last1 = s - 1;      /* bogus */
3705
3706         /* XXXX check_substr already used to find "s", can optimize if
3707            check_substr==must. */
3708         dontbother = 0;
3709         strend = HOPc(strend, -dontbother);
3710         while ( (s <= last) &&
3711                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3712                                   (unsigned char*)strend, must,
3713                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3714             DEBUG_EXECUTE_r( did_match = 1 );
3715             if (HOPc(s, -back_max) > last1) {
3716                 last1 = HOPc(s, -back_min);
3717                 s = HOPc(s, -back_max);
3718             }
3719             else {
3720                 char * const t = (last1 >= reginfo->strbeg)
3721                                     ? HOPc(last1, 1) : last1 + 1;
3722
3723                 last1 = HOPc(s, -back_min);
3724                 s = t;
3725             }
3726             if (utf8_target) {
3727                 while (s <= last1) {
3728                     if (regtry(reginfo, &s))
3729                         goto got_it;
3730                     if (s >= last1) {
3731                         s++; /* to break out of outer loop */
3732                         break;
3733                     }
3734                     s += UTF8SKIP(s);
3735                 }
3736             }
3737             else {
3738                 while (s <= last1) {
3739                     if (regtry(reginfo, &s))
3740                         goto got_it;
3741                     s++;
3742                 }
3743             }
3744         }
3745         DEBUG_EXECUTE_r(if (!did_match) {
3746             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3747                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3748             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3749                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3750                                ? "anchored" : "floating"),
3751                 quoted, RE_SV_TAIL(must));
3752         });
3753         goto phooey;
3754     }
3755     else if ( (c = progi->regstclass) ) {
3756         if (minlen) {
3757             const OPCODE op = OP(progi->regstclass);
3758             /* don't bother with what can't match */
3759             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3760                 strend = HOPc(strend, -(minlen - 1));
3761         }
3762         DEBUG_EXECUTE_r({
3763             SV * const prop = sv_newmortal();
3764             regprop(prog, prop, c, reginfo, NULL);
3765             {
3766                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3767                     s,strend-s,PL_dump_re_max_len);
3768                 Perl_re_printf( aTHX_
3769                     "Matching stclass %.*s against %s (%d bytes)\n",
3770                     (int)SvCUR(prop), SvPVX_const(prop),
3771                      quoted, (int)(strend - s));
3772             }
3773         });
3774         if (find_byclass(prog, c, s, strend, reginfo))
3775             goto got_it;
3776         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3777     }
3778     else {
3779         dontbother = 0;
3780         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3781             /* Trim the end. */
3782             char *last= NULL;
3783             SV* float_real;
3784             STRLEN len;
3785             const char *little;
3786
3787             if (utf8_target) {
3788                 if (! prog->float_utf8) {
3789                     to_utf8_substr(prog);
3790                 }
3791                 float_real = prog->float_utf8;
3792             }
3793             else {
3794                 if (! prog->float_substr) {
3795                     if (! to_byte_substr(prog)) {
3796                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3797                     }
3798                 }
3799                 float_real = prog->float_substr;
3800             }
3801
3802             little = SvPV_const(float_real, len);
3803             if (SvTAIL(float_real)) {
3804                     /* This means that float_real contains an artificial \n on
3805                      * the end due to the presence of something like this:
3806                      * /foo$/ where we can match both "foo" and "foo\n" at the
3807                      * end of the string.  So we have to compare the end of the
3808                      * string first against the float_real without the \n and
3809                      * then against the full float_real with the string.  We
3810                      * have to watch out for cases where the string might be
3811                      * smaller than the float_real or the float_real without
3812                      * the \n. */
3813                     char *checkpos= strend - len;
3814                     DEBUG_OPTIMISE_r(
3815                         Perl_re_printf( aTHX_
3816                             "%sChecking for float_real.%s\n",
3817                             PL_colors[4], PL_colors[5]));
3818                     if (checkpos + 1 < strbeg) {
3819                         /* can't match, even if we remove the trailing \n
3820                          * string is too short to match */
3821                         DEBUG_EXECUTE_r(
3822                             Perl_re_printf( aTHX_
3823                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3824                                 PL_colors[4], PL_colors[5]));
3825                         goto phooey;
3826                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3827                         /* can match, the end of the string matches without the
3828                          * "\n" */
3829                         last = checkpos + 1;
3830                     } else if (checkpos < strbeg) {
3831                         /* cant match, string is too short when the "\n" is
3832                          * included */
3833                         DEBUG_EXECUTE_r(
3834                             Perl_re_printf( aTHX_
3835                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3836                                 PL_colors[4], PL_colors[5]));
3837                         goto phooey;
3838                     } else if (!multiline) {
3839                         /* non multiline match, so compare with the "\n" at the
3840                          * end of the string */
3841                         if (memEQ(checkpos, little, len)) {
3842                             last= checkpos;
3843                         } else {
3844                             DEBUG_EXECUTE_r(
3845                                 Perl_re_printf( aTHX_
3846                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3847                                     PL_colors[4], PL_colors[5]));
3848                             goto phooey;
3849                         }
3850                     } else {
3851                         /* multiline match, so we have to search for a place
3852                          * where the full string is located */
3853                         goto find_last;
3854                     }
3855             } else {
3856                   find_last:
3857                     if (len)
3858                         last = rninstr(s, strend, little, little + len);
3859                     else
3860                         last = strend;  /* matching "$" */
3861             }
3862             if (!last) {
3863                 /* at one point this block contained a comment which was
3864                  * probably incorrect, which said that this was a "should not
3865                  * happen" case.  Even if it was true when it was written I am
3866                  * pretty sure it is not anymore, so I have removed the comment
3867                  * and replaced it with this one. Yves */
3868                 DEBUG_EXECUTE_r(
3869                     Perl_re_printf( aTHX_
3870                         "%sString does not contain required substring, cannot match.%s\n",
3871                         PL_colors[4], PL_colors[5]
3872                     ));
3873                 goto phooey;
3874             }
3875             dontbother = strend - last + prog->float_min_offset;
3876         }
3877         if (minlen && (dontbother < minlen))
3878             dontbother = minlen - 1;
3879         strend -= dontbother;              /* this one's always in bytes! */
3880         /* We don't know much -- general case. */
3881         if (utf8_target) {
3882             for (;;) {
3883                 if (regtry(reginfo, &s))
3884                     goto got_it;
3885                 if (s >= strend)
3886                     break;
3887                 s += UTF8SKIP(s);
3888             };
3889         }
3890         else {
3891             do {
3892                 if (regtry(reginfo, &s))
3893                     goto got_it;
3894             } while (s++ < strend);
3895         }
3896     }
3897
3898     /* Failure. */
3899     goto phooey;
3900
3901   got_it:
3902     /* s/// doesn't like it if $& is earlier than where we asked it to
3903      * start searching (which can happen on something like /.\G/) */
3904     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3905             && (prog->offs[0].start < stringarg - strbeg))
3906     {
3907         /* this should only be possible under \G */
3908         assert(prog->intflags & PREGf_GPOS_SEEN);
3909         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3910             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3911         goto phooey;
3912     }
3913
3914     /* clean up; this will trigger destructors that will free all slabs
3915      * above the current one, and cleanup the regmatch_info_aux
3916      * and regmatch_info_aux_eval sructs */
3917
3918     LEAVE_SCOPE(oldsave);
3919
3920     if (RXp_PAREN_NAMES(prog))
3921         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3922
3923     /* make sure $`, $&, $', and $digit will work later */
3924     if ( !(flags & REXEC_NOT_FIRST) )
3925         S_reg_set_capture_string(aTHX_ rx,
3926                                     strbeg, reginfo->strend,
3927                                     sv, flags, utf8_target);
3928
3929     return 1;
3930
3931   phooey:
3932     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3933                           PL_colors[4], PL_colors[5]));
3934
3935     if (swap) {
3936         /* we failed :-( roll it back.
3937          * Since the swap buffer will be freed on scope exit which follows
3938          * shortly, restore the old captures by copying 'swap's original
3939          * data to the new offs buffer
3940          */
3941         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3942             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
3943             0,
3944             PTR2UV(prog),
3945             PTR2UV(prog->offs),
3946             PTR2UV(swap)
3947         ));
3948
3949         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
3950     }
3951
3952     /* clean up; this will trigger destructors that will free all slabs
3953      * above the current one, and cleanup the regmatch_info_aux
3954      * and regmatch_info_aux_eval sructs */
3955
3956     LEAVE_SCOPE(oldsave);
3957
3958     return 0;
3959 }
3960
3961
3962 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3963  * Do inc before dec, in case old and new rex are the same */
3964 #define SET_reg_curpm(Re2)                          \
3965     if (reginfo->info_aux_eval) {                   \
3966         (void)ReREFCNT_inc(Re2);                    \
3967         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3968         PM_SETRE((PL_reg_curpm), (Re2));            \
3969     }
3970
3971
3972 /*
3973  - regtry - try match at specific point
3974  */
3975 STATIC bool                     /* 0 failure, 1 success */
3976 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3977 {
3978     CHECKPOINT lastcp;
3979     REGEXP *const rx = reginfo->prog;
3980     regexp *const prog = ReANY(rx);
3981     SSize_t result;
3982 #ifdef DEBUGGING
3983     U32 depth = 0; /* used by REGCP_SET */
3984 #endif
3985     RXi_GET_DECL(prog,progi);
3986     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3987
3988     PERL_ARGS_ASSERT_REGTRY;
3989
3990     reginfo->cutpoint=NULL;
3991
3992     prog->offs[0].start = *startposp - reginfo->strbeg;
3993     prog->lastparen = 0;
3994     prog->lastcloseparen = 0;
3995
3996     /* XXXX What this code is doing here?!!!  There should be no need
3997        to do this again and again, prog->lastparen should take care of
3998        this!  --ilya*/
3999
4000     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4001      * Actually, the code in regcppop() (which Ilya may be meaning by
4002      * prog->lastparen), is not needed at all by the test suite
4003      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4004      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4005      * Meanwhile, this code *is* needed for the
4006      * above-mentioned test suite tests to succeed.  The common theme
4007      * on those tests seems to be returning null fields from matches.
4008      * --jhi updated by dapm */
4009
4010     /* After encountering a variant of the issue mentioned above I think
4011      * the point Ilya was making is that if we properly unwind whenever
4012      * we set lastparen to a smaller value then we should not need to do
4013      * this every time, only when needed. So if we have tests that fail if
4014      * we remove this, then it suggests somewhere else we are improperly
4015      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4016      * places it is called, and related regcp() routines. - Yves */
4017 #if 1
4018     if (prog->nparens) {
4019         regexp_paren_pair *pp = prog->offs;
4020         I32 i;
4021         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4022             ++pp;
4023             pp->start = -1;
4024             pp->end = -1;
4025         }
4026     }
4027 #endif
4028     REGCP_SET(lastcp);
4029     result = regmatch(reginfo, *startposp, progi->program + 1);
4030     if (result != -1) {
4031         prog->offs[0].end = result;
4032         return 1;
4033     }
4034     if (reginfo->cutpoint)
4035         *startposp= reginfo->cutpoint;
4036     REGCP_UNWIND(lastcp);
4037     return 0;
4038 }
4039
4040 /* this is used to determine how far from the left messages like
4041    'failed...' are printed in regexec.c. It should be set such that
4042    messages are inline with the regop output that created them.
4043 */
4044 #define REPORT_CODE_OFF 29
4045 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4046 #ifdef DEBUGGING
4047 int
4048 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4049 {
4050     va_list ap;
4051     int result;
4052     PerlIO *f= Perl_debug_log;
4053     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4054     va_start(ap, depth);
4055     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4056     result = PerlIO_vprintf(f, fmt, ap);
4057     va_end(ap);
4058     return result;
4059 }
4060 #endif /* DEBUGGING */
4061
4062 /* grab a new slab and return the first slot in it */
4063
4064 STATIC regmatch_state *
4065 S_push_slab(pTHX)
4066 {
4067     regmatch_slab *s = PL_regmatch_slab->next;
4068     if (!s) {
4069         Newx(s, 1, regmatch_slab);
4070         s->prev = PL_regmatch_slab;
4071         s->next = NULL;
4072         PL_regmatch_slab->next = s;
4073     }
4074     PL_regmatch_slab = s;
4075     return SLAB_FIRST(s);
4076 }
4077
4078 #ifdef DEBUGGING
4079
4080 STATIC void
4081 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4082     const char *start, const char *end, const char *blurb)
4083 {
4084     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4085
4086     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4087
4088     if (!PL_colorset)
4089             reginitcolors();
4090     {
4091         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4092             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4093
4094         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4095             start, end - start, PL_dump_re_max_len);
4096
4097         Perl_re_printf( aTHX_
4098             "%s%s REx%s %s against %s\n",
4099                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4100
4101         if (utf8_target||utf8_pat)
4102             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4103                 utf8_pat ? "pattern" : "",
4104                 utf8_pat && utf8_target ? " and " : "",
4105                 utf8_target ? "string" : ""
4106             );
4107     }
4108 }
4109
4110 STATIC void
4111 S_dump_exec_pos(pTHX_ const char *locinput,
4112                       const regnode *scan,
4113                       const char *loc_regeol,
4114                       const char *loc_bostr,
4115                       const char *loc_reg_starttry,
4116                       const bool utf8_target,
4117                       const U32 depth
4118                 )
4119 {
4120     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4121     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4122     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4123     /* The part of the string before starttry has one color
4124        (pref0_len chars), between starttry and current
4125        position another one (pref_len - pref0_len chars),
4126        after the current position the third one.
4127        We assume that pref0_len <= pref_len, otherwise we
4128        decrease pref0_len.  */
4129     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4130         ? (5 + taill) - l : locinput - loc_bostr;
4131     int pref0_len;
4132
4133     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4134
4135     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4136         pref_len++;
4137     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4138     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4139         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4140               ? (5 + taill) - pref_len : loc_regeol - locinput);
4141     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4142         l--;
4143     if (pref0_len < 0)
4144         pref0_len = 0;
4145     if (pref0_len > pref_len)
4146         pref0_len = pref_len;
4147     {
4148         const int is_uni = utf8_target ? 1 : 0;
4149
4150         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4151             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4152
4153         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4154                     (locinput - pref_len + pref0_len),
4155                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4156
4157         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4158                     locinput, loc_regeol - locinput, 10, 0, 1);
4159
4160         const STRLEN tlen=len0+len1+len2;
4161         Perl_re_printf( aTHX_
4162                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4163                     (IV)(locinput - loc_bostr),
4164                     len0, s0,
4165                     len1, s1,
4166                     (docolor ? "" : "> <"),
4167                     len2, s2,
4168                     (int)(tlen > 19 ? 0 :  19 - tlen),
4169                     "",
4170                     depth);
4171     }
4172 }
4173
4174 #endif
4175
4176 /* reg_check_named_buff_matched()
4177  * Checks to see if a named buffer has matched. The data array of
4178  * buffer numbers corresponding to the buffer is expected to reside
4179  * in the regexp->data->data array in the slot stored in the ARG() of
4180  * node involved. Note that this routine doesn't actually care about the
4181  * name, that information is not preserved from compilation to execution.
4182  * Returns the index of the leftmost defined buffer with the given name
4183  * or 0 if non of the buffers matched.
4184  */
4185 STATIC I32
4186 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4187 {
4188     I32 n;
4189     RXi_GET_DECL(rex,rexi);
4190     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4191     I32 *nums=(I32*)SvPVX(sv_dat);
4192
4193     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4194
4195     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4196         if ((I32)rex->lastparen >= nums[n] &&
4197             rex->offs[nums[n]].end != -1)
4198         {
4199             return nums[n];
4200         }
4201     }
4202     return 0;
4203 }
4204
4205 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4206 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4207 #define CHRTEST_NOT_A_CP_1 -999
4208 #define CHRTEST_NOT_A_CP_2 -998
4209
4210 static bool
4211 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4212         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4213 {
4214     /* This function determines if there are zero, one, two, or more characters
4215      * that match the first character of the passed-in EXACTish node
4216      * <text_node>, and if there are one or two, it returns them in the
4217      * passed-in pointers.
4218      *
4219      * If it determines that no possible character in the target string can
4220      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4221      * the first character in <text_node> requires UTF-8 to represent, and the
4222      * target string isn't in UTF-8.)
4223      *
4224      * If there are more than two characters that could match the beginning of
4225      * <text_node>, or if more context is required to determine a match or not,
4226      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4227      *
4228      * The motiviation behind this function is to allow the caller to set up
4229      * tight loops for matching.  If <text_node> is of type EXACT, there is
4230      * only one possible character that can match its first character, and so
4231      * the situation is quite simple.  But things get much more complicated if
4232      * folding is involved.  It may be that the first character of an EXACTFish
4233      * node doesn't participate in any possible fold, e.g., punctuation, so it
4234      * can be matched only by itself.  The vast majority of characters that are
4235      * in folds match just two things, their lower and upper-case equivalents.
4236      * But not all are like that; some have multiple possible matches, or match
4237      * sequences of more than one character.  This function sorts all that out.
4238      *
4239      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4240      * loop of trying to match A*, we know we can't exit where the thing
4241      * following it isn't a B.  And something can't be a B unless it is the
4242      * beginning of B.  By putting a quick test for that beginning in a tight
4243      * loop, we can rule out things that can't possibly be B without having to
4244      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4245      * character, we can make a tight loop matching A*, using the outputs of
4246      * this function.
4247      *
4248      * If the target string to match isn't in UTF-8, and there aren't
4249      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4250      * the one or two possible octets (which are characters in this situation)
4251      * that can match.  In all cases, if there is only one character that can
4252      * match, *<c1p> and *<c2p> will be identical.
4253      *
4254      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4255      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4256      * can match the beginning of <text_node>.  They should be declared with at
4257      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4258      * undefined what these contain.)  If one or both of the buffers are
4259      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4260      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4261      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4262      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4263      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4264
4265     const bool utf8_target = reginfo->is_utf8_target;
4266
4267     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4268     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4269     bool use_chrtest_void = FALSE;
4270     const bool is_utf8_pat = reginfo->is_utf8_pat;
4271
4272     /* Used when we have both utf8 input and utf8 output, to avoid converting
4273      * to/from code points */
4274     bool utf8_has_been_setup = FALSE;
4275
4276     dVAR;
4277
4278     U8 *pat = (U8*)STRING(text_node);
4279     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4280
4281     if (   OP(text_node) == EXACT
4282         || OP(text_node) == LEXACT
4283         || OP(text_node) == EXACT_REQ8
4284         || OP(text_node) == LEXACT_REQ8
4285         || OP(text_node) == EXACTL)
4286     {
4287
4288         /* In an exact node, only one thing can be matched, that first
4289          * character.  If both the pat and the target are UTF-8, we can just
4290          * copy the input to the output, avoiding finding the code point of
4291          * that character */
4292         if (!is_utf8_pat) {
4293             assert(   OP(text_node) != EXACT_REQ8
4294                    && OP(text_node) != LEXACT_REQ8);
4295             c2 = c1 = *pat;
4296         }
4297         else if (utf8_target) {
4298             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4299             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4300             utf8_has_been_setup = TRUE;
4301         }
4302         else if (   OP(text_node) == EXACT_REQ8
4303                  || OP(text_node) == LEXACT_REQ8)
4304         {
4305             return FALSE;   /* Can only match UTF-8 target */
4306         }
4307         else {
4308             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4309         }
4310     }
4311     else { /* an EXACTFish node */
4312         U8 *pat_end = pat + STR_LENs(text_node);
4313
4314         /* An EXACTFL node has at least some characters unfolded, because what
4315          * they match is not known until now.  So, now is the time to fold
4316          * the first few of them, as many as are needed to determine 'c1' and
4317          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4318          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4319          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4320          * need to fold as many characters as a single character can fold to,
4321          * so that later we can check if the first ones are such a multi-char
4322          * fold.  But, in such a pattern only locale-problematic characters
4323          * aren't folded, so we can skip this completely if the first character
4324          * in the node isn't one of the tricky ones */
4325         if (OP(text_node) == EXACTFL) {
4326
4327             if (! is_utf8_pat) {
4328                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4329                 {
4330                     folded[0] = folded[1] = 's';
4331                     pat = folded;
4332                     pat_end = folded + 2;
4333                 }
4334             }
4335             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4336                 U8 *s = pat;
4337                 U8 *d = folded;
4338                 int i;
4339
4340                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4341                     if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
4342                         *(d++) = (U8) toFOLD_LC(*s);
4343                         s++;
4344                     }
4345                     else {
4346                         STRLEN len;
4347                         _toFOLD_utf8_flags(s,
4348                                            pat_end,
4349                                            d,
4350                                            &len,
4351                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4352                         d += len;
4353                         s += UTF8SKIP(s);
4354                     }
4355                 }
4356
4357                 pat = folded;
4358                 pat_end = d;
4359             }
4360         }
4361
4362         if (    ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4363              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4364         {
4365             /* Multi-character folds require more context to sort out.  Also
4366              * PL_utf8_foldclosures used below doesn't handle them, so have to
4367              * be handled outside this routine */
4368             use_chrtest_void = TRUE;
4369         }
4370         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4371             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4372
4373             if (   UNLIKELY(PL_in_utf8_turkic_locale)
4374                 && OP(text_node) == EXACTFL
4375                 && UNLIKELY(   c1 == 'i' || c1 == 'I'
4376                             || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
4377                             || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
4378             {   /* Hard-coded Turkish locale rules for these 4 characters
4379                    override normal rules */
4380                 if (c1 == 'i') {
4381                     c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4382                 }
4383                 else if (c1 == 'I') {
4384                     c2 = LATIN_SMALL_LETTER_DOTLESS_I;
4385                 }
4386                 else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
4387                     c2 = 'i';
4388                 }
4389                 else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
4390                     c2 = 'I';
4391                 }
4392             }
4393             else if (c1 > 255) {
4394                 const U32 * remaining_folds;
4395                 U32 first_fold;
4396
4397                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4398                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4399                 Size_t folds_count = _inverse_folds(c1, &first_fold,
4400                                                        &remaining_folds);
4401                 if (folds_count == 0) {
4402                     c2 = c1;    /* there is only a single character that could
4403                                    match */
4404                 }
4405                 else if (folds_count != 1) {
4406                     /* If there aren't exactly two folds to this (itself and
4407                      * another), it is outside the scope of this function */
4408                     use_chrtest_void = TRUE;
4409                 }
4410                 else {  /* There are two.  We already have one, get the other */
4411                     c2 = first_fold;
4412
4413                     /* Folds that cross the 255/256 boundary are forbidden if
4414                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4415                      * ASCIII.  The only other match to c1 is c2, and since c1
4416                      * is above 255, c2 better be as well under these
4417                      * circumstances.  If it isn't, it means the only legal
4418                      * match of c1 is itself. */
4419                     if (    c2 < 256
4420                         && (   (   OP(text_node) == EXACTFL
4421                                 && ! IN_UTF8_CTYPE_LOCALE)
4422                             || ((     OP(text_node) == EXACTFAA
4423                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4424                                 && (isASCII(c1) || isASCII(c2)))))
4425                     {
4426                         c2 = c1;
4427                     }
4428                 }
4429             }
4430             else /* Here, c1 is <= 255 */
4431                 if (   utf8_target
4432                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4433                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4434                     && (   (   OP(text_node) != EXACTFAA
4435                             && OP(text_node) != EXACTFAA_NO_TRIE)
4436                         ||   ! isASCII(c1)))
4437             {
4438                 /* Here, there could be something above Latin1 in the target
4439                  * which folds to this character in the pattern.  All such
4440                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4441                  * than two characters involved in their folds, so are outside
4442                  * the scope of this function */
4443                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4444                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4445                 }
4446                 else {
4447                     use_chrtest_void = TRUE;
4448                 }
4449             }
4450             else { /* Here nothing above Latin1 can fold to the pattern
4451                       character */
4452                 switch (OP(text_node)) {
4453
4454                     case EXACTFL:   /* /l rules */
4455                         c2 = PL_fold_locale[c1];
4456                         break;
4457
4458                     case EXACTF:   /* This node only generated for non-utf8
4459                                     patterns */
4460                         assert(! is_utf8_pat);
4461                         if (! utf8_target) {    /* /d rules */
4462                             c2 = PL_fold[c1];
4463                             break;
4464                         }
4465                         /* FALLTHROUGH */
4466                         /* /u rules for all these.  This happens to work for
4467                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4468                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4469                                                 non-utf8 patterns */
4470                         assert(! is_utf8_pat);
4471                         /* FALLTHROUGH */
4472                     case EXACTFAA:
4473                     case EXACTFUP:
4474                     case EXACTFU:
4475                         c2 = PL_fold_latin1[c1];
4476                         break;
4477                     case EXACTFU_REQ8:
4478                         return FALSE;
4479                         NOT_REACHED; /* NOTREACHED */
4480
4481                     default:
4482                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4483                         NOT_REACHED; /* NOTREACHED */
4484                 }
4485             }
4486         }
4487     }
4488
4489     /* Here have figured things out.  Set up the returns */
4490     if (use_chrtest_void) {
4491         *c2p = *c1p = CHRTEST_VOID;
4492     }
4493     else if (utf8_target) {
4494         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4495             uvchr_to_utf8(c1_utf8, c1);
4496             uvchr_to_utf8(c2_utf8, c2);
4497         }
4498
4499         /* Invariants are stored in both the utf8 and byte outputs; Use
4500          * negative numbers otherwise for the byte ones.  Make sure that the
4501          * byte ones are the same iff the utf8 ones are the same */
4502         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4503         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4504                 ? *c2_utf8
4505                 : (c1 == c2)
4506                   ? CHRTEST_NOT_A_CP_1
4507                   : CHRTEST_NOT_A_CP_2;
4508     }
4509     else if (c1 > 255) {
4510        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4511                            can represent */
4512            return FALSE;
4513        }
4514
4515        *c1p = *c2p = c2;    /* c2 is the only representable value */
4516     }
4517     else {  /* c1 is representable; see about c2 */
4518        *c1p = c1;
4519        *c2p = (c2 < 256) ? c2 : c1;
4520     }
4521
4522     return TRUE;
4523 }
4524
4525 STATIC bool
4526 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4527 {
4528     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4529      * between the inputs.  See https://www.unicode.org/reports/tr29/. */
4530
4531     PERL_ARGS_ASSERT_ISGCB;
4532
4533     switch (GCB_table[before][after]) {
4534         case GCB_BREAKABLE:
4535             return TRUE;
4536
4537         case GCB_NOBREAK:
4538             return FALSE;
4539
4540         case GCB_RI_then_RI:
4541             {
4542                 int RI_count = 1;
4543                 U8 * temp_pos = (U8 *) curpos;
4544
4545                 /* Do not break within emoji flag sequences. That is, do not
4546                  * break between regional indicator (RI) symbols if there is an
4547                  * odd number of RI characters before the break point.
4548                  *  GB12   sot (RI RI)* RI × RI
4549                  *  GB13 [^RI] (RI RI)* RI × RI */
4550
4551                 while (backup_one_GCB(strbeg,
4552                                     &temp_pos,
4553                                     utf8_target) == GCB_Regional_Indicator)
4554                 {
4555                     RI_count++;
4556                 }
4557
4558                 return RI_count % 2 != 1;
4559             }
4560
4561         case GCB_EX_then_EM:
4562
4563             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4564             {
4565                 U8 * temp_pos = (U8 *) curpos;
4566                 GCB_enum prev;
4567
4568                 do {
4569                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4570                 }
4571                 while (prev == GCB_Extend);
4572
4573                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4574             }
4575
4576         case GCB_Maybe_Emoji_NonBreak:
4577
4578             {
4579
4580             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4581               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4582               */
4583                 U8 * temp_pos = (U8 *) curpos;
4584                 GCB_enum prev;
4585
4586                 do {
4587                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4588                 }
4589                 while (prev == GCB_Extend);
4590
4591                 return prev != GCB_ExtPict_XX;
4592             }
4593
4594         default:
4595             break;
4596     }
4597
4598 #ifdef DEBUGGING
4599     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4600                                   before, after, GCB_table[before][after]);
4601     assert(0);
4602 #endif
4603     return TRUE;
4604 }
4605
4606 STATIC GCB_enum
4607 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4608 {
4609     dVAR;
4610     GCB_enum gcb;
4611
4612     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4613
4614     if (*curpos < strbeg) {
4615         return GCB_EDGE;
4616     }
4617
4618     if (utf8_target) {
4619         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4620         U8 * prev_prev_char_pos;
4621
4622         if (! prev_char_pos) {
4623             return GCB_EDGE;
4624         }
4625
4626         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4627             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4628             *curpos = prev_char_pos;
4629             prev_char_pos = prev_prev_char_pos;
4630         }
4631         else {
4632             *curpos = (U8 *) strbeg;
4633             return GCB_EDGE;
4634         }
4635     }
4636     else {
4637         if (*curpos - 2 < strbeg) {
4638             *curpos = (U8 *) strbeg;
4639             return GCB_EDGE;
4640         }
4641         (*curpos)--;
4642         gcb = getGCB_VAL_CP(*(*curpos - 1));
4643     }
4644
4645     return gcb;
4646 }
4647
4648 /* Combining marks attach to most classes that precede them, but this defines
4649  * the exceptions (from TR14) */
4650 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4651                                      || prev == LB_Mandatory_Break      \
4652                                      || prev == LB_Carriage_Return      \
4653                                      || prev == LB_Line_Feed            \
4654                                      || prev == LB_Next_Line            \
4655                                      || prev == LB_Space                \
4656                                      || prev == LB_ZWSpace))
4657
4658 STATIC bool
4659 S_isLB(pTHX_ LB_enum before,
4660              LB_enum after,
4661              const U8 * const strbeg,
4662              const U8 * const curpos,
4663              const U8 * const strend,
4664              const bool utf8_target)
4665 {
4666     U8 * temp_pos = (U8 *) curpos;
4667     LB_enum prev = before;
4668
4669     /* Is the boundary between 'before' and 'after' line-breakable?
4670      * Most of this is just a table lookup of a generated table from Unicode
4671      * rules.  But some rules require context to decide, and so have to be
4672      * implemented in code */
4673
4674     PERL_ARGS_ASSERT_ISLB;
4675
4676     /* Rule numbers in the comments below are as of Unicode 9.0 */
4677
4678   redo:
4679     before = prev;
4680     switch (LB_table[before][after]) {
4681         case LB_BREAKABLE:
4682             return TRUE;
4683
4684         case LB_NOBREAK:
4685         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4686             return FALSE;
4687
4688         case LB_SP_foo + LB_BREAKABLE:
4689         case LB_SP_foo + LB_NOBREAK:
4690         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4691
4692             /* When we have something following a SP, we have to look at the
4693              * context in order to know what to do.
4694              *
4695              * SP SP should not reach here because LB7: Do not break before
4696              * spaces.  (For two spaces in a row there is nothing that
4697              * overrides that) */
4698             assert(after != LB_Space);
4699
4700             /* Here we have a space followed by a non-space.  Mostly this is a
4701              * case of LB18: "Break after spaces".  But there are complications
4702              * as the handling of spaces is somewhat tricky.  They are in a
4703              * number of rules, which have to be applied in priority order, but
4704              * something earlier in the string can cause a rule to be skipped
4705              * and a lower priority rule invoked.  A prime example is LB7 which
4706              * says don't break before a space.  But rule LB8 (lower priority)
4707              * says that the first break opportunity after a ZW is after any
4708              * span of spaces immediately after it.  If a ZW comes before a SP
4709              * in the input, rule LB8 applies, and not LB7.  Other such rules
4710              * involve combining marks which are rules 9 and 10, but they may
4711              * override higher priority rules if they come earlier in the
4712              * string.  Since we're doing random access into the middle of the
4713              * string, we have to look for rules that should get applied based
4714              * on both string position and priority.  Combining marks do not
4715              * attach to either ZW nor SP, so we don't have to consider them
4716              * until later.
4717              *
4718              * To check for LB8, we have to find the first non-space character
4719              * before this span of spaces */
4720             do {
4721                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4722             }
4723             while (prev == LB_Space);
4724
4725             /* LB8 Break before any character following a zero-width space,
4726              * even if one or more spaces intervene.
4727              *      ZW SP* ÷
4728              * So if we have a ZW just before this span, and to get here this
4729              * is the final space in the span. */
4730             if (prev == LB_ZWSpace) {
4731                 return TRUE;
4732             }
4733
4734             /* Here, not ZW SP+.  There are several rules that have higher
4735              * priority than LB18 and can be resolved now, as they don't depend
4736              * on anything earlier in the string (except ZW, which we have
4737              * already handled).  One of these rules is LB11 Do not break
4738              * before Word joiner, but we have specially encoded that in the
4739              * lookup table so it is caught by the single test below which
4740              * catches the other ones. */
4741             if (LB_table[LB_Space][after] - LB_SP_foo
4742                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4743             {
4744                 return FALSE;
4745             }
4746
4747             /* If we get here, we have to XXX consider combining marks. */
4748             if (prev == LB_Combining_Mark) {
4749
4750                 /* What happens with these depends on the character they
4751                  * follow.  */
4752                 do {
4753                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4754                 }
4755                 while (prev == LB_Combining_Mark);
4756
4757                 /* Most times these attach to and inherit the characteristics
4758                  * of that character, but not always, and when not, they are to
4759                  * be treated as AL by rule LB10. */
4760                 if (! LB_CM_ATTACHES_TO(prev)) {
4761                     prev = LB_Alphabetic;
4762                 }
4763             }
4764
4765             /* Here, we have the character preceding the span of spaces all set
4766              * up.  We follow LB18: "Break after spaces" unless the table shows
4767              * that is overriden */
4768             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4769
4770         case LB_CM_ZWJ_foo:
4771
4772             /* We don't know how to treat the CM except by looking at the first
4773              * non-CM character preceding it.  ZWJ is treated as CM */
4774             do {
4775                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4776             }
4777             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4778
4779             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4780              * attatches to it, then it inherits the behavior of 'prev'.  If it
4781              * doesn't attach, it is to be treated as an AL */
4782             if (! LB_CM_ATTACHES_TO(prev)) {
4783                 prev = LB_Alphabetic;
4784             }
4785
4786             goto redo;
4787
4788         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4789         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4790
4791             /* LB21a Don't break after Hebrew + Hyphen.
4792              * HL (HY | BA) × */
4793
4794             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4795                                                           == LB_Hebrew_Letter)
4796             {
4797                 return FALSE;
4798             }
4799
4800             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4801
4802         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4803         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4804
4805             /* LB25a (PR | PO) × ( OP | HY )? NU */
4806             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4807                 return FALSE;
4808             }
4809
4810             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4811                                                                 == LB_BREAKABLE;
4812
4813         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4814         case LB_SY_or_IS_then_various + LB_NOBREAK:
4815         {
4816             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4817
4818             LB_enum temp = prev;
4819             do {
4820                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4821             }
4822             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4823             if (temp == LB_Numeric) {
4824                 return FALSE;
4825             }
4826
4827             return LB_table[prev][after] - LB_SY_or_IS_then_various
4828                                                                == LB_BREAKABLE;
4829         }
4830
4831         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4832         case LB_various_then_PO_or_PR + LB_NOBREAK:
4833         {
4834             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4835
4836             LB_enum temp = prev;
4837             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4838             {
4839                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4840             }
4841             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4842                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4843             }
4844             if (temp == LB_Numeric) {
4845                 return FALSE;
4846             }
4847             return LB_various_then_PO_or_PR;
4848         }
4849
4850         case LB_RI_then_RI + LB_NOBREAK:
4851         case LB_RI_then_RI + LB_BREAKABLE:
4852             {
4853                 int RI_count = 1;
4854
4855                 /* LB30a Break between two regional indicator symbols if and
4856                  * only if there are an even number of regional indicators
4857                  * preceding the position of the break.
4858                  *
4859                  *    sot (RI RI)* RI × RI
4860                  *  [^RI] (RI RI)* RI × RI */
4861
4862                 while (backup_one_LB(strbeg,
4863                                      &temp_pos,
4864                                      utf8_target) == LB_Regional_Indicator)
4865                 {
4866                     RI_count++;
4867                 }
4868
4869                 return RI_count % 2 == 0;
4870             }
4871
4872         default:
4873             break;
4874     }
4875
4876 #ifdef DEBUGGING
4877     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4878                                   before, after, LB_table[before][after]);
4879     assert(0);
4880 #endif
4881     return TRUE;
4882 }
4883
4884 STATIC LB_enum
4885 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4886 {
4887     dVAR;
4888
4889     LB_enum lb;
4890
4891     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4892
4893     if (*curpos >= strend) {
4894         return LB_EDGE;
4895     }
4896
4897     if (utf8_target) {
4898         *curpos += UTF8SKIP(*curpos);
4899         if (*curpos >= strend) {
4900             return LB_EDGE;
4901         }
4902         lb = getLB_VAL_UTF8(*curpos, strend);
4903     }
4904     else {
4905         (*curpos)++;
4906         if (*curpos >= strend) {
4907             return LB_EDGE;
4908         }
4909         lb = getLB_VAL_CP(**curpos);
4910     }
4911
4912     return lb;
4913 }
4914
4915 STATIC LB_enum
4916 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4917 {
4918     dVAR;
4919     LB_enum lb;
4920
4921     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4922
4923     if (*curpos < strbeg) {
4924         return LB_EDGE;
4925     }
4926
4927     if (utf8_target) {
4928         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4929         U8 * prev_prev_char_pos;
4930
4931         if (! prev_char_pos) {
4932             return LB_EDGE;
4933         }
4934
4935         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4936             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4937             *curpos = prev_char_pos;
4938             prev_char_pos = prev_prev_char_pos;
4939         }
4940         else {
4941             *curpos = (U8 *) strbeg;
4942             return LB_EDGE;
4943         }
4944     }
4945     else {
4946         if (*curpos - 2 < strbeg) {
4947             *curpos = (U8 *) strbeg;
4948             return LB_EDGE;
4949         }
4950         (*curpos)--;
4951         lb = getLB_VAL_CP(*(*curpos - 1));
4952     }
4953
4954     return lb;
4955 }
4956
4957 STATIC bool
4958 S_isSB(pTHX_ SB_enum before,
4959              SB_enum after,
4960              const U8 * const strbeg,
4961              const U8 * const curpos,
4962              const U8 * const strend,
4963              const bool utf8_target)
4964 {
4965     /* returns a boolean indicating if there is a Sentence Boundary Break
4966      * between the inputs.  See https://www.unicode.org/reports/tr29/ */
4967
4968     U8 * lpos = (U8 *) curpos;
4969     bool has_para_sep = FALSE;
4970     bool has_sp = FALSE;
4971
4972     PERL_ARGS_ASSERT_ISSB;
4973
4974     /* Break at the start and end of text.
4975         SB1.  sot  ÷
4976         SB2.  ÷  eot
4977       But unstated in Unicode is don't break if the text is empty */
4978     if (before == SB_EDGE || after == SB_EDGE) {
4979         return before != after;
4980     }
4981
4982     /* SB 3: Do not break within CRLF. */
4983     if (before == SB_CR && after == SB_LF) {
4984         return FALSE;
4985     }
4986
4987     /* Break after paragraph separators.  CR and LF are considered
4988      * so because Unicode views text as like word processing text where there
4989      * are no newlines except between paragraphs, and the word processor takes
4990      * care of wrapping without there being hard line-breaks in the text *./
4991        SB4.  Sep | CR | LF  ÷ */
4992     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4993         return TRUE;
4994     }
4995
4996     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4997      * (See Section 6.2, Replacing Ignore Rules.)
4998         SB5.  X (Extend | Format)*  →  X */
4999     if (after == SB_Extend || after == SB_Format) {
5000
5001         /* Implied is that the these characters attach to everything
5002          * immediately prior to them except for those separator-type
5003          * characters.  And the rules earlier have already handled the case
5004          * when one of those immediately precedes the extend char */
5005         return FALSE;
5006     }
5007
5008     if (before == SB_Extend || before == SB_Format) {
5009         U8 * temp_pos = lpos;
5010         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5011         if (   backup != SB_EDGE
5012             && backup != SB_Sep
5013             && backup != SB_CR
5014             && backup != SB_LF)
5015         {
5016             before = backup;
5017             lpos = temp_pos;
5018         }
5019
5020         /* Here, both 'before' and 'backup' are these types; implied is that we
5021          * don't break between them */
5022         if (backup == SB_Extend || backup == SB_Format) {
5023             return FALSE;
5024         }
5025     }
5026
5027     /* Do not break after ambiguous terminators like period, if they are
5028      * immediately followed by a number or lowercase letter, if they are
5029      * between uppercase letters, if the first following letter (optionally
5030      * after certain punctuation) is lowercase, or if they are followed by
5031      * "continuation" punctuation such as comma, colon, or semicolon. For
5032      * example, a period may be an abbreviation or numeric period, and thus may
5033      * not mark the end of a sentence.
5034
5035      * SB6. ATerm  ×  Numeric */
5036     if (before == SB_ATerm && after == SB_Numeric) {
5037         return FALSE;
5038     }
5039
5040     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5041     if (before == SB_ATerm && after == SB_Upper) {
5042         U8 * temp_pos = lpos;
5043         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5044         if (backup == SB_Upper || backup == SB_Lower) {
5045             return FALSE;
5046         }
5047     }
5048
5049     /* The remaining rules that aren't the final one, all require an STerm or
5050      * an ATerm after having backed up over some Close* Sp*, and in one case an
5051      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5052      * So do that backup now, setting flags if either Sp or a paragraph
5053      * separator are found */
5054
5055     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5056         has_para_sep = TRUE;
5057         before = backup_one_SB(strbeg, &lpos, utf8_target);
5058     }
5059
5060     if (before == SB_Sp) {
5061         has_sp = TRUE;
5062         do {
5063             before = backup_one_SB(strbeg, &lpos, utf8_target);
5064         }
5065         while (before == SB_Sp);
5066     }
5067
5068     while (before == SB_Close) {
5069         before = backup_one_SB(strbeg, &lpos, utf8_target);
5070     }
5071
5072     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5073      * most cases an STerm */
5074     if (before == SB_STerm || before == SB_ATerm) {
5075
5076         /* So, here the lhs matches
5077          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5078          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5079          * The rules that apply here are:
5080          *
5081          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5082                                            | LF | STerm | ATerm) )* Lower
5083            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5084            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5085            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5086            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5087          */
5088
5089         /* And all but SB11 forbid having seen a paragraph separator */
5090         if (! has_para_sep) {
5091             if (before == SB_ATerm) {          /* SB8 */
5092                 U8 * rpos = (U8 *) curpos;
5093                 SB_enum later = after;
5094
5095                 while (    later != SB_OLetter
5096                         && later != SB_Upper
5097                         && later != SB_Lower
5098                         && later != SB_Sep
5099                         && later != SB_CR
5100                         && later != SB_LF
5101                         && later != SB_STerm
5102                         && later != SB_ATerm
5103                         && later != SB_EDGE)
5104                 {
5105                     later = advance_one_SB(&rpos, strend, utf8_target);
5106                 }
5107                 if (later == SB_Lower) {
5108                     return FALSE;
5109                 }
5110             }
5111
5112             if (   after == SB_SContinue    /* SB8a */
5113                 || after == SB_STerm
5114                 || after == SB_ATerm)
5115             {
5116                 return FALSE;
5117             }
5118
5119             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5120                 if (   after == SB_Close
5121                     || after == SB_Sp
5122                     || after == SB_Sep
5123                     || after == SB_CR
5124                     || after == SB_LF)
5125                 {
5126                     return FALSE;
5127                 }
5128             }
5129
5130             /* SB10.  This and SB9 could probably be combined some way, but khw
5131              * has decided to follow the Unicode rule book precisely for
5132              * simplified maintenance */
5133             if (   after == SB_Sp
5134                 || after == SB_Sep
5135                 || after == SB_CR
5136                 || after == SB_LF)
5137             {
5138                 return FALSE;
5139             }
5140         }
5141
5142         /* SB11.  */
5143         return TRUE;
5144     }
5145
5146     /* Otherwise, do not break.
5147     SB12.  Any  ×  Any */
5148
5149     return FALSE;
5150 }
5151
5152 STATIC SB_enum
5153 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5154 {
5155     dVAR;
5156     SB_enum sb;
5157
5158     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5159
5160     if (*curpos >= strend) {
5161         return SB_EDGE;
5162     }
5163
5164     if (utf8_target) {
5165         do {
5166             *curpos += UTF8SKIP(*curpos);
5167             if (*curpos >= strend) {
5168                 return SB_EDGE;
5169             }
5170             sb = getSB_VAL_UTF8(*curpos, strend);
5171         } while (sb == SB_Extend || sb == SB_Format);
5172     }
5173     else {
5174         do {
5175             (*curpos)++;
5176             if (*curpos >= strend) {
5177                 return SB_EDGE;
5178             }
5179             sb = getSB_VAL_CP(**curpos);
5180         } while (sb == SB_Extend || sb == SB_Format);
5181     }
5182
5183     return sb;
5184 }
5185
5186 STATIC SB_enum
5187 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5188 {
5189     dVAR;
5190     SB_enum sb;
5191
5192     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5193
5194     if (*curpos < strbeg) {
5195         return SB_EDGE;
5196     }
5197
5198     if (utf8_target) {
5199         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5200         if (! prev_char_pos) {
5201             return SB_EDGE;
5202         }
5203
5204         /* Back up over Extend and Format.  curpos is always just to the right
5205          * of the characater whose value we are getting */
5206         do {
5207             U8 * prev_prev_char_pos;
5208             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5209                                                                       strbeg)))
5210             {
5211                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5212                 *curpos = prev_char_pos;
5213                 prev_char_pos = prev_prev_char_pos;
5214             }
5215             else {
5216                 *curpos = (U8 *) strbeg;
5217                 return SB_EDGE;
5218             }
5219         } while (sb == SB_Extend || sb == SB_Format);
5220     }
5221     else {
5222         do {
5223             if (*curpos - 2 < strbeg) {
5224                 *curpos = (U8 *) strbeg;
5225                 return SB_EDGE;
5226             }
5227             (*curpos)--;
5228             sb = getSB_VAL_CP(*(*curpos - 1));
5229         } while (sb == SB_Extend || sb == SB_Format);
5230     }
5231
5232     return sb;
5233 }
5234
5235 STATIC bool
5236 S_isWB(pTHX_ WB_enum previous,
5237              WB_enum before,
5238              WB_enum after,
5239              const U8 * const strbeg,
5240              const U8 * const curpos,
5241              const U8 * const strend,
5242              const bool utf8_target)
5243 {
5244     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5245      *  a Unicode word break, using their published algorithm, but tailored for
5246      *  Perl by treating spans of white space as one unit.  Context may be
5247      *  needed to make this determination.  If the value for the character
5248      *  before 'before' is known, it is passed as 'previous'; otherwise that
5249      *  should be set to WB_UNKNOWN.  The other input parameters give the
5250      *  boundaries and current position in the matching of the string.  That
5251      *  is, 'curpos' marks the position where the character whose wb value is
5252      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5253
5254     U8 * before_pos = (U8 *) curpos;
5255     U8 * after_pos = (U8 *) curpos;
5256     WB_enum prev = before;
5257     WB_enum next;
5258
5259     PERL_ARGS_ASSERT_ISWB;
5260
5261     /* Rule numbers in the comments below are as of Unicode 9.0 */
5262
5263   redo:
5264     before = prev;
5265     switch (WB_table[before][after]) {
5266         case WB_BREAKABLE:
5267             return TRUE;
5268
5269         case WB_NOBREAK:
5270             return FALSE;
5271
5272         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5273             next = advance_one_WB(&after_pos, strend, utf8_target,
5274                                  FALSE /* Don't skip Extend nor Format */ );
5275             /* A space immediately preceeding an Extend or Format is attached
5276              * to by them, and hence gets separated from previous spaces.
5277              * Otherwise don't break between horizontal white space */
5278             return next == WB_Extend || next == WB_Format;
5279
5280         /* WB4 Ignore Format and Extend characters, except when they appear at
5281          * the beginning of a region of text.  This code currently isn't
5282          * general purpose, but it works as the rules are currently and likely
5283          * to be laid out.  The reason it works is that when 'they appear at
5284          * the beginning of a region of text', the rule is to break before
5285          * them, just like any other character.  Therefore, the default rule
5286          * applies and we don't have to look in more depth.  Should this ever
5287          * change, we would have to have 2 'case' statements, like in the rules
5288          * below, and backup a single character (not spacing over the extend
5289          * ones) and then see if that is one of the region-end characters and
5290          * go from there */
5291         case WB_Ex_or_FO_or_ZWJ_then_foo:
5292             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5293             goto redo;
5294
5295         case WB_DQ_then_HL + WB_BREAKABLE:
5296         case WB_DQ_then_HL + WB_NOBREAK:
5297
5298             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5299
5300             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5301                                                             == WB_Hebrew_Letter)
5302             {
5303                 return FALSE;
5304             }
5305
5306              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5307
5308         case WB_HL_then_DQ + WB_BREAKABLE:
5309         case WB_HL_then_DQ + WB_NOBREAK:
5310
5311             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5312
5313             if (advance_one_WB(&after_pos, strend, utf8_target,
5314                                        TRUE /* Do skip Extend and Format */ )
5315                                                             == WB_Hebrew_Letter)
5316             {
5317                 return FALSE;
5318             }
5319
5320             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5321
5322         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5323         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5324
5325             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5326              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5327
5328             next = advance_one_WB(&after_pos, strend, utf8_target,
5329                                        TRUE /* Do skip Extend and Format */ );
5330
5331             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5332             {
5333                 return FALSE;
5334             }
5335
5336             return WB_table[before][after]
5337                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5338
5339         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5340         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5341
5342             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5343              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5344
5345             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5346             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5347             {
5348                 return FALSE;
5349             }
5350
5351             return WB_table[before][after]
5352                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5353
5354         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5355         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5356
5357             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5358              * */
5359
5360             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5361                                                             == WB_Numeric)
5362             {
5363                 return FALSE;
5364             }
5365
5366             return WB_table[before][after]
5367                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5368
5369         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5370         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5371
5372             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5373
5374             if (advance_one_WB(&after_pos, strend, utf8_target,
5375                                        TRUE /* Do skip Extend and Format */ )
5376                                                             == WB_Numeric)
5377             {
5378                 return FALSE;
5379             }
5380
5381             return WB_table[before][after]
5382                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5383
5384         case WB_RI_then_RI + WB_NOBREAK:
5385         case WB_RI_then_RI + WB_BREAKABLE:
5386             {
5387                 int RI_count = 1;
5388
5389                 /* Do not break within emoji flag sequences. That is, do not
5390                  * break between regional indicator (RI) symbols if there is an
5391                  * odd number of RI characters before the potential break
5392                  * point.
5393                  *
5394                  * WB15   sot (RI RI)* RI × RI
5395                  * WB16 [^RI] (RI RI)* RI × RI */
5396
5397                 while (backup_one_WB(&previous,
5398                                      strbeg,
5399                                      &before_pos,
5400                                      utf8_target) == WB_Regional_Indicator)
5401                 {
5402                     RI_count++;
5403                 }
5404
5405                 return RI_count % 2 != 1;
5406             }
5407
5408         default:
5409             break;
5410     }
5411
5412 #ifdef DEBUGGING
5413     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5414                                   before, after, WB_table[before][after]);
5415     assert(0);
5416 #endif
5417     return TRUE;
5418 }
5419
5420 STATIC WB_enum
5421 S_advance_one_WB(pTHX_ U8 ** curpos,
5422                        const U8 * const strend,
5423                        const bool utf8_target,
5424                        const bool skip_Extend_Format)
5425 {
5426     dVAR;
5427     WB_enum wb;
5428
5429     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5430
5431     if (*curpos >= strend) {
5432         return WB_EDGE;
5433     }
5434
5435     if (utf8_target) {
5436
5437         /* Advance over Extend and Format */
5438         do {
5439             *curpos += UTF8SKIP(*curpos);
5440             if (*curpos >= strend) {
5441                 return WB_EDGE;
5442             }
5443             wb = getWB_VAL_UTF8(*curpos, strend);
5444         } while (    skip_Extend_Format
5445                  && (wb == WB_Extend || wb == WB_Format));
5446     }
5447     else {
5448         do {
5449             (*curpos)++;
5450             if (*curpos >= strend) {
5451                 return WB_EDGE;
5452             }
5453             wb = getWB_VAL_CP(**curpos);
5454         } while (    skip_Extend_Format
5455                  && (wb == WB_Extend || wb == WB_Format));
5456     }
5457
5458     return wb;
5459 }
5460
5461 STATIC WB_enum
5462 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5463 {
5464     dVAR;
5465     WB_enum wb;
5466
5467     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5468
5469     /* If we know what the previous character's break value is, don't have
5470         * to look it up */
5471     if (*previous != WB_UNKNOWN) {
5472         wb = *previous;
5473
5474         /* But we need to move backwards by one */
5475         if (utf8_target) {
5476             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5477             if (! *curpos) {
5478                 *previous = WB_EDGE;
5479                 *curpos = (U8 *) strbeg;
5480             }
5481             else {
5482                 *previous = WB_UNKNOWN;
5483             }
5484         }
5485         else {
5486             (*curpos)--;
5487             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5488         }
5489
5490         /* And we always back up over these three types */
5491         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5492             return wb;
5493         }
5494     }
5495
5496     if (*curpos < strbeg) {
5497         return WB_EDGE;
5498     }
5499
5500     if (utf8_target) {
5501         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5502         if (! prev_char_pos) {
5503             return WB_EDGE;
5504         }
5505
5506         /* Back up over Extend and Format.  curpos is always just to the right
5507          * of the characater whose value we are getting */
5508         do {
5509             U8 * prev_prev_char_pos;
5510             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5511                                                    -1,
5512                                                    strbeg)))
5513             {
5514                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5515                 *curpos = prev_char_pos;
5516                 prev_char_pos = prev_prev_char_pos;
5517             }
5518             else {
5519                 *curpos = (U8 *) strbeg;
5520                 return WB_EDGE;
5521             }
5522         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5523     }
5524     else {
5525         do {
5526             if (*curpos - 2 < strbeg) {
5527                 *curpos = (U8 *) strbeg;
5528                 return WB_EDGE;
5529             }
5530             (*curpos)--;
5531             wb = getWB_VAL_CP(*(*curpos - 1));
5532         } while (wb == WB_Extend || wb == WB_Format);
5533     }
5534
5535     return wb;
5536 }
5537
5538 /* Macros for regmatch(), using its internal variables */
5539 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
5540 #define NEXTCHR_IS_EOS (nextchr < 0)
5541
5542 #define SET_nextchr \
5543     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
5544
5545 #define SET_locinput(p) \
5546     locinput = (p);  \
5547     SET_nextchr
5548
5549 #define sayYES goto yes
5550 #define sayNO goto no
5551 #define sayNO_SILENT goto no_silent
5552
5553 /* we dont use STMT_START/END here because it leads to
5554    "unreachable code" warnings, which are bogus, but distracting. */
5555 #define CACHEsayNO \
5556     if (ST.cache_mask) \
5557        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
5558     sayNO
5559
5560 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5561 (                                                           \
5562     (   ( st )                                         ) && \
5563     (   ( st )->u.eval.close_paren                     ) && \
5564     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5565 )
5566
5567 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5568 (                                                           \
5569     (   ( st )                                         ) && \
5570     (   ( st )->u.eval.close_paren                     ) && \
5571     (   ( expr )                                       ) && \
5572     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5573 )
5574
5575
5576 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5577     (st)->u.eval.close_paren = ( (expr) + 1 )
5578
5579 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5580     (st)->u.eval.close_paren = 0
5581
5582 /* push a new state then goto it */
5583
5584 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
5585     pushinput = input; \
5586     pusheol = eol; \
5587     pushsr0 = sr0; \
5588     scan = node; \
5589     st->resume_state = state; \
5590     goto push_state;
5591
5592 /* push a new state with success backtracking, then goto it */
5593
5594 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
5595     pushinput = input; \
5596     pusheol = eol;     \
5597     pushsr0 = sr0; \
5598     scan = node; \
5599     st->resume_state = state; \
5600     goto push_yes_state;
5601
5602 #define DEBUG_STATE_pp(pp)                                  \
5603     DEBUG_STATE_r({                                         \
5604         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
5605         Perl_re_printf( aTHX_                               \
5606             "%*s" pp " %s%s%s%s%s\n",                       \
5607             INDENT_CHARS(depth), "",                        \
5608             PL_reg_name[st->resume_state],                  \
5609             ((st==yes_state||st==mark_state) ? "[" : ""),   \
5610             ((st==yes_state) ? "Y" : ""),                   \
5611             ((st==mark_state) ? "M" : ""),                  \
5612             ((st==yes_state||st==mark_state) ? "]" : "")    \
5613         );                                                  \
5614     });
5615
5616 /*
5617
5618 regmatch() - main matching routine
5619
5620 This is basically one big switch statement in a loop. We execute an op,
5621 set 'next' to point the next op, and continue. If we come to a point which
5622 we may need to backtrack to on failure such as (A|B|C), we push a
5623 backtrack state onto the backtrack stack. On failure, we pop the top
5624 state, and re-enter the loop at the state indicated. If there are no more
5625 states to pop, we return failure.
5626
5627 Sometimes we also need to backtrack on success; for example /A+/, where
5628 after successfully matching one A, we need to go back and try to
5629 match another one; similarly for lookahead assertions: if the assertion
5630 completes successfully, we backtrack to the state just before the assertion
5631 and then carry on.  In these cases, the pushed state is marked as
5632 'backtrack on success too'. This marking is in fact done by a chain of
5633 pointers, each pointing to the previous 'yes' state. On success, we pop to
5634 the nearest yes state, discarding any intermediate failure-only states.
5635 Sometimes a yes state is pushed just to force some cleanup code to be
5636 called at the end of a successful match or submatch; e.g. (??{$re}) uses
5637 it to free the inner regex.
5638
5639 Note that failure backtracking rewinds the cursor position, while
5640 success backtracking leaves it alone.
5641
5642 A pattern is complete when the END op is executed, while a subpattern
5643 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
5644 ops trigger the "pop to last yes state if any, otherwise return true"
5645 behaviour.
5646
5647 A common convention in this function is to use A and B to refer to the two
5648 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
5649 the subpattern to be matched possibly multiple times, while B is the entire
5650 rest of the pattern. Variable and state names reflect this convention.
5651
5652 The states in the main switch are the union of ops and failure/success of
5653 substates associated with with that op.  For example, IFMATCH is the op
5654 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
5655 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
5656 successfully matched A and IFMATCH_A_fail is a state saying that we have
5657 just failed to match A. Resume states always come in pairs. The backtrack
5658 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
5659 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
5660 on success or failure.
5661
5662 The struct that holds a backtracking state is actually a big union, with
5663 one variant for each major type of op. The variable st points to the
5664 top-most backtrack struct. To make the code clearer, within each
5665 block of code we #define ST to alias the relevant union.
5666
5667 Here's a concrete example of a (vastly oversimplified) IFMATCH
5668 implementation:
5669
5670     switch (state) {
5671     ....
5672
5673 #define ST st->u.ifmatch
5674
5675     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5676         ST.foo = ...; // some state we wish to save
5677         ...
5678         // push a yes backtrack state with a resume value of
5679         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
5680         // first node of A:
5681         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
5682         // NOTREACHED
5683
5684     case IFMATCH_A: // we have successfully executed A; now continue with B
5685         next = B;
5686         bar = ST.foo; // do something with the preserved value
5687         break;
5688
5689     case IFMATCH_A_fail: // A failed, so the assertion failed
5690         ...;   // do some housekeeping, then ...
5691         sayNO; // propagate the failure
5692
5693 #undef ST
5694
5695     ...
5696     }
5697
5698 For any old-timers reading this who are familiar with the old recursive
5699 approach, the code above is equivalent to:
5700
5701     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5702     {
5703         int foo = ...
5704         ...
5705         if (regmatch(A)) {
5706             next = B;
5707             bar = foo;
5708             break;
5709         }
5710         ...;   // do some housekeeping, then ...
5711         sayNO; // propagate the failure
5712     }
5713
5714 The topmost backtrack state, pointed to by st, is usually free. If you
5715 want to claim it, populate any ST.foo fields in it with values you wish to
5716 save, then do one of
5717
5718         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
5719         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
5720
5721 which sets that backtrack state's resume value to 'resume_state', pushes a
5722 new free entry to the top of the backtrack stack, then goes to 'node'.
5723 On backtracking, the free slot is popped, and the saved state becomes the
5724 new free state. An ST.foo field in this new top state can be temporarily
5725 accessed to retrieve values, but once the main loop is re-entered, it
5726 becomes available for reuse.
5727
5728 Note that the depth of the backtrack stack constantly increases during the
5729 left-to-right execution of the pattern, rather than going up and down with
5730 the pattern nesting. For example the stack is at its maximum at Z at the
5731 end of the pattern, rather than at X in the following:
5732
5733     /(((X)+)+)+....(Y)+....Z/
5734
5735 The only exceptions to this are lookahead/behind assertions and the cut,
5736 (?>A), which pop all the backtrack states associated with A before
5737 continuing.
5738
5739 Backtrack state structs are allocated in slabs of about 4K in size.
5740 PL_regmatch_state and st always point to the currently active state,
5741 and PL_regmatch_slab points to the slab currently containing
5742 PL_regmatch_state.  The first time regmatch() is called, the first slab is
5743 allocated, and is never freed until interpreter destruction. When the slab
5744 is full, a new one is allocated and chained to the end. At exit from
5745 regmatch(), slabs allocated since entry are freed.
5746
5747 In order to work with variable length lookbehinds, an upper limit is placed on
5748 lookbehinds which is set to where the match position is at the end of where the
5749 lookbehind would get to.  Nothing in the lookbehind should match above that,
5750 except we should be able to look beyond if for things like \b, which need the
5751 next character in the string to be able to determine if this is a boundary or
5752 not.  We also can't match the end of string/line unless we are also at the end
5753 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
5754 that match a width, we have to add a condition that they are within the legal
5755 bounds of our window into the string.
5756
5757 */
5758
5759 /* returns -1 on failure, $+[0] on success */
5760 STATIC SSize_t
5761 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5762 {
5763     dVAR;
5764     const bool utf8_target = reginfo->is_utf8_target;
5765     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5766     REGEXP *rex_sv = reginfo->prog;
5767     regexp *rex = ReANY(rex_sv);
5768     RXi_GET_DECL(rex,rexi);
5769     /* the current state. This is a cached copy of PL_regmatch_state */
5770     regmatch_state *st;
5771     /* cache heavy used fields of st in registers */
5772     regnode *scan;
5773     regnode *next;
5774     U32 n = 0;  /* general value; init to avoid compiler warning */
5775     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5776     SSize_t endref = 0; /* offset of end of backref when ln is start */
5777     char *locinput = startpos;
5778     char *loceol = reginfo->strend;
5779     char *pushinput; /* where to continue after a PUSH */
5780     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
5781     U8   *pushsr0;   /* save starting pos of script run */
5782     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5783
5784     bool result = 0;        /* return value of S_regmatch */
5785     U32 depth = 0;            /* depth of backtrack stack */
5786     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5787     const U32 max_nochange_depth =
5788         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5789         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5790     regmatch_state *yes_state = NULL; /* state to pop to on success of
5791                                                             subpattern */
5792     /* mark_state piggy backs on the yes_state logic so that when we unwind
5793        the stack on success we can update the mark_state as we go */
5794     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5795     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5796     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5797     U32 state_num;
5798     bool no_final = 0;      /* prevent failure from backtracking? */
5799     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5800     char *startpoint = locinput;
5801     SV *popmark = NULL;     /* are we looking for a mark? */
5802     SV *sv_commit = NULL;   /* last mark name seen in failure */
5803     SV *sv_yes_mark = NULL; /* last mark name we have seen
5804                                during a successful match */
5805     U32 lastopen = 0;       /* last open we saw */
5806     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5807     SV* const oreplsv = GvSVn(PL_replgv);
5808     /* these three flags are set by various ops to signal information to
5809      * the very next op. They have a useful lifetime of exactly one loop
5810      * iteration, and are not preserved or restored by state pushes/pops
5811      */
5812     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5813     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5814     int logical = 0;        /* the following EVAL is:
5815                                 0: (?{...})
5816                                 1: (?(?{...})X|Y)
5817                                 2: (??{...})
5818                                or the following IFMATCH/UNLESSM is:
5819                                 false: plain (?=foo)
5820                                 true:  used as a condition: (?(?=foo))
5821                             */
5822     PAD* last_pad = NULL;
5823     dMULTICALL;
5824     U8 gimme = G_SCALAR;
5825     CV *caller_cv = NULL;       /* who called us */
5826     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5827     U32 maxopenparen = 0;       /* max '(' index seen so far */
5828     int to_complement;  /* Invert the result? */
5829     _char_class_number classnum;
5830     bool is_utf8_pat = reginfo->is_utf8_pat;
5831     bool match = FALSE;
5832     I32 orig_savestack_ix = PL_savestack_ix;
5833     U8 * script_run_begin = NULL;
5834
5835 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5836 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5837 #  define SOLARIS_BAD_OPTIMIZER
5838     const U32 *pl_charclass_dup = PL_charclass;
5839 #  define PL_charclass pl_charclass_dup
5840 #endif
5841
5842 #ifdef DEBUGGING
5843     DECLARE_AND_GET_RE_DEBUG_FLAGS;
5844 #endif
5845
5846     /* protect against undef(*^R) */
5847     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5848
5849     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5850     multicall_oldcatch = 0;
5851     PERL_UNUSED_VAR(multicall_cop);
5852
5853     PERL_ARGS_ASSERT_REGMATCH;
5854
5855     st = PL_regmatch_state;
5856
5857     /* Note that nextchr is a byte even in UTF */
5858     SET_nextchr;
5859     scan = prog;
5860
5861     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5862             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5863             Perl_re_printf( aTHX_ "regmatch start\n" );
5864     }));
5865
5866     while (scan != NULL) {
5867         next = scan + NEXT_OFF(scan);
5868         if (next == scan)
5869             next = NULL;
5870         state_num = OP(scan);
5871
5872       reenter_switch:
5873         DEBUG_EXECUTE_r(
5874             if (state_num <= REGNODE_MAX) {
5875                 SV * const prop = sv_newmortal();
5876                 regnode *rnext = regnext(scan);
5877
5878                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5879                 regprop(rex, prop, scan, reginfo, NULL);
5880                 Perl_re_printf( aTHX_
5881                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5882                     INDENT_CHARS(depth), "",
5883                     (IV)(scan - rexi->program),
5884                     SvPVX_const(prop),
5885                     (PL_regkind[OP(scan)] == END || !rnext) ?
5886                         0 : (IV)(rnext - rexi->program));
5887             }
5888         );
5889
5890         to_complement = 0;
5891
5892         SET_nextchr;
5893         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5894
5895         switch (state_num) {
5896         case SBOL: /*  /^../ and /\A../  */
5897             if (locinput == reginfo->strbeg)
5898                 break;
5899             sayNO;
5900
5901         case MBOL: /*  /^../m  */
5902             if (locinput == reginfo->strbeg ||
5903                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5904             {
5905                 break;
5906             }
5907             sayNO;
5908
5909         case GPOS: /*  \G  */
5910             if (locinput == reginfo->ganch)
5911                 break;
5912             sayNO;
5913
5914         case KEEPS: /*   \K  */
5915             /* update the startpoint */
5916             st->u.keeper.val = rex->offs[0].start;
5917             rex->offs[0].start = locinput - reginfo->strbeg;
5918             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
5919                             script_run_begin);
5920             NOT_REACHED; /* NOTREACHED */
5921
5922         case KEEPS_next_fail:
5923             /* rollback the start point change */
5924             rex->offs[0].start = st->u.keeper.val;
5925             sayNO_SILENT;
5926             NOT_REACHED; /* NOTREACHED */
5927
5928         case MEOL: /* /..$/m  */
5929             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5930                 sayNO;
5931             break;
5932
5933         case SEOL: /* /..$/  */
5934             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5935                 sayNO;
5936             if (reginfo->strend - locinput > 1)
5937                 sayNO;
5938             break;
5939
5940         case EOS: /*  \z  */
5941             if (!NEXTCHR_IS_EOS)
5942                 sayNO;
5943             break;
5944
5945         case SANY: /*  /./s  */
5946             if (NEXTCHR_IS_EOS || locinput >= loceol)
5947                 sayNO;
5948             goto increment_locinput;
5949
5950         case REG_ANY: /*  /./  */
5951             if (   NEXTCHR_IS_EOS
5952                 || locinput >= loceol
5953                 || nextchr == '\n')
5954             {
5955                 sayNO;
5956             }
5957             goto increment_locinput;
5958
5959
5960 #undef  ST
5961 #define ST st->u.trie
5962         case TRIEC: /* (ab|cd) with known charclass */
5963             /* In this case the charclass data is available inline so
5964                we can fail fast without a lot of extra overhead.
5965              */
5966             if ( !   NEXTCHR_IS_EOS
5967                 &&   locinput < loceol
5968                 && ! ANYOF_BITMAP_TEST(scan, nextchr))
5969             {
5970                 DEBUG_EXECUTE_r(
5971                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5972                               depth, PL_colors[4], PL_colors[5])
5973                 );
5974                 sayNO_SILENT;
5975                 NOT_REACHED; /* NOTREACHED */
5976             }
5977             /* FALLTHROUGH */
5978         case TRIE:  /* (ab|cd)  */
5979             /* the basic plan of execution of the trie is:
5980              * At the beginning, run though all the states, and
5981              * find the longest-matching word. Also remember the position
5982              * of the shortest matching word. For example, this pattern:
5983              *    1  2 3 4    5
5984              *    ab|a|x|abcd|abc
5985              * when matched against the string "abcde", will generate
5986              * accept states for all words except 3, with the longest
5987              * matching word being 4, and the shortest being 2 (with
5988              * the position being after char 1 of the string).
5989              *
5990              * Then for each matching word, in word order (i.e. 1,2,4,5),
5991              * we run the remainder of the pattern; on each try setting
5992              * the current position to the character following the word,
5993              * returning to try the next word on failure.
5994              *
5995              * We avoid having to build a list of words at runtime by
5996              * using a compile-time structure, wordinfo[].prev, which
5997              * gives, for each word, the previous accepting word (if any).
5998              * In the case above it would contain the mappings 1->2, 2->0,
5999              * 3->0, 4->5, 5->1.  We can use this table to generate, from
6000              * the longest word (4 above), a list of all words, by
6001              * following the list of prev pointers; this gives us the
6002              * unordered list 4,5,1,2. Then given the current word we have
6003              * just tried, we can go through the list and find the
6004              * next-biggest word to try (so if we just failed on word 2,
6005              * the next in the list is 4).
6006              *
6007              * Since at runtime we don't record the matching position in
6008              * the string for each word, we have to work that out for
6009              * each word we're about to process. The wordinfo table holds
6010              * the character length of each word; given that we recorded
6011              * at the start: the position of the shortest word and its
6012              * length in chars, we just need to move the pointer the
6013              * difference between the two char lengths. Depending on
6014              * Unicode status and folding, that's cheap or expensive.
6015              *
6016              * This algorithm is optimised for the case where are only a
6017              * small number of accept states, i.e. 0,1, or maybe 2.
6018              * With lots of accepts states, and having to try all of them,
6019              * it becomes quadratic on number of accept states to find all
6020              * the next words.
6021              */
6022
6023             {
6024                 /* what type of TRIE am I? (utf8 makes this contextual) */
6025                 DECL_TRIE_TYPE(scan);
6026
6027                 /* what trie are we using right now */
6028                 reg_trie_data * const trie
6029                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6030                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6031                 U32 state = trie->startstate;
6032
6033                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6034                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6035                     if (utf8_target
6036                         && ! NEXTCHR_IS_EOS
6037                         && UTF8_IS_ABOVE_LATIN1(nextchr)
6038                         && scan->flags == EXACTL)
6039                     {
6040                         /* We only output for EXACTL, as we let the folder
6041                          * output this message for EXACTFLU8 to avoid
6042                          * duplication */
6043                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6044                                                                reginfo->strend);
6045                     }
6046                 }
6047                 if (   trie->bitmap
6048                     && (     NEXTCHR_IS_EOS
6049                         ||   locinput >= loceol
6050                         || ! TRIE_BITMAP_TEST(trie, nextchr)))
6051                 {
6052                     if (trie->states[ state ].wordnum) {
6053                          DEBUG_EXECUTE_r(
6054                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
6055                                           depth, PL_colors[4], PL_colors[5])
6056                         );
6057                         if (!trie->jump)
6058                             break;
6059                     } else {
6060                         DEBUG_EXECUTE_r(
6061                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6062                                           depth, PL_colors[4], PL_colors[5])
6063                         );
6064                         sayNO_SILENT;
6065                    }
6066                 }
6067
6068             {
6069                 U8 *uc = ( U8* )locinput;
6070
6071                 STRLEN len = 0;
6072                 STRLEN foldlen = 0;
6073                 U8 *uscan = (U8*)NULL;
6074                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6075                 U32 charcount = 0; /* how many input chars we have matched */
6076                 U32 accepted = 0; /* have we seen any accepting states? */
6077
6078                 ST.jump = trie->jump;
6079                 ST.me = scan;
6080                 ST.firstpos = NULL;
6081                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6082                 ST.nextword = 0;
6083
6084                 /* fully traverse the TRIE; note the position of the
6085                    shortest accept state and the wordnum of the longest
6086                    accept state */
6087
6088                 while ( state && uc <= (U8*)(loceol) ) {
6089                     U32 base = trie->states[ state ].trans.base;
6090                     UV uvc = 0;
6091                     U16 charid = 0;
6092                     U16 wordnum;
6093                     wordnum = trie->states[ state ].wordnum;
6094
6095                     if (wordnum) { /* it's an accept state */
6096                         if (!accepted) {
6097                             accepted = 1;
6098                             /* record first match position */
6099                             if (ST.longfold) {
6100                                 ST.firstpos = (U8*)locinput;
6101                                 ST.firstchars = 0;
6102                             }
6103                             else {
6104                                 ST.firstpos = uc;
6105                                 ST.firstchars = charcount;
6106                             }
6107                         }
6108                         if (!ST.nextword || wordnum < ST.nextword)
6109                             ST.nextword = wordnum;
6110                         ST.topword = wordnum;
6111                     }
6112
6113                     DEBUG_TRIE_EXECUTE_r({
6114                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6115                                 /* HERE */
6116                                 PerlIO_printf( Perl_debug_log,
6117                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6118                                     INDENT_CHARS(depth), "", PL_colors[4],
6119                                     (UV)state, (accepted ? 'Y' : 'N'));
6120                     });
6121
6122                     /* read a char and goto next state */
6123                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6124                         I32 offset;
6125                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6126                                              (U8 *) loceol, uscan,
6127                                              len, uvc, charid, foldlen,
6128                                              foldbuf, uniflags);
6129                         charcount++;
6130                         if (foldlen>0)
6131                             ST.longfold = TRUE;
6132                         if (charid &&
6133                              ( ((offset =
6134                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6135
6136                              && ((U32)offset < trie->lasttrans)
6137                              && trie->trans[offset].check == state)
6138                         {
6139                             state = trie->trans[offset].next;
6140                         }
6141                         else {
6142                             state = 0;
6143                         }
6144                         uc += len;
6145
6146                     }
6147                     else {
6148                         state = 0;
6149                     }
6150                     DEBUG_TRIE_EXECUTE_r(
6151                         Perl_re_printf( aTHX_
6152                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6153                             charid, uvc, (UV)state, PL_colors[5] );
6154                     );
6155                 }
6156                 if (!accepted)
6157                    sayNO;
6158
6159                 /* calculate total number of accept states */
6160                 {
6161                     U16 w = ST.topword;
6162                     accepted = 0;
6163                     while (w) {
6164                         w = trie->wordinfo[w].prev;
6165                         accepted++;
6166                     }
6167                     ST.accepted = accepted;
6168                 }
6169
6170                 DEBUG_EXECUTE_r(
6171                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6172                         depth,
6173                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6174                 );
6175                 goto trie_first_try; /* jump into the fail handler */
6176             }}
6177             NOT_REACHED; /* NOTREACHED */
6178
6179         case TRIE_next_fail: /* we failed - try next alternative */
6180         {
6181             U8 *uc;
6182             if ( ST.jump ) {
6183                 /* undo any captures done in the tail part of a branch,
6184                  * e.g.
6185                  *    /(?:X(.)(.)|Y(.)).../
6186                  * where the trie just matches X then calls out to do the
6187                  * rest of the branch */
6188                 REGCP_UNWIND(ST.cp);
6189                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6190             }
6191             if (!--ST.accepted) {
6192                 DEBUG_EXECUTE_r({
6193                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6194                         depth,
6195                         PL_colors[4],
6196                         PL_colors[5] );
6197                 });
6198                 sayNO_SILENT;
6199             }
6200             {
6201                 /* Find next-highest word to process.  Note that this code
6202                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6203                 U16 min = 0;
6204                 U16 word;
6205                 U16 const nextword = ST.nextword;
6206                 reg_trie_wordinfo * const wordinfo
6207                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6208                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6209                     if (word > nextword && (!min || word < min))
6210                         min = word;
6211                 }
6212                 ST.nextword = min;
6213             }
6214
6215           trie_first_try:
6216             if (do_cutgroup) {
6217                 do_cutgroup = 0;
6218                 no_final = 0;
6219             }
6220
6221             if ( ST.jump ) {
6222                 ST.lastparen = rex->lastparen;
6223                 ST.lastcloseparen = rex->lastcloseparen;
6224                 REGCP_SET(ST.cp);
6225             }
6226
6227             /* find start char of end of current word */
6228             {
6229                 U32 chars; /* how many chars to skip */
6230                 reg_trie_data * const trie
6231                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6232
6233                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6234                             >=  ST.firstchars);
6235                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6236                             - ST.firstchars;
6237                 uc = ST.firstpos;
6238
6239                 if (ST.longfold) {
6240                     /* the hard option - fold each char in turn and find
6241                      * its folded length (which may be different */
6242                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6243                     STRLEN foldlen;
6244                     STRLEN len;
6245                     UV uvc;
6246                     U8 *uscan;
6247
6248                     while (chars) {
6249                         if (utf8_target) {
6250                             /* XXX This assumes the length is well-formed, as
6251                              * does the UTF8SKIP below */
6252                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6253                                                     uniflags);
6254                             uc += len;
6255                         }
6256                         else {
6257                             uvc = *uc;
6258                             uc++;
6259                         }
6260                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6261                         uscan = foldbuf;
6262                         while (foldlen) {
6263                             if (!--chars)
6264                                 break;
6265                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6266                                                  uniflags);
6267                             uscan += len;
6268                             foldlen -= len;
6269                         }
6270                     }
6271                 }
6272                 else {
6273                     if (utf8_target)
6274                         while (chars--)
6275                             uc += UTF8SKIP(uc);
6276                     else
6277                         uc += chars;
6278                 }
6279             }
6280
6281             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6282                             ? ST.jump[ST.nextword]
6283                             : NEXT_OFF(ST.me));
6284
6285             DEBUG_EXECUTE_r({
6286                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6287                     depth,
6288                     PL_colors[4],
6289                     ST.nextword,
6290                     PL_colors[5]
6291                     );
6292             });
6293
6294             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6295                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6296                                 script_run_begin);
6297                 NOT_REACHED; /* NOTREACHED */
6298             }
6299             /* only one choice left - just continue */
6300             DEBUG_EXECUTE_r({
6301                 AV *const trie_words
6302                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6303                 SV ** const tmp = trie_words
6304                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6305                 SV *sv= tmp ? sv_newmortal() : NULL;
6306
6307                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6308                     depth, PL_colors[4],
6309                     ST.nextword,
6310                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6311                             PL_colors[0], PL_colors[1],
6312                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6313                         )
6314                     : "not compiled under -Dr",
6315                     PL_colors[5] );
6316             });
6317
6318             locinput = (char*)uc;
6319             continue; /* execute rest of RE */
6320             /* NOTREACHED */
6321         }
6322 #undef  ST
6323
6324         case LEXACT_REQ8:
6325             if (! utf8_target) {
6326                 sayNO;
6327             }
6328             /* FALLTHROUGH */
6329
6330         case LEXACT:
6331         {
6332             char *s;
6333
6334             s = STRINGl(scan);
6335             ln = STR_LENl(scan);
6336             goto join_short_long_exact;
6337
6338         case EXACTL:             /*  /abc/l       */
6339             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6340
6341             /* Complete checking would involve going through every character
6342              * matched by the string to see if any is above latin1.  But the
6343              * comparision otherwise might very well be a fast assembly
6344              * language routine, and I (khw) don't think slowing things down
6345              * just to check for this warning is worth it.  So this just checks
6346              * the first character */
6347             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6348                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6349             }
6350             goto do_exact;
6351         case EXACT_REQ8:
6352             if (! utf8_target) {
6353                 sayNO;
6354             }
6355             /* FALLTHROUGH */
6356
6357         case EXACT:             /*  /abc/        */
6358           do_exact:
6359             s = STRINGs(scan);
6360             ln = STR_LENs(scan);
6361
6362           join_short_long_exact:
6363             if (utf8_target != is_utf8_pat) {
6364                 /* The target and the pattern have differing utf8ness. */
6365                 char *l = locinput;
6366                 const char * const e = s + ln;
6367
6368                 if (utf8_target) {
6369                     /* The target is utf8, the pattern is not utf8.
6370                      * Above-Latin1 code points can't match the pattern;
6371                      * invariants match exactly, and the other Latin1 ones need
6372                      * to be downgraded to a single byte in order to do the
6373                      * comparison.  (If we could be confident that the target
6374                      * is not malformed, this could be refactored to have fewer
6375                      * tests by just assuming that if the first bytes match, it
6376                      * is an invariant, but there are tests in the test suite
6377                      * dealing with (??{...}) which violate this) */
6378                     while (s < e) {
6379                         if (   l >= loceol
6380                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6381                         {
6382                             sayNO;
6383                         }
6384                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6385                             if (*l != *s) {
6386                                 sayNO;
6387                             }
6388                             l++;
6389                         }
6390                         else {
6391                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6392                             {
6393                                 sayNO;
6394                             }
6395                             l += 2;
6396                         }
6397                         s++;
6398                     }
6399                 }
6400                 else {
6401                     /* The target is not utf8, the pattern is utf8. */
6402                     while (s < e) {
6403                         if (   l >= loceol
6404                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6405                         {
6406                             sayNO;
6407                         }
6408                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6409                             if (*s != *l) {
6410                                 sayNO;
6411                             }
6412                             s++;
6413                         }
6414                         else {
6415                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6416                             {
6417                                 sayNO;
6418                             }
6419                             s += 2;
6420                         }
6421                         l++;
6422                     }
6423                 }
6424                 locinput = l;
6425             }
6426             else {
6427                 /* The target and the pattern have the same utf8ness. */
6428                 /* Inline the first character, for speed. */
6429                 if (   loceol - locinput < ln
6430                     || UCHARAT(s) != nextchr
6431                     || (ln > 1 && memNE(s, locinput, ln)))
6432                 {
6433                     sayNO;
6434                 }
6435                 locinput += ln;
6436             }
6437             break;
6438             }
6439
6440         case EXACTFL:            /*  /abc/il      */
6441           {
6442             re_fold_t folder;
6443             const U8 * fold_array;
6444             const char * s;
6445             U32 fold_utf8_flags;
6446
6447             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6448             folder = foldEQ_locale;
6449             fold_array = PL_fold_locale;
6450             fold_utf8_flags = FOLDEQ_LOCALE;
6451             goto do_exactf;
6452
6453         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6454                                       is effectively /u; hence to match, target
6455                                       must be UTF-8. */
6456             if (! utf8_target) {
6457                 sayNO;
6458             }
6459             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6460                                              | FOLDEQ_S2_FOLDS_SANE;
6461             folder = foldEQ_latin1_s2_folded;
6462             fold_array = PL_fold_latin1;
6463             goto do_exactf;
6464
6465         case EXACTFU_REQ8:      /* /abc/iu with something in /abc/ > 255 */
6466             if (! utf8_target) {
6467                 sayNO;
6468             }
6469             assert(is_utf8_pat);
6470             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6471             goto do_exactf;
6472
6473         case EXACTFUP:          /*  /foo/iu, and something is problematic in
6474                                     'foo' so can't take shortcuts. */
6475             assert(! is_utf8_pat);
6476             folder = foldEQ_latin1;
6477             fold_array = PL_fold_latin1;
6478             fold_utf8_flags = 0;
6479             goto do_exactf;
6480
6481         case EXACTFU:            /*  /abc/iu      */
6482             folder = foldEQ_latin1_s2_folded;
6483             fold_array = PL_fold_latin1;
6484             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6485             goto do_exactf;
6486
6487         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6488                                    patterns */
6489             assert(! is_utf8_pat);
6490             /* FALLTHROUGH */
6491         case EXACTFAA:            /*  /abc/iaa     */
6492             folder = foldEQ_latin1_s2_folded;
6493             fold_array = PL_fold_latin1;
6494             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6495             if (is_utf8_pat || ! utf8_target) {
6496
6497                 /* The possible presence of a MICRO SIGN in the pattern forbids
6498                  * us to view a non-UTF-8 pattern as folded when there is a
6499                  * UTF-8 target */
6500                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
6501                                   |FOLDEQ_S2_FOLDS_SANE;
6502             }
6503             goto do_exactf;
6504
6505
6506         case EXACTF:             /*  /abc/i    This node only generated for
6507                                                non-utf8 patterns */
6508             assert(! is_utf8_pat);
6509             folder = foldEQ;
6510             fold_array = PL_fold;
6511             fold_utf8_flags = 0;
6512
6513           do_exactf:
6514             s = STRINGs(scan);
6515             ln = STR_LENs(scan);
6516
6517             if (   utf8_target
6518                 || is_utf8_pat
6519                 || state_num == EXACTFUP
6520                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6521             {
6522               /* Either target or the pattern are utf8, or has the issue where
6523                * the fold lengths may differ. */
6524                 const char * const l = locinput;
6525                 char *e = loceol;
6526
6527                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
6528                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
6529                 {
6530                     sayNO;
6531                 }
6532                 locinput = e;
6533                 break;
6534             }
6535
6536             /* Neither the target nor the pattern are utf8 */
6537             if (UCHARAT(s) != nextchr
6538                 && !NEXTCHR_IS_EOS
6539                 && UCHARAT(s) != fold_array[nextchr])
6540             {
6541                 sayNO;
6542             }
6543             if (loceol - locinput < ln)
6544                 sayNO;
6545             if (ln > 1 && ! folder(locinput, s, ln))
6546                 sayNO;
6547             locinput += ln;
6548             break;
6549         }
6550
6551         case NBOUNDL: /*  /\B/l  */
6552             to_complement = 1;
6553             /* FALLTHROUGH */
6554
6555         case BOUNDL:  /*  /\b/l  */
6556         {
6557             bool b1, b2;
6558             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6559
6560             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6561                 if (! IN_UTF8_CTYPE_LOCALE) {
6562                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6563                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6564                 }
6565                 goto boundu;
6566             }
6567
6568             if (utf8_target) {
6569                 if (locinput == reginfo->strbeg)
6570                     b1 = isWORDCHAR_LC('\n');
6571                 else {
6572                     U8 *p = reghop3((U8*)locinput, -1,
6573                                     (U8*)(reginfo->strbeg));
6574                     b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
6575                 }
6576                 b2 = (NEXTCHR_IS_EOS)
6577                     ? isWORDCHAR_LC('\n')
6578                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6579                                               (U8*) reginfo->strend);
6580             }
6581             else { /* Here the string isn't utf8 */
6582                 b1 = (locinput == reginfo->strbeg)
6583                      ? isWORDCHAR_LC('\n')
6584                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6585                 b2 = (NEXTCHR_IS_EOS)
6586                     ? isWORDCHAR_LC('\n')
6587                     : isWORDCHAR_LC(nextchr);
6588             }
6589             if (to_complement ^ (b1 == b2)) {
6590                 sayNO;
6591             }
6592             break;
6593         }
6594
6595         case NBOUND:  /*  /\B/   */
6596             to_complement = 1;
6597             /* FALLTHROUGH */
6598
6599         case BOUND:   /*  /\b/   */
6600             if (utf8_target) {
6601                 goto bound_utf8;
6602             }
6603             goto bound_ascii_match_only;
6604
6605         case NBOUNDA: /*  /\B/a  */
6606             to_complement = 1;
6607             /* FALLTHROUGH */
6608
6609         case BOUNDA:  /*  /\b/a  */
6610         {
6611             bool b1, b2;
6612
6613           bound_ascii_match_only:
6614             /* Here the string isn't utf8, or is utf8 and only ascii characters
6615              * are to match \w.  In the latter case looking at the byte just
6616              * prior to the current one may be just the final byte of a
6617              * multi-byte character.  This is ok.  There are two cases:
6618              * 1) it is a single byte character, and then the test is doing
6619              *    just what it's supposed to.
6620              * 2) it is a multi-byte character, in which case the final byte is
6621              *    never mistakable for ASCII, and so the test will say it is
6622              *    not a word character, which is the correct answer. */
6623             b1 = (locinput == reginfo->strbeg)
6624                  ? isWORDCHAR_A('\n')
6625                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6626             b2 = (NEXTCHR_IS_EOS)
6627                 ? isWORDCHAR_A('\n')
6628                 : isWORDCHAR_A(nextchr);
6629             if (to_complement ^ (b1 == b2)) {
6630                 sayNO;
6631             }
6632             break;
6633         }
6634
6635         case NBOUNDU: /*  /\B/u  */
6636             to_complement = 1;
6637             /* FALLTHROUGH */
6638
6639         case BOUNDU:  /*  /\b/u  */
6640
6641           boundu:
6642             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6643                 match = FALSE;
6644             }
6645             else if (utf8_target) {
6646               bound_utf8:
6647                 switch((bound_type) FLAGS(scan)) {
6648                     case TRADITIONAL_BOUND:
6649                     {
6650                         bool b1, b2;
6651                         if (locinput == reginfo->strbeg) {
6652                             b1 = 0 /* isWORDCHAR_L1('\n') */;
6653                         }
6654                         else {
6655                             U8 *p = reghop3((U8*)locinput, -1,
6656                                             (U8*)(reginfo->strbeg));
6657
6658                             b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
6659                         }
6660                         b2 = (NEXTCHR_IS_EOS)
6661                             ? 0 /* isWORDCHAR_L1('\n') */
6662                             : isWORDCHAR_utf8_safe((U8*)locinput,
6663                                                    (U8*) reginfo->strend);
6664                         match = cBOOL(b1 != b2);
6665                         break;
6666                     }
6667                     case GCB_BOUND:
6668                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6669                             match = TRUE; /* GCB always matches at begin and
6670                                              end */
6671                         }
6672                         else {
6673                             /* Find the gcb values of previous and current
6674                              * chars, then see if is a break point */
6675                             match = isGCB(getGCB_VAL_UTF8(
6676                                                 reghop3((U8*)locinput,
6677                                                         -1,
6678                                                         (U8*)(reginfo->strbeg)),
6679                                                 (U8*) reginfo->strend),
6680                                           getGCB_VAL_UTF8((U8*) locinput,
6681                                                         (U8*) reginfo->strend),
6682                                           (U8*) reginfo->strbeg,
6683                                           (U8*) locinput,
6684                                           utf8_target);
6685                         }
6686                         break;
6687
6688                     case LB_BOUND:
6689                         if (locinput == reginfo->strbeg) {
6690                             match = FALSE;
6691                         }
6692                         else if (NEXTCHR_IS_EOS) {
6693                             match = TRUE;
6694                         }
6695                         else {
6696                             match = isLB(getLB_VAL_UTF8(
6697                                                 reghop3((U8*)locinput,
6698                                                         -1,
6699                                                         (U8*)(reginfo->strbeg)),
6700                                                 (U8*) reginfo->strend),
6701                                           getLB_VAL_UTF8((U8*) locinput,
6702                                                         (U8*) reginfo->strend),
6703                                           (U8*) reginfo->strbeg,
6704                                           (U8*) locinput,
6705                                           (U8*) reginfo->strend,
6706                                           utf8_target);
6707                         }
6708                         break;
6709
6710                     case SB_BOUND: /* Always matches at begin and end */
6711                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6712                             match = TRUE;
6713                         }
6714                         else {
6715                             match = isSB(getSB_VAL_UTF8(
6716                                                 reghop3((U8*)locinput,
6717                                                         -1,
6718                                                         (U8*)(reginfo->strbeg)),
6719                                                 (U8*) reginfo->strend),
6720                                           getSB_VAL_UTF8((U8*) locinput,
6721                                                         (U8*) reginfo->strend),
6722                                           (U8*) reginfo->strbeg,
6723                                           (U8*) locinput,
6724                                           (U8*) reginfo->strend,
6725                                           utf8_target);
6726                         }
6727                         break;
6728
6729                     case WB_BOUND:
6730                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6731                             match = TRUE;
6732                         }
6733                         else {
6734                             match = isWB(WB_UNKNOWN,
6735                                          getWB_VAL_UTF8(
6736                                                 reghop3((U8*)locinput,
6737                                                         -1,
6738                                                         (U8*)(reginfo->strbeg)),
6739                                                 (U8*) reginfo->strend),
6740                                           getWB_VAL_UTF8((U8*) locinput,
6741                                                         (U8*) reginfo->strend),
6742                                           (U8*) reginfo->strbeg,
6743                                           (U8*) locinput,
6744                                           (U8*) reginfo->strend,
6745                                           utf8_target);
6746                         }
6747                         break;
6748                 }
6749             }
6750             else {  /* Not utf8 target */
6751                 switch((bound_type) FLAGS(scan)) {
6752                     case TRADITIONAL_BOUND:
6753                     {
6754                         bool b1, b2;
6755                         b1 = (locinput == reginfo->strbeg)
6756                             ? 0 /* isWORDCHAR_L1('\n') */
6757                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6758                         b2 = (NEXTCHR_IS_EOS)
6759                             ? 0 /* isWORDCHAR_L1('\n') */
6760                             : isWORDCHAR_L1(nextchr);
6761                         match = cBOOL(b1 != b2);
6762                         break;
6763                     }
6764
6765                     case GCB_BOUND:
6766                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6767                             match = TRUE; /* GCB always matches at begin and
6768                                              end */
6769                         }
6770                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6771                                    range */
6772                             match =    UCHARAT(locinput - 1) != '\r'
6773                                     || UCHARAT(locinput) != '\n';
6774                         }
6775                         break;
6776
6777                     case LB_BOUND:
6778                         if (locinput == reginfo->strbeg) {
6779                             match = FALSE;
6780                         }
6781                         else if (NEXTCHR_IS_EOS) {
6782                             match = TRUE;
6783                         }
6784                         else {
6785                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6786                                          getLB_VAL_CP(UCHARAT(locinput)),
6787                                          (U8*) reginfo->strbeg,
6788                                          (U8*) locinput,
6789                                          (U8*) reginfo->strend,
6790                                          utf8_target);
6791                         }
6792                         break;
6793
6794                     case SB_BOUND: /* Always matches at begin and end */
6795                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6796                             match = TRUE;
6797                         }
6798                         else {
6799                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6800                                          getSB_VAL_CP(UCHARAT(locinput)),
6801                                          (U8*) reginfo->strbeg,
6802                                          (U8*) locinput,
6803                                          (U8*) reginfo->strend,
6804                                          utf8_target);
6805                         }
6806                         break;
6807
6808                     case WB_BOUND:
6809                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6810                             match = TRUE;
6811                         }
6812                         else {
6813                             match = isWB(WB_UNKNOWN,
6814                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6815                                          getWB_VAL_CP(UCHARAT(locinput)),
6816                                          (U8*) reginfo->strbeg,
6817                                          (U8*) locinput,
6818                                          (U8*) reginfo->strend,
6819                                          utf8_target);
6820                         }
6821                         break;
6822                 }
6823             }
6824
6825             if (to_complement ^ ! match) {
6826                 sayNO;
6827             }
6828             break;
6829
6830         case ANYOFPOSIXL:
6831         case ANYOFL:  /*  /[abc]/l      */
6832             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6833
6834             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6835             {
6836               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6837             }
6838             /* FALLTHROUGH */
6839         case ANYOFD:  /*   /[abc]/d       */
6840         case ANYOF:  /*   /[abc]/       */
6841             if (NEXTCHR_IS_EOS || locinput >= loceol)
6842                 sayNO;
6843             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
6844                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
6845             {
6846                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
6847                     sayNO;
6848                 }
6849                 locinput++;
6850             }
6851             else {
6852                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6853                                                                    utf8_target))
6854                 {
6855                     sayNO;
6856                 }
6857                 goto increment_locinput;
6858             }
6859             break;
6860
6861         case ANYOFM:
6862             if (   NEXTCHR_IS_EOS
6863                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
6864                 || locinput >= loceol)
6865             {
6866                 sayNO;
6867             }
6868             locinput++; /* ANYOFM is always single byte */
6869             break;
6870
6871         case NANYOFM:
6872             if (   NEXTCHR_IS_EOS
6873                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
6874                 || locinput >= loceol)
6875             {
6876                 sayNO;
6877             }
6878             goto increment_locinput;
6879             break;
6880
6881         case ANYOFH:
6882             if (   ! utf8_target
6883                 ||   NEXTCHR_IS_EOS
6884                 ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
6885                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6886                                                                    utf8_target))
6887             {
6888                 sayNO;
6889             }
6890             goto increment_locinput;
6891             break;
6892
6893         case ANYOFHb:
6894             if (   ! utf8_target
6895                 ||   NEXTCHR_IS_EOS
6896                 ||   ANYOF_FLAGS(scan) != (U8) *locinput
6897                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6898                                                                   utf8_target))
6899             {
6900                 sayNO;
6901             }
6902             goto increment_locinput;
6903             break;
6904
6905         case ANYOFHr:
6906             if (   ! utf8_target
6907                 ||   NEXTCHR_IS_EOS
6908                 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
6909                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
6910                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
6911                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6912                                                                    utf8_target))
6913             {
6914                 sayNO;
6915             }
6916             goto increment_locinput;
6917             break;
6918
6919         case ANYOFHs:
6920             if (   ! utf8_target
6921                 ||   NEXTCHR_IS_EOS
6922                 ||   loceol - locinput < FLAGS(scan)
6923                 ||   memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
6924                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6925                                                                    utf8_target))
6926             {
6927                 sayNO;
6928             }
6929             goto increment_locinput;
6930             break;
6931
6932         case ANYOFR:
6933             if (NEXTCHR_IS_EOS) {
6934                 sayNO;
6935             }
6936
6937             if (utf8_target) {
6938                 if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
6939                    || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
6940                                                 (U8 *) reginfo->strend,
6941                                                 NULL),
6942                                     ANYOFRbase(scan), ANYOFRdelta(scan)))
6943                 {
6944                     sayNO;
6945                 }
6946             }
6947             else {
6948                 if (! withinCOUNT((U8) *locinput,
6949                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
6950                 {
6951                     sayNO;
6952                 }
6953             }
6954             goto increment_locinput;
6955             break;
6956
6957         case ANYOFRb:
6958             if (NEXTCHR_IS_EOS) {
6959                 sayNO;
6960             }
6961
6962             if (utf8_target) {
6963                 if (     ANYOF_FLAGS(scan) != (U8) *locinput
6964                     || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
6965                                                 (U8 *) reginfo->strend,
6966                                                 NULL),
6967                                      ANYOFRbase(scan), ANYOFRdelta(scan)))
6968                 {
6969                     sayNO;
6970                 }
6971             }
6972             else {
6973                 if (! withinCOUNT((U8) *locinput,
6974                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
6975                 {
6976                     sayNO;
6977                 }
6978             }
6979             goto increment_locinput;
6980             break;
6981
6982         /* The argument (FLAGS) to all the POSIX node types is the class number
6983          * */
6984
6985         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6986             to_complement = 1;
6987             /* FALLTHROUGH */
6988
6989         case POSIXL:    /* \w or [:punct:] etc. under /l */
6990             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6991             if (NEXTCHR_IS_EOS || locinput >= loceol)
6992                 sayNO;
6993
6994             /* Use isFOO_lc() for characters within Latin1.  (Note that
6995              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6996              * wouldn't be invariant) */
6997             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6998                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6999                     sayNO;
7000                 }
7001
7002                 locinput++;
7003                 break;
7004             }
7005
7006             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7007                 /* An above Latin-1 code point, or malformed */
7008                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7009                                                        reginfo->strend);
7010                 goto utf8_posix_above_latin1;
7011             }
7012
7013             /* Here is a UTF-8 variant code point below 256 and the target is
7014              * UTF-8 */
7015             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7016                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
7017                                             *(locinput + 1))))))
7018             {
7019                 sayNO;
7020             }
7021
7022             goto increment_locinput;
7023
7024         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
7025             to_complement = 1;
7026             /* FALLTHROUGH */
7027
7028         case POSIXD:    /* \w or [:punct:] etc. under /d */
7029             if (utf8_target) {
7030                 goto utf8_posix;
7031             }
7032             goto posixa;
7033
7034         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
7035
7036             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7037                 sayNO;
7038             }
7039
7040             /* All UTF-8 variants match */
7041             if (! UTF8_IS_INVARIANT(nextchr)) {
7042                 goto increment_locinput;
7043             }
7044
7045             to_complement = 1;
7046             goto join_nposixa;
7047
7048         case POSIXA:    /* \w or [:punct:] etc. under /a */
7049
7050           posixa:
7051             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7052              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7053              * character is a single byte */
7054
7055             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7056                 sayNO;
7057             }
7058
7059           join_nposixa:
7060
7061             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
7062                                                                 FLAGS(scan)))))
7063             {
7064                 sayNO;
7065             }
7066
7067             /* Here we are either not in utf8, or we matched a utf8-invariant,
7068              * so the next char is the next byte */
7069             locinput++;
7070             break;
7071
7072         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
7073             to_complement = 1;
7074             /* FALLTHROUGH */
7075
7076         case POSIXU:    /* \w or [:punct:] etc. under /u */
7077           utf8_posix:
7078             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7079                 sayNO;
7080             }
7081
7082             /* Use _generic_isCC() for characters within Latin1.  (Note that
7083              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7084              * wouldn't be invariant) */
7085             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
7086                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
7087                                                            FLAGS(scan)))))
7088                 {
7089                     sayNO;
7090                 }
7091                 locinput++;
7092             }
7093             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7094                 if (! (to_complement
7095                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
7096                                                                *(locinput + 1)),
7097                                              FLAGS(scan)))))
7098                 {
7099                     sayNO;
7100                 }
7101                 locinput += 2;
7102             }
7103             else {  /* Handle above Latin-1 code points */
7104               utf8_posix_above_latin1:
7105                 classnum = (_char_class_number) FLAGS(scan);
7106                 switch (classnum) {
7107                     default:
7108                         if (! (to_complement
7109                            ^ cBOOL(_invlist_contains_cp(
7110                                       PL_XPosix_ptrs[classnum],
7111                                       utf8_to_uvchr_buf((U8 *) locinput,
7112                                                         (U8 *) reginfo->strend,
7113                                                         NULL)))))
7114                         {
7115                             sayNO;
7116                         }
7117                         break;
7118                     case _CC_ENUM_SPACE:
7119                         if (! (to_complement
7120                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
7121                         {
7122                             sayNO;
7123                         }
7124                         break;
7125                     case _CC_ENUM_BLANK:
7126                         if (! (to_complement
7127                                         ^ cBOOL(is_HORIZWS_high(locinput))))
7128                         {
7129                             sayNO;
7130                         }
7131                         break;
7132                     case _CC_ENUM_XDIGIT:
7133                         if (! (to_complement
7134                                         ^ cBOOL(is_XDIGIT_high(locinput))))
7135                         {
7136                             sayNO;
7137                         }
7138                         break;
7139                     case _CC_ENUM_VERTSPACE:
7140                         if (! (to_complement
7141                                         ^ cBOOL(is_VERTWS_high(locinput))))
7142                         {
7143                             sayNO;
7144                         }
7145                         break;
7146                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
7147                     case _CC_ENUM_ASCII:
7148                         if (! to_complement) {
7149                             sayNO;
7150                         }
7151                         break;
7152                 }
7153                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7154             }
7155             break;
7156
7157         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7158                        a Unicode extended Grapheme Cluster */
7159             if (NEXTCHR_IS_EOS || locinput >= loceol)
7160                 sayNO;
7161             if  (! utf8_target) {
7162
7163                 /* Match either CR LF  or '.', as all the other possibilities
7164                  * require utf8 */
7165                 locinput++;         /* Match the . or CR */
7166                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
7167                                        match the LF */
7168                     && locinput <  loceol
7169                     && UCHARAT(locinput) == '\n')
7170                 {
7171                     locinput++;
7172                 }
7173             }
7174             else {
7175
7176                 /* Get the gcb type for the current character */
7177                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7178                                                        (U8*) reginfo->strend);
7179
7180                 /* Then scan through the input until we get to the first
7181                  * character whose type is supposed to be a gcb with the
7182                  * current character.  (There is always a break at the
7183                  * end-of-input) */
7184                 locinput += UTF8SKIP(locinput);
7185                 while (locinput < loceol) {
7186                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7187                                                          (U8*) reginfo->strend);
7188                     if (isGCB(prev_gcb, cur_gcb,
7189                               (U8*) reginfo->strbeg, (U8*) locinput,
7190                               utf8_target))
7191                     {
7192                         break;
7193                     }
7194
7195                     prev_gcb = cur_gcb;
7196                     locinput += UTF8SKIP(locinput);
7197                 }
7198
7199
7200             }
7201             break;
7202
7203         case REFFLN:  /*  /\g{name}/il  */
7204         {   /* The capture buffer cases.  The ones beginning with N for the
7205                named buffers just convert to the equivalent numbered and
7206                pretend they were called as the corresponding numbered buffer
7207                op.  */
7208             /* don't initialize these in the declaration, it makes C++
7209                unhappy */
7210             const char *s;
7211             char type;
7212             re_fold_t folder;
7213             const U8 *fold_array;
7214             UV utf8_fold_flags;
7215
7216             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7217             folder = foldEQ_locale;
7218             fold_array = PL_fold_locale;
7219             type = REFFL;
7220             utf8_fold_flags = FOLDEQ_LOCALE;
7221             goto do_nref;
7222
7223         case REFFAN:  /*  /\g{name}/iaa  */
7224             folder = foldEQ_latin1;
7225             fold_array = PL_fold_latin1;
7226             type = REFFA;
7227             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7228             goto do_nref;
7229
7230         case REFFUN:  /*  /\g{name}/iu  */
7231             folder = foldEQ_latin1;
7232             fold_array = PL_fold_latin1;
7233             type = REFFU;
7234             utf8_fold_flags = 0;
7235             goto do_nref;
7236
7237         case REFFN:  /*  /\g{name}/i  */
7238             folder = foldEQ;
7239             fold_array = PL_fold;
7240             type = REFF;
7241             utf8_fold_flags = 0;
7242             goto do_nref;
7243
7244         case REFN:  /*  /\g{name}/   */
7245             type = REF;
7246             folder = NULL;
7247             fold_array = NULL;
7248             utf8_fold_flags = 0;
7249           do_nref:
7250
7251             /* For the named back references, find the corresponding buffer
7252              * number */
7253             n = reg_check_named_buff_matched(rex,scan);
7254
7255             if ( ! n ) {
7256                 sayNO;
7257             }
7258             goto do_nref_ref_common;
7259
7260         case REFFL:  /*  /\1/il  */
7261             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7262             folder = foldEQ_locale;
7263             fold_array = PL_fold_locale;
7264             utf8_fold_flags = FOLDEQ_LOCALE;
7265             goto do_ref;
7266
7267         case REFFA:  /*  /\1/iaa  */
7268             folder = foldEQ_latin1;
7269             fold_array = PL_fold_latin1;
7270             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7271             goto do_ref;
7272
7273         case REFFU:  /*  /\1/iu  */
7274             folder = foldEQ_latin1;
7275             fold_array = PL_fold_latin1;
7276             utf8_fold_flags = 0;
7277             goto do_ref;
7278
7279         case REFF:  /*  /\1/i  */
7280             folder = foldEQ;
7281             fold_array = PL_fold;
7282             utf8_fold_flags = 0;
7283             goto do_ref;
7284
7285         case REF:  /*  /\1/    */
7286             folder = NULL;
7287             fold_array = NULL;
7288             utf8_fold_flags = 0;
7289
7290           do_ref:
7291             type = OP(scan);
7292             n = ARG(scan);  /* which paren pair */
7293
7294           do_nref_ref_common:
7295             ln = rex->offs[n].start;
7296             endref = rex->offs[n].end;
7297             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7298             if (rex->lastparen < n || ln == -1 || endref == -1)
7299                 sayNO;                  /* Do not match unless seen CLOSEn. */
7300             if (ln == endref)
7301                 break;
7302
7303             s = reginfo->strbeg + ln;
7304             if (type != REF     /* REF can do byte comparison */
7305                 && (utf8_target || type == REFFU || type == REFFL))
7306             {
7307                 char * limit = loceol;
7308
7309                 /* This call case insensitively compares the entire buffer
7310                     * at s, with the current input starting at locinput, but
7311                     * not going off the end given by loceol, and
7312                     * returns in <limit> upon success, how much of the
7313                     * current input was matched */
7314                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7315                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7316                 {
7317                     sayNO;
7318                 }
7319                 locinput = limit;
7320                 break;
7321             }
7322
7323             /* Not utf8:  Inline the first character, for speed. */
7324             if ( ! NEXTCHR_IS_EOS
7325                 && locinput < loceol
7326                 && UCHARAT(s) != nextchr
7327                 && (   type == REF
7328                     || UCHARAT(s) != fold_array[nextchr]))
7329             {
7330                 sayNO;
7331             }
7332             ln = endref - ln;
7333             if (locinput + ln > loceol)
7334                 sayNO;
7335             if (ln > 1 && (type == REF
7336                            ? memNE(s, locinput, ln)
7337                            : ! folder(locinput, s, ln)))
7338                 sayNO;
7339             locinput += ln;
7340             break;
7341         }
7342
7343         case NOTHING: /* null op; e.g. the 'nothing' following
7344                        * the '*' in m{(a+|b)*}' */
7345             break;
7346         case TAIL: /* placeholder while compiling (A|B|C) */
7347             break;
7348
7349 #undef  ST
7350 #define ST st->u.eval
7351 #define CUR_EVAL cur_eval->u.eval
7352
7353         {
7354             SV *ret;
7355             REGEXP *re_sv;
7356             regexp *re;
7357             regexp_internal *rei;
7358             regnode *startpoint;
7359             U32 arg;
7360
7361         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7362             arg= (U32)ARG(scan);
7363             if (cur_eval && cur_eval->locinput == locinput) {
7364                 if ( ++nochange_depth > max_nochange_depth )
7365                     Perl_croak(aTHX_
7366                         "Pattern subroutine nesting without pos change"
7367                         " exceeded limit in regex");
7368             } else {
7369                 nochange_depth = 0;
7370             }
7371             re_sv = rex_sv;
7372             re = rex;
7373             rei = rexi;
7374             startpoint = scan + ARG2L(scan);
7375             EVAL_CLOSE_PAREN_SET( st, arg );
7376             /* Detect infinite recursion
7377              *
7378              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7379              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7380              * So we track the position in the string we are at each time
7381              * we recurse and if we try to enter the same routine twice from
7382              * the same position we throw an error.
7383              */
7384             if ( rex->recurse_locinput[arg] == locinput ) {
7385                 /* FIXME: we should show the regop that is failing as part
7386                  * of the error message. */
7387                 Perl_croak(aTHX_ "Infinite recursion in regex");
7388             } else {
7389                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7390                 rex->recurse_locinput[arg]= locinput;
7391
7392                 DEBUG_r({
7393                     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7394                     DEBUG_STACK_r({
7395                         Perl_re_exec_indentf( aTHX_
7396                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7397                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7398                         );
7399                     });
7400                 });
7401             }
7402
7403             /* Save all the positions seen so far. */
7404             ST.cp = regcppush(rex, 0, maxopenparen);
7405             REGCP_SET(ST.lastcp);
7406
7407             /* and then jump to the code we share with EVAL */
7408             goto eval_recurse_doit;
7409             /* NOTREACHED */
7410
7411         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7412             if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7413                 if ( ++nochange_depth > max_nochange_depth )
7414                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7415             } else {
7416                 nochange_depth = 0;
7417             }
7418             {
7419                 /* execute the code in the {...} */
7420
7421                 dSP;
7422                 IV before;
7423                 OP * const oop = PL_op;
7424                 COP * const ocurcop = PL_curcop;
7425                 OP *nop;
7426                 CV *newcv;
7427
7428                 /* save *all* paren positions */
7429                 regcppush(rex, 0, maxopenparen);
7430                 REGCP_SET(ST.lastcp);
7431
7432                 if (!caller_cv)
7433                     caller_cv = find_runcv(NULL);
7434
7435                 n = ARG(scan);
7436
7437                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7438                     newcv = (ReANY(
7439                                     (REGEXP*)(rexi->data->data[n])
7440                             ))->qr_anoncv;
7441                     nop = (OP*)rexi->data->data[n+1];
7442                 }
7443                 else if (rexi->data->what[n] == 'l') { /* literal code */
7444                     newcv = caller_cv;
7445                     nop = (OP*)rexi->data->data[n];
7446                     assert(CvDEPTH(newcv));
7447                 }
7448                 else {
7449                     /* literal with own CV */
7450                     assert(rexi->data->what[n] == 'L');
7451                     newcv = rex->qr_anoncv;
7452                     nop = (OP*)rexi->data->data[n];
7453                 }
7454
7455                 /* Some notes about MULTICALL and the context and save stacks.
7456                  *
7457                  * In something like
7458                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7459                  * since codeblocks don't introduce a new scope (so that
7460                  * local() etc accumulate), at the end of a successful
7461                  * match there will be a SAVEt_CLEARSV on the savestack
7462                  * for each of $x, $y, $z. If the three code blocks above
7463                  * happen to have come from different CVs (e.g. via
7464                  * embedded qr//s), then we must ensure that during any
7465                  * savestack unwinding, PL_comppad always points to the
7466                  * right pad at each moment. We achieve this by
7467                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7468                  * there is a change of pad.
7469                  * In theory whenever we call a code block, we should
7470                  * push a CXt_SUB context, then pop it on return from
7471                  * that code block. This causes a bit of an issue in that
7472                  * normally popping a context also clears the savestack
7473                  * back to cx->blk_oldsaveix, but here we specifically
7474                  * don't want to clear the save stack on exit from the
7475                  * code block.
7476                  * Also for efficiency we don't want to keep pushing and
7477                  * popping the single SUB context as we backtrack etc.
7478                  * So instead, we push a single context the first time
7479                  * we need, it, then hang onto it until the end of this
7480                  * function. Whenever we encounter a new code block, we
7481                  * update the CV etc if that's changed. During the times
7482                  * in this function where we're not executing a code
7483                  * block, having the SUB context still there is a bit
7484                  * naughty - but we hope that no-one notices.
7485                  * When the SUB context is initially pushed, we fake up
7486                  * cx->blk_oldsaveix to be as if we'd pushed this context
7487                  * on first entry to S_regmatch rather than at some random
7488                  * point during the regexe execution. That way if we
7489                  * croak, popping the context stack will ensure that
7490                  * *everything* SAVEd by this function is undone and then
7491                  * the context popped, rather than e.g., popping the
7492                  * context (and restoring the original PL_comppad) then
7493                  * popping more of the savestack and restoring a bad
7494                  * PL_comppad.
7495                  */
7496
7497                 /* If this is the first EVAL, push a MULTICALL. On
7498                  * subsequent calls, if we're executing a different CV, or
7499                  * if PL_comppad has got messed up from backtracking
7500                  * through SAVECOMPPADs, then refresh the context.
7501                  */
7502                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7503                 {
7504                     U8 flags = (CXp_SUB_RE |
7505                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7506                     SAVECOMPPAD();
7507                     if (last_pushed_cv) {
7508                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7509                     }
7510                     else {
7511                         PUSH_MULTICALL_FLAGS(newcv, flags);
7512                     }
7513                     /* see notes above */
7514                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7515
7516                     last_pushed_cv = newcv;
7517                 }
7518                 else {
7519                     /* these assignments are just to silence compiler
7520                      * warnings */
7521                     multicall_cop = NULL;
7522                 }
7523                 last_pad = PL_comppad;
7524
7525                 /* the initial nextstate you would normally execute
7526                  * at the start of an eval (which would cause error
7527                  * messages to come from the eval), may be optimised
7528                  * away from the execution path in the regex code blocks;
7529                  * so manually set PL_curcop to it initially */
7530                 {
7531                     OP *o = cUNOPx(nop)->op_first;
7532                     assert(o->op_type == OP_NULL);
7533                     if (o->op_targ == OP_SCOPE) {
7534                         o = cUNOPo->op_first;
7535                     }
7536                     else {
7537                         assert(o->op_targ == OP_LEAVE);
7538                         o = cUNOPo->op_first;
7539                         assert(o->op_type == OP_ENTER);
7540                         o = OpSIBLING(o);
7541                     }
7542
7543                     if (o->op_type != OP_STUB) {
7544                         assert(    o->op_type == OP_NEXTSTATE
7545                                 || o->op_type == OP_DBSTATE
7546                                 || (o->op_type == OP_NULL
7547                                     &&  (  o->op_targ == OP_NEXTSTATE
7548                                         || o->op_targ == OP_DBSTATE
7549                                         )
7550                                     )
7551                         );
7552                         PL_curcop = (COP*)o;
7553                     }
7554                 }
7555                 nop = nop->op_next;
7556
7557                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7558                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7559
7560                 rex->offs[0].end = locinput - reginfo->strbeg;
7561                 if (reginfo->info_aux_eval->pos_magic)
7562                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7563                                   reginfo->sv, reginfo->strbeg,
7564                                   locinput - reginfo->strbeg);
7565
7566                 if (sv_yes_mark) {
7567                     SV *sv_mrk = get_sv("REGMARK", 1);
7568                     sv_setsv(sv_mrk, sv_yes_mark);
7569                 }
7570
7571                 /* we don't use MULTICALL here as we want to call the
7572                  * first op of the block of interest, rather than the
7573                  * first op of the sub. Also, we don't want to free
7574                  * the savestack frame */
7575                 before = (IV)(SP-PL_stack_base);
7576                 PL_op = nop;
7577                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7578                 SPAGAIN;
7579                 if ((IV)(SP-PL_stack_base) == before)
7580                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7581                 else {
7582                     ret = POPs;
7583                     PUTBACK;
7584                 }
7585
7586                 /* before restoring everything, evaluate the returned
7587                  * value, so that 'uninit' warnings don't use the wrong
7588                  * PL_op or pad. Also need to process any magic vars
7589                  * (e.g. $1) *before* parentheses are restored */
7590
7591                 PL_op = NULL;
7592
7593                 re_sv = NULL;
7594                 if (logical == 0) {       /*   (?{})/   */
7595                     SV *replsv = save_scalar(PL_replgv);
7596                     sv_setsv(replsv, ret); /* $^R */
7597                     SvSETMAGIC(replsv);
7598                 }
7599                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7600                     sw = cBOOL(SvTRUE_NN(ret));
7601                     logical = 0;
7602                 }
7603                 else {                   /*  /(??{})  */
7604                     /*  if its overloaded, let the regex compiler handle
7605                      *  it; otherwise extract regex, or stringify  */
7606                     if (SvGMAGICAL(ret))
7607                         ret = sv_mortalcopy(ret);
7608                     if (!SvAMAGIC(ret)) {
7609                         SV *sv = ret;
7610                         if (SvROK(sv))
7611                             sv = SvRV(sv);
7612                         if (SvTYPE(sv) == SVt_REGEXP)
7613                             re_sv = (REGEXP*) sv;
7614                         else if (SvSMAGICAL(ret)) {
7615                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7616                             if (mg)
7617                                 re_sv = (REGEXP *) mg->mg_obj;
7618                         }
7619
7620                         /* force any undef warnings here */
7621                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7622                             ret = sv_mortalcopy(ret);
7623                             (void) SvPV_force_nolen(ret);
7624                         }
7625                     }
7626
7627                 }
7628
7629                 /* *** Note that at this point we don't restore
7630                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7631                  * be used again soon. This is safe as long as nothing
7632                  * in the regexp code uses the pad ! */
7633                 PL_op = oop;
7634                 PL_curcop = ocurcop;
7635                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7636                 PL_curpm_under = PL_curpm;
7637                 PL_curpm = PL_reg_curpm;
7638
7639                 if (logical != 2) {
7640                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
7641                                     script_run_begin);
7642                     /* NOTREACHED */
7643                 }
7644             }
7645
7646                 /* only /(??{})/  from now on */
7647                 logical = 0;
7648                 {
7649                     /* extract RE object from returned value; compiling if
7650                      * necessary */
7651
7652                     if (re_sv) {
7653                         re_sv = reg_temp_copy(NULL, re_sv);
7654                     }
7655                     else {
7656                         U32 pm_flags = 0;
7657
7658                         if (SvUTF8(ret) && IN_BYTES) {
7659                             /* In use 'bytes': make a copy of the octet
7660                              * sequence, but without the flag on */
7661                             STRLEN len;
7662                             const char *const p = SvPV(ret, len);
7663                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7664                         }
7665                         if (rex->intflags & PREGf_USE_RE_EVAL)
7666                             pm_flags |= PMf_USE_RE_EVAL;
7667
7668                         /* if we got here, it should be an engine which
7669                          * supports compiling code blocks and stuff */
7670                         assert(rex->engine && rex->engine->op_comp);
7671                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7672                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7673                                     rex->engine, NULL, NULL,
7674                                     /* copy /msixn etc to inner pattern */
7675                                     ARG2L(scan),
7676                                     pm_flags);
7677
7678                         if (!(SvFLAGS(ret)
7679                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7680                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7681                             /* This isn't a first class regexp. Instead, it's
7682                                caching a regexp onto an existing, Perl visible
7683                                scalar.  */
7684                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7685                         }
7686                     }
7687                     SAVEFREESV(re_sv);
7688                     re = ReANY(re_sv);
7689                 }
7690                 RXp_MATCH_COPIED_off(re);
7691                 re->subbeg = rex->subbeg;
7692                 re->sublen = rex->sublen;
7693                 re->suboffset = rex->suboffset;
7694                 re->subcoffset = rex->subcoffset;
7695                 re->lastparen = 0;
7696                 re->lastcloseparen = 0;
7697                 rei = RXi_GET(re);
7698                 DEBUG_EXECUTE_r(
7699                     debug_start_match(re_sv, utf8_target, locinput,
7700                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7701                 );
7702                 startpoint = rei->program + 1;
7703                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7704                                              * close_paren only for GOSUB */
7705                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7706                 /* Save all the seen positions so far. */
7707                 ST.cp = regcppush(rex, 0, maxopenparen);
7708                 REGCP_SET(ST.lastcp);
7709                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7710                 maxopenparen = 0;
7711                 /* run the pattern returned from (??{...}) */
7712
7713               eval_recurse_doit: /* Share code with GOSUB below this line
7714                             * At this point we expect the stack context to be
7715                             * set up correctly */
7716
7717                 /* invalidate the S-L poscache. We're now executing a
7718                  * different set of WHILEM ops (and their associated
7719                  * indexes) against the same string, so the bits in the
7720                  * cache are meaningless. Setting maxiter to zero forces
7721                  * the cache to be invalidated and zeroed before reuse.
7722                  * XXX This is too dramatic a measure. Ideally we should
7723                  * save the old cache and restore when running the outer
7724                  * pattern again */
7725                 reginfo->poscache_maxiter = 0;
7726
7727                 /* the new regexp might have a different is_utf8_pat than we do */
7728                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7729
7730                 ST.prev_rex = rex_sv;
7731                 ST.prev_curlyx = cur_curlyx;
7732                 rex_sv = re_sv;
7733                 SET_reg_curpm(rex_sv);
7734                 rex = re;
7735                 rexi = rei;
7736                 cur_curlyx = NULL;
7737                 ST.B = next;
7738                 ST.prev_eval = cur_eval;
7739                 cur_eval = st;
7740                 /* now continue from first node in postoned RE */
7741                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
7742                                     loceol, script_run_begin);
7743                 NOT_REACHED; /* NOTREACHED */
7744         }
7745
7746         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7747             /* note: this is called twice; first after popping B, then A */
7748             DEBUG_STACK_r({
7749                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7750                     depth, cur_eval, ST.prev_eval);
7751             });
7752
7753 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7754             if ( cur_eval && CUR_EVAL.close_paren ) {\
7755                 DEBUG_STACK_r({ \
7756                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7757                         depth,    \
7758                         CUR_EVAL.close_paren - 1,\
7759                         cur_eval, \
7760                         VAL);     \
7761                 });               \
7762                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7763             }
7764
7765             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7766
7767             rex_sv = ST.prev_rex;
7768             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7769             SET_reg_curpm(rex_sv);
7770             rex = ReANY(rex_sv);
7771             rexi = RXi_GET(rex);
7772             {
7773                 /* preserve $^R across LEAVE's. See Bug 121070. */
7774                 SV *save_sv= GvSV(PL_replgv);
7775                 SV *replsv;
7776                 SvREFCNT_inc(save_sv);
7777                 regcpblow(ST.cp); /* LEAVE in disguise */
7778                 /* don't move this initialization up */
7779                 replsv = GvSV(PL_replgv);
7780                 sv_setsv(replsv, save_sv);
7781                 SvSETMAGIC(replsv);
7782                 SvREFCNT_dec(save_sv);
7783             }
7784             cur_eval = ST.prev_eval;
7785             cur_curlyx = ST.prev_curlyx;
7786
7787             /* Invalidate cache. See "invalidate" comment above. */
7788             reginfo->poscache_maxiter = 0;
7789             if ( nochange_depth )
7790                 nochange_depth--;
7791
7792             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7793             sayYES;
7794
7795
7796         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7797             REGCP_UNWIND(ST.lastcp);
7798             sayNO;
7799
7800         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7801             /* note: this is called twice; first after popping B, then A */
7802             DEBUG_STACK_r({
7803                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7804                     depth, cur_eval, ST.prev_eval);
7805             });
7806
7807             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7808
7809             rex_sv = ST.prev_rex;
7810             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7811             SET_reg_curpm(rex_sv);
7812             rex = ReANY(rex_sv);
7813             rexi = RXi_GET(rex);
7814
7815             REGCP_UNWIND(ST.lastcp);
7816             regcppop(rex, &maxopenparen);
7817             cur_eval = ST.prev_eval;
7818             cur_curlyx = ST.prev_curlyx;
7819
7820             /* Invalidate cache. See "invalidate" comment above. */
7821             reginfo->poscache_maxiter = 0;
7822             if ( nochange_depth )
7823                 nochange_depth--;
7824
7825             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7826             sayNO_SILENT;
7827 #undef ST
7828
7829         case OPEN: /*  (  */
7830             n = ARG(scan);  /* which paren pair */
7831             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7832             if (n > maxopenparen)
7833                 maxopenparen = n;
7834             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7835                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7836                 depth,
7837                 PTR2UV(rex),
7838                 PTR2UV(rex->offs),
7839                 (UV)n,
7840                 (IV)rex->offs[n].start_tmp,
7841                 (UV)maxopenparen
7842             ));
7843             lastopen = n;
7844             break;
7845
7846         case SROPEN: /*  (*SCRIPT_RUN:  */
7847             script_run_begin = (U8 *) locinput;
7848             break;
7849
7850
7851         case CLOSE:  /*  )  */
7852             n = ARG(scan);  /* which paren pair */
7853             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7854                              locinput - reginfo->strbeg);
7855             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7856                 goto fake_end;
7857
7858             break;
7859
7860         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7861
7862             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7863             {
7864                 sayNO;
7865             }
7866
7867             break;
7868
7869
7870         case ACCEPT:  /*  (*ACCEPT)  */
7871             if (scan->flags)
7872                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7873             if (ARG2L(scan)){
7874                 regnode *cursor;
7875                 for (cursor=scan;
7876                      cursor && OP(cursor)!=END;
7877                      cursor=regnext(cursor))
7878                 {
7879                     if ( OP(cursor)==CLOSE ){
7880                         n = ARG(cursor);
7881                         if ( n <= lastopen ) {
7882                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7883                                              locinput - reginfo->strbeg);
7884                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7885                                 break;
7886                         }
7887                     }
7888                 }
7889             }
7890             goto fake_end;
7891             /* NOTREACHED */
7892
7893         case GROUPP:  /*  (?(1))  */
7894             n = ARG(scan);  /* which paren pair */
7895             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7896             break;
7897
7898         case GROUPPN:  /*  (?(<name>))  */
7899             /* reg_check_named_buff_matched returns 0 for no match */
7900             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7901             break;
7902
7903         case INSUBP:   /*  (?(R))  */
7904             n = ARG(scan);
7905             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7906              * of SCAN is already set up as matches a eval.close_paren */
7907             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7908             break;
7909
7910         case DEFINEP:  /*  (?(DEFINE))  */
7911             sw = 0;
7912             break;
7913
7914         case IFTHEN:   /*  (?(cond)A|B)  */
7915             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7916             if (sw)
7917                 next = NEXTOPER(NEXTOPER(scan));
7918             else {
7919                 next = scan + ARG(scan);
7920                 if (OP(next) == IFTHEN) /* Fake one. */
7921                     next = NEXTOPER(NEXTOPER(next));
7922             }
7923             break;
7924
7925         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7926             logical = scan->flags;
7927             break;
7928
7929 /*******************************************************************
7930
7931 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7932 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7933 STAR/PLUS/CURLY/CURLYN are used instead.)
7934
7935 A*B is compiled as <CURLYX><A><WHILEM><B>
7936
7937 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7938 state, which contains the current count, initialised to -1. It also sets
7939 cur_curlyx to point to this state, with any previous value saved in the
7940 state block.
7941
7942 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7943 since the pattern may possibly match zero times (i.e. it's a while {} loop
7944 rather than a do {} while loop).
7945
7946 Each entry to WHILEM represents a successful match of A. The count in the
7947 CURLYX block is incremented, another WHILEM state is pushed, and execution
7948 passes to A or B depending on greediness and the current count.
7949
7950 For example, if matching against the string a1a2a3b (where the aN are
7951 substrings that match /A/), then the match progresses as follows: (the
7952 pushed states are interspersed with the bits of strings matched so far):
7953
7954     <CURLYX cnt=-1>
7955     <CURLYX cnt=0><WHILEM>
7956     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7957     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7958     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7959     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7960
7961 (Contrast this with something like CURLYM, which maintains only a single
7962 backtrack state:
7963
7964     <CURLYM cnt=0> a1
7965     a1 <CURLYM cnt=1> a2
7966     a1 a2 <CURLYM cnt=2> a3
7967     a1 a2 a3 <CURLYM cnt=3> b
7968 )
7969
7970 Each WHILEM state block marks a point to backtrack to upon partial failure
7971 of A or B, and also contains some minor state data related to that
7972 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7973 overall state, such as the count, and pointers to the A and B ops.
7974
7975 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7976 must always point to the *current* CURLYX block, the rules are:
7977
7978 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7979 and set cur_curlyx to point the new block.
7980
7981 When popping the CURLYX block after a successful or unsuccessful match,
7982 restore the previous cur_curlyx.
7983
7984 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7985 to the outer one saved in the CURLYX block.
7986
7987 When popping the WHILEM block after a successful or unsuccessful B match,
7988 restore the previous cur_curlyx.
7989
7990 Here's an example for the pattern (AI* BI)*BO
7991 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7992
7993 cur_
7994 curlyx backtrack stack
7995 ------ ---------------
7996 NULL
7997 CO     <CO prev=NULL> <WO>
7998 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7999 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8000 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8001
8002 At this point the pattern succeeds, and we work back down the stack to
8003 clean up, restoring as we go:
8004
8005 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8006 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8007 CO     <CO prev=NULL> <WO>
8008 NULL
8009
8010 *******************************************************************/
8011
8012 #define ST st->u.curlyx
8013
8014         case CURLYX:    /* start of /A*B/  (for complex A) */
8015         {
8016             /* No need to save/restore up to this paren */
8017             I32 parenfloor = scan->flags;
8018
8019             assert(next); /* keep Coverity happy */
8020             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8021                 next += ARG(next);
8022
8023             /* XXXX Probably it is better to teach regpush to support
8024                parenfloor > maxopenparen ... */
8025             if (parenfloor > (I32)rex->lastparen)
8026                 parenfloor = rex->lastparen; /* Pessimization... */
8027
8028             ST.prev_curlyx= cur_curlyx;
8029             cur_curlyx = st;
8030             ST.cp = PL_savestack_ix;
8031
8032             /* these fields contain the state of the current curly.
8033              * they are accessed by subsequent WHILEMs */
8034             ST.parenfloor = parenfloor;
8035             ST.me = scan;
8036             ST.B = next;
8037             ST.minmod = minmod;
8038             minmod = 0;
8039             ST.count = -1;      /* this will be updated by WHILEM */
8040             ST.lastloc = NULL;  /* this will be updated by WHILEM */
8041
8042             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8043                                 script_run_begin);
8044             NOT_REACHED; /* NOTREACHED */
8045         }
8046
8047         case CURLYX_end: /* just finished matching all of A*B */
8048             cur_curlyx = ST.prev_curlyx;
8049             sayYES;
8050             NOT_REACHED; /* NOTREACHED */
8051
8052         case CURLYX_end_fail: /* just failed to match all of A*B */
8053             regcpblow(ST.cp);
8054             cur_curlyx = ST.prev_curlyx;
8055             sayNO;
8056             NOT_REACHED; /* NOTREACHED */
8057
8058
8059 #undef ST
8060 #define ST st->u.whilem
8061
8062         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
8063         {
8064             /* see the discussion above about CURLYX/WHILEM */
8065             I32 n;
8066             int min, max;
8067             regnode *A;
8068
8069             assert(cur_curlyx); /* keep Coverity happy */
8070
8071             min = ARG1(cur_curlyx->u.curlyx.me);
8072             max = ARG2(cur_curlyx->u.curlyx.me);
8073             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8074             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8075             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8076             ST.cache_offset = 0;
8077             ST.cache_mask = 0;
8078
8079
8080             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
8081                   depth, (long)n, min, max)
8082             );
8083
8084             /* First just match a string of min A's. */
8085
8086             if (n < min) {
8087                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8088                 cur_curlyx->u.curlyx.lastloc = locinput;
8089                 REGCP_SET(ST.lastcp);
8090
8091                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8092                                 script_run_begin);
8093                 NOT_REACHED; /* NOTREACHED */
8094             }
8095
8096             /* If degenerate A matches "", assume A done. */
8097
8098             if (locinput == cur_curlyx->u.curlyx.lastloc) {
8099                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
8100                    depth)
8101                 );
8102                 goto do_whilem_B_max;
8103             }
8104
8105             /* super-linear cache processing.
8106              *
8107              * The idea here is that for certain types of CURLYX/WHILEM -
8108              * principally those whose upper bound is infinity (and
8109              * excluding regexes that have things like \1 and other very
8110              * non-regular expresssiony things), then if a pattern like
8111              * /....A*.../ fails and we backtrack to the WHILEM, then we
8112              * make a note that this particular WHILEM op was at string
8113              * position 47 (say) when the rest of pattern failed. Then, if
8114              * we ever find ourselves back at that WHILEM, and at string
8115              * position 47 again, we can just fail immediately rather than
8116              * running the rest of the pattern again.
8117              *
8118              * This is very handy when patterns start to go
8119              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8120              * with a combinatorial explosion of backtracking.
8121              *
8122              * The cache is implemented as a bit array, with one bit per
8123              * string byte position per WHILEM op (up to 16) - so its
8124              * between 0.25 and 2x the string size.
8125              *
8126              * To avoid allocating a poscache buffer every time, we do an
8127              * initially countdown; only after we have  executed a WHILEM
8128              * op (string-length x #WHILEMs) times do we allocate the
8129              * cache.
8130              *
8131              * The top 4 bits of scan->flags byte say how many different
8132              * relevant CURLLYX/WHILEM op pairs there are, while the
8133              * bottom 4-bits is the identifying index number of this
8134              * WHILEM.
8135              */
8136
8137             if (scan->flags) {
8138
8139                 if (!reginfo->poscache_maxiter) {
8140                     /* start the countdown: Postpone detection until we
8141                      * know the match is not *that* much linear. */
8142                     reginfo->poscache_maxiter
8143                         =    (reginfo->strend - reginfo->strbeg + 1)
8144                            * (scan->flags>>4);
8145                     /* possible overflow for long strings and many CURLYX's */
8146                     if (reginfo->poscache_maxiter < 0)
8147                         reginfo->poscache_maxiter = I32_MAX;
8148                     reginfo->poscache_iter = reginfo->poscache_maxiter;
8149                 }
8150
8151                 if (reginfo->poscache_iter-- == 0) {
8152                     /* initialise cache */
8153                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8154                     regmatch_info_aux *const aux = reginfo->info_aux;
8155                     if (aux->poscache) {
8156                         if ((SSize_t)reginfo->poscache_size < size) {
8157                             Renew(aux->poscache, size, char);
8158                             reginfo->poscache_size = size;
8159                         }
8160                         Zero(aux->poscache, size, char);
8161                     }
8162                     else {
8163                         reginfo->poscache_size = size;
8164                         Newxz(aux->poscache, size, char);
8165                     }
8166                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8167       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8168                               PL_colors[4], PL_colors[5])
8169                     );
8170                 }
8171
8172                 if (reginfo->poscache_iter < 0) {
8173                     /* have we already failed at this position? */
8174                     SSize_t offset, mask;
8175
8176                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8177                     offset  = (scan->flags & 0xf) - 1
8178                                 +   (locinput - reginfo->strbeg)
8179                                   * (scan->flags>>4);
8180                     mask    = 1 << (offset % 8);
8181                     offset /= 8;
8182                     if (reginfo->info_aux->poscache[offset] & mask) {
8183                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8184                             depth)
8185                         );
8186                         cur_curlyx->u.curlyx.count--;
8187                         sayNO; /* cache records failure */
8188                     }
8189                     ST.cache_offset = offset;
8190                     ST.cache_mask   = mask;
8191                 }
8192             }
8193
8194             /* Prefer B over A for minimal matching. */
8195
8196             if (cur_curlyx->u.curlyx.minmod) {
8197                 ST.save_curlyx = cur_curlyx;
8198                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8199                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8200                                     locinput, loceol, script_run_begin);
8201                 NOT_REACHED; /* NOTREACHED */
8202             }
8203
8204             /* Prefer A over B for maximal matching. */
8205
8206             if (n < max) { /* More greed allowed? */
8207                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8208                             maxopenparen);
8209                 cur_curlyx->u.curlyx.lastloc = locinput;
8210                 REGCP_SET(ST.lastcp);
8211                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8212                                 script_run_begin);
8213                 NOT_REACHED; /* NOTREACHED */
8214             }
8215             goto do_whilem_B_max;
8216         }
8217         NOT_REACHED; /* NOTREACHED */
8218
8219         case WHILEM_B_min: /* just matched B in a minimal match */
8220         case WHILEM_B_max: /* just matched B in a maximal match */
8221             cur_curlyx = ST.save_curlyx;
8222             sayYES;
8223             NOT_REACHED; /* NOTREACHED */
8224
8225         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8226             cur_curlyx = ST.save_curlyx;
8227             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8228             cur_curlyx->u.curlyx.count--;
8229             CACHEsayNO;
8230             NOT_REACHED; /* NOTREACHED */
8231
8232         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8233             /* FALLTHROUGH */
8234         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8235             REGCP_UNWIND(ST.lastcp);
8236             regcppop(rex, &maxopenparen);
8237             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8238             cur_curlyx->u.curlyx.count--;
8239             CACHEsayNO;
8240             NOT_REACHED; /* NOTREACHED */
8241
8242         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8243             REGCP_UNWIND(ST.lastcp);
8244             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8245             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8246                 depth)
8247             );
8248           do_whilem_B_max:
8249             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8250                 && ckWARN(WARN_REGEXP)
8251                 && !reginfo->warned)
8252             {
8253                 reginfo->warned = TRUE;
8254                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8255                      "Complex regular subexpression recursion limit (%d) "
8256                      "exceeded",
8257                      REG_INFTY - 1);
8258             }
8259
8260             /* now try B */
8261             ST.save_curlyx = cur_curlyx;
8262             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8263             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8264                                 locinput, loceol, script_run_begin);
8265             NOT_REACHED; /* NOTREACHED */
8266
8267         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8268             cur_curlyx = ST.save_curlyx;
8269
8270             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8271                 /* Maximum greed exceeded */
8272                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8273                     && ckWARN(WARN_REGEXP)
8274                     && !reginfo->warned)
8275                 {
8276                     reginfo->warned     = TRUE;
8277                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8278                         "Complex regular subexpression recursion "
8279                         "limit (%d) exceeded",
8280                         REG_INFTY - 1);
8281                 }
8282                 cur_curlyx->u.curlyx.count--;
8283                 CACHEsayNO;
8284             }
8285
8286             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8287             );
8288             /* Try grabbing another A and see if it helps. */
8289             cur_curlyx->u.curlyx.lastloc = locinput;
8290             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8291                             maxopenparen);
8292             REGCP_SET(ST.lastcp);
8293             PUSH_STATE_GOTO(WHILEM_A_min,
8294                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8295                 locinput, loceol, script_run_begin);
8296             NOT_REACHED; /* NOTREACHED */
8297
8298 #undef  ST
8299 #define ST st->u.branch
8300
8301         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8302             next = scan + ARG(scan);
8303             if (next == scan)
8304                 next = NULL;
8305             scan = NEXTOPER(scan);
8306             /* FALLTHROUGH */
8307
8308         case BRANCH:        /*  /(...|A|...)/ */
8309             scan = NEXTOPER(scan); /* scan now points to inner node */
8310             ST.lastparen = rex->lastparen;
8311             ST.lastcloseparen = rex->lastcloseparen;
8312             ST.next_branch = next;
8313             REGCP_SET(ST.cp);
8314
8315             /* Now go into the branch */
8316             if (has_cutgroup) {
8317                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8318                                     script_run_begin);
8319             } else {
8320                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8321                                 script_run_begin);
8322             }
8323             NOT_REACHED; /* NOTREACHED */
8324
8325         case CUTGROUP:  /*  /(*THEN)/  */
8326             sv_yes_mark = st->u.mark.mark_name = scan->flags
8327                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8328                 : NULL;
8329             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8330                             script_run_begin);
8331             NOT_REACHED; /* NOTREACHED */
8332
8333         case CUTGROUP_next_fail:
8334             do_cutgroup = 1;
8335             no_final = 1;
8336             if (st->u.mark.mark_name)
8337                 sv_commit = st->u.mark.mark_name;
8338             sayNO;
8339             NOT_REACHED; /* NOTREACHED */
8340
8341         case BRANCH_next:
8342             sayYES;
8343             NOT_REACHED; /* NOTREACHED */
8344
8345         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8346             if (do_cutgroup) {
8347                 do_cutgroup = 0;
8348                 no_final = 0;
8349             }
8350             REGCP_UNWIND(ST.cp);
8351             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8352             scan = ST.next_branch;
8353             /* no more branches? */
8354             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8355                 DEBUG_EXECUTE_r({
8356                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8357                         depth,
8358                         PL_colors[4],
8359                         PL_colors[5] );
8360                 });
8361                 sayNO_SILENT;
8362             }
8363             continue; /* execute next BRANCH[J] op */
8364             /* NOTREACHED */
8365
8366         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8367             minmod = 1;
8368             break;
8369
8370 #undef  ST
8371 #define ST st->u.curlym
8372
8373         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8374
8375             /* This is an optimisation of CURLYX that enables us to push
8376              * only a single backtracking state, no matter how many matches
8377              * there are in {m,n}. It relies on the pattern being constant
8378              * length, with no parens to influence future backrefs
8379              */
8380
8381             ST.me = scan;
8382             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8383
8384             ST.lastparen      = rex->lastparen;
8385             ST.lastcloseparen = rex->lastcloseparen;
8386
8387             /* if paren positive, emulate an OPEN/CLOSE around A */
8388             if (ST.me->flags) {
8389                 U32 paren = ST.me->flags;
8390                 if (paren > maxopenparen)
8391                     maxopenparen = paren;
8392                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8393             }
8394             ST.A = scan;
8395             ST.B = next;
8396             ST.alen = 0;
8397             ST.count = 0;
8398             ST.minmod = minmod;
8399             minmod = 0;
8400             ST.c1 = CHRTEST_UNINIT;
8401             REGCP_SET(ST.cp);
8402
8403             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8404                 goto curlym_do_B;
8405
8406           curlym_do_A: /* execute the A in /A{m,n}B/  */
8407             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8408                                 script_run_begin);
8409             NOT_REACHED; /* NOTREACHED */
8410
8411         case CURLYM_A: /* we've just matched an A */
8412             ST.count++;
8413             /* after first match, determine A's length: u.curlym.alen */
8414             if (ST.count == 1) {
8415                 if (reginfo->is_utf8_target) {
8416                     char *s = st->locinput;
8417                     while (s < locinput) {
8418                         ST.alen++;
8419                         s += UTF8SKIP(s);
8420                     }
8421                 }
8422                 else {
8423                     ST.alen = locinput - st->locinput;
8424                 }
8425                 if (ST.alen == 0)
8426                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8427             }
8428             DEBUG_EXECUTE_r(
8429                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8430                           depth, (IV) ST.count, (IV)ST.alen)
8431             );
8432
8433             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8434                 goto fake_end;
8435
8436             {
8437                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8438                 if ( max == REG_INFTY || ST.count < max )
8439                     goto curlym_do_A; /* try to match another A */
8440             }
8441             goto curlym_do_B; /* try to match B */
8442
8443         case CURLYM_A_fail: /* just failed to match an A */
8444             REGCP_UNWIND(ST.cp);
8445
8446
8447             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8448                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8449                 sayNO;
8450
8451           curlym_do_B: /* execute the B in /A{m,n}B/  */
8452             if (ST.c1 == CHRTEST_UNINIT) {
8453                 /* calculate c1 and c2 for possible match of 1st char
8454                  * following curly */
8455                 ST.c1 = ST.c2 = CHRTEST_VOID;
8456                 assert(ST.B);
8457                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8458                     regnode *text_node = ST.B;
8459                     if (! HAS_TEXT(text_node))
8460                         FIND_NEXT_IMPT(text_node);
8461                     if (PL_regkind[OP(text_node)] == EXACT) {
8462                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8463                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8464                            reginfo))
8465                         {
8466                             sayNO;
8467                         }
8468                     }
8469                 }
8470             }
8471
8472             DEBUG_EXECUTE_r(
8473                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8474                     depth, (IV)ST.count)
8475                 );
8476             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8477                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8478
8479                            /* (We can use memEQ and memNE in this file without
8480                             * having to worry about one being shorter than the
8481                             * other, since the first byte of each gives the
8482                             * length of the character) */
8483                     if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
8484                                                               reginfo->strend))
8485                         && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
8486                                                              reginfo->strend)))
8487                     {
8488                         /* simulate B failing */
8489                         DEBUG_OPTIMISE_r(
8490                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8491                                 depth,
8492                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8493                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8494                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8495                         );
8496                         state_num = CURLYM_B_fail;
8497                         goto reenter_switch;
8498                     }
8499                 }
8500                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8501                     /* simulate B failing */
8502                     DEBUG_OPTIMISE_r(
8503                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8504                             depth,
8505                             (int) nextchr, ST.c1, ST.c2)
8506                     );
8507                     state_num = CURLYM_B_fail;
8508                     goto reenter_switch;
8509                 }
8510             }
8511
8512             if (ST.me->flags) {
8513                 /* emulate CLOSE: mark current A as captured */
8514                 U32 paren = (U32)ST.me->flags;
8515                 if (ST.count) {
8516                     CLOSE_CAPTURE(paren,
8517                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
8518                         locinput - reginfo->strbeg);
8519                 }
8520                 else
8521                     rex->offs[paren].end = -1;
8522
8523                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8524                 {
8525                     if (ST.count)
8526                         goto fake_end;
8527                     else
8528                         sayNO;
8529                 }
8530             }
8531
8532             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
8533                             script_run_begin);
8534             NOT_REACHED; /* NOTREACHED */
8535
8536         case CURLYM_B_fail: /* just failed to match a B */
8537             REGCP_UNWIND(ST.cp);
8538             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8539             if (ST.minmod) {
8540                 I32 max = ARG2(ST.me);
8541                 if (max != REG_INFTY && ST.count == max)
8542                     sayNO;
8543                 goto curlym_do_A; /* try to match a further A */
8544             }
8545             /* backtrack one A */
8546             if (ST.count == ARG1(ST.me) /* min */)
8547                 sayNO;
8548             ST.count--;
8549             SET_locinput(HOPc(locinput, -ST.alen));
8550             goto curlym_do_B; /* try to match B */
8551
8552 #undef ST
8553 #define ST st->u.curly
8554
8555 #define CURLY_SETPAREN(paren, success) \
8556     if (paren) { \
8557         if (success) { \
8558             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8559                                  locinput - reginfo->strbeg); \
8560         } \
8561         else { \
8562             rex->offs[paren].end = -1; \
8563             rex->lastparen      = ST.lastparen; \
8564             rex->lastcloseparen = ST.lastcloseparen; \
8565         } \
8566     }
8567
8568         case STAR:              /*  /A*B/ where A is width 1 char */
8569             ST.paren = 0;
8570             ST.min = 0;
8571             ST.max = REG_INFTY;
8572             scan = NEXTOPER(scan);
8573             goto repeat;
8574
8575         case PLUS:              /*  /A+B/ where A is width 1 char */
8576             ST.paren = 0;
8577             ST.min = 1;
8578             ST.max = REG_INFTY;
8579             scan = NEXTOPER(scan);
8580             goto repeat;
8581
8582         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8583             ST.paren = scan->flags;     /* Which paren to set */
8584             ST.lastparen      = rex->lastparen;
8585             ST.lastcloseparen = rex->lastcloseparen;
8586             if (ST.paren > maxopenparen)
8587                 maxopenparen = ST.paren;
8588             ST.min = ARG1(scan);  /* min to match */
8589             ST.max = ARG2(scan);  /* max to match */
8590             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8591
8592             /* handle the single-char capture called as a GOSUB etc */
8593             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8594             {
8595                 char *li = locinput;
8596                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
8597                     sayNO;
8598                 SET_locinput(li);
8599                 goto fake_end;
8600             }
8601
8602             goto repeat;
8603
8604         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8605             ST.paren = 0;
8606             ST.min = ARG1(scan);  /* min to match */
8607             ST.max = ARG2(scan);  /* max to match */
8608             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8609           repeat:
8610             /*
8611             * Lookahead to avoid useless match attempts
8612             * when we know what character comes next.
8613             *
8614             * Used to only do .*x and .*?x, but now it allows
8615             * for )'s, ('s and (?{ ... })'s to be in the way
8616             * of the quantifier and the EXACT-like node.  -- japhy
8617             */
8618
8619             assert(ST.min <= ST.max);
8620             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8621                 ST.c1 = ST.c2 = CHRTEST_VOID;
8622             }
8623             else {
8624                 regnode *text_node = next;
8625
8626                 if (! HAS_TEXT(text_node))
8627                     FIND_NEXT_IMPT(text_node);
8628
8629                 if (! HAS_TEXT(text_node))
8630                     ST.c1 = ST.c2 = CHRTEST_VOID;
8631                 else {
8632                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8633                         ST.c1 = ST.c2 = CHRTEST_VOID;
8634                     }
8635                     else {
8636                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8637                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8638                            reginfo))
8639                         {
8640                             sayNO;
8641                         }
8642                     }
8643                 }
8644             }
8645
8646             ST.A = scan;
8647             ST.B = next;
8648             if (minmod) {
8649                 char *li = locinput;
8650                 minmod = 0;
8651                 if (ST.min &&
8652                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
8653                             < ST.min)
8654                     sayNO;
8655                 SET_locinput(li);
8656                 ST.count = ST.min;
8657                 REGCP_SET(ST.cp);
8658                 if (ST.c1 == CHRTEST_VOID)
8659                     goto curly_try_B_min;
8660
8661                 ST.oldloc = locinput;
8662
8663                 /* set ST.maxpos to the furthest point along the
8664                  * string that could possibly match */
8665                 if  (ST.max == REG_INFTY) {
8666                     ST.maxpos = loceol - 1;
8667                     if (utf8_target)
8668                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8669                             ST.maxpos--;
8670                 }
8671                 else if (utf8_target) {
8672                     int m = ST.max - ST.min;
8673                     for (ST.maxpos = locinput;
8674                          m >0 && ST.maxpos <  loceol; m--)
8675                         ST.maxpos += UTF8SKIP(ST.maxpos);
8676                 }
8677                 else {
8678                     ST.maxpos = locinput + ST.max - ST.min;
8679                     if (ST.maxpos >=  loceol)
8680                         ST.maxpos =  loceol - 1;
8681                 }
8682                 goto curly_try_B_min_known;
8683
8684             }
8685             else {
8686                 /* avoid taking address of locinput, so it can remain
8687                  * a register var */
8688                 char *li = locinput;
8689                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
8690                 if (ST.count < ST.min)
8691                     sayNO;
8692                 SET_locinput(li);
8693                 if ((ST.count > ST.min)
8694                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8695                 {
8696                     /* A{m,n} must come at the end of the string, there's
8697                      * no point in backing off ... */
8698                     ST.min = ST.count;
8699                     /* ...except that $ and \Z can match before *and* after
8700                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8701                        We may back off by one in this case. */
8702                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8703                         ST.min--;
8704                 }
8705                 REGCP_SET(ST.cp);
8706                 goto curly_try_B_max;
8707             }
8708             NOT_REACHED; /* NOTREACHED */
8709
8710         case CURLY_B_min_fail:
8711             /* failed to find B in a non-greedy match.
8712              * Handles both cases where c1,c2 valid or not */
8713
8714             REGCP_UNWIND(ST.cp);
8715             if (ST.paren) {
8716                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8717             }
8718
8719             if (ST.c1 == CHRTEST_VOID) {
8720                 /* failed -- move forward one */
8721                 char *li = locinput;
8722                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
8723                     sayNO;
8724                 }
8725                 locinput = li;
8726                 ST.count++;
8727                 if (!(   ST.count <= ST.max
8728                         /* count overflow ? */
8729                      || (ST.max == REG_INFTY && ST.count > 0))
8730                 )
8731                     sayNO;
8732             }
8733             else {
8734                 int n;
8735                 /* Couldn't or didn't -- move forward. */
8736                 ST.oldloc = locinput;
8737                 if (utf8_target)
8738                     locinput += UTF8SKIP(locinput);
8739                 else
8740                     locinput++;
8741                 ST.count++;
8742
8743               curly_try_B_min_known:
8744                 /* find the next place where 'B' could work, then call B */
8745                 if (utf8_target) {
8746                     n = (ST.oldloc == locinput) ? 0 : 1;
8747                     if (ST.c1 == ST.c2) {
8748                         /* set n to utf8_distance(oldloc, locinput) */
8749                         while (    locinput <= ST.maxpos
8750                                &&  locinput < loceol
8751                                &&  memNE(locinput, ST.c1_utf8,
8752                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8753                         {
8754                             locinput += UTF8_SAFE_SKIP(locinput,
8755                                                        reginfo->strend);
8756                             n++;
8757                         }
8758                     }
8759                     else {
8760                         /* set n to utf8_distance(oldloc, locinput) */
8761                         while (   locinput <= ST.maxpos
8762                                && locinput < loceol
8763                                && memNE(locinput, ST.c1_utf8,
8764                                      UTF8_SAFE_SKIP(locinput, reginfo->strend))
8765                                && memNE(locinput, ST.c2_utf8,
8766                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8767                         {
8768                             locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
8769                             n++;
8770                         }
8771                     }
8772                 }
8773                 else {  /* Not utf8_target */
8774                     if (ST.c1 == ST.c2) {
8775                         locinput = (char *) memchr(locinput,
8776                                                    ST.c1,
8777                                                    ST.maxpos + 1 - locinput);
8778                         if (! locinput) {
8779                             locinput = ST.maxpos + 1;
8780                         }
8781                     }
8782                     else {
8783                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8784
8785                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8786                             while (   locinput <= ST.maxpos
8787                                    && UCHARAT(locinput) != ST.c1
8788                                    && UCHARAT(locinput) != ST.c2)
8789                             {
8790                                 locinput++;
8791                             }
8792                         }
8793                         else {
8794                             /* If c1 and c2 only differ by a single bit, we can
8795                              * avoid a conditional each time through the loop,
8796                              * at the expense of a little preliminary setup and
8797                              * an extra mask each iteration.  By masking out
8798                              * that bit, we match exactly two characters, c1
8799                              * and c2, and so we don't have to test for both.
8800                              * On both ASCII and EBCDIC platforms, most of the
8801                              * ASCII-range and Latin1-range folded equivalents
8802                              * differ only in a single bit, so this is actually
8803                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8804                              * 0x61). */
8805                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8806                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8807                             while (   locinput <= ST.maxpos
8808                                    && (UCHARAT(locinput) & c1_c2_mask)
8809                                                                 != c1_masked)
8810                             {
8811                                 locinput++;
8812                             }
8813                         }
8814                     }
8815                     n = locinput - ST.oldloc;
8816                 }
8817                 if (locinput > ST.maxpos)
8818                     sayNO;
8819                 if (n) {
8820                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8821                      * at b; check that everything between oldloc and
8822                      * locinput matches */
8823                     char *li = ST.oldloc;
8824                     ST.count += n;
8825                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
8826                         sayNO;
8827                     assert(n == REG_INFTY || locinput == li);
8828                 }
8829             }
8830
8831           curly_try_B_min:
8832             CURLY_SETPAREN(ST.paren, ST.count);
8833             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
8834                             script_run_begin);
8835             NOT_REACHED; /* NOTREACHED */
8836
8837
8838           curly_try_B_max:
8839             /* a successful greedy match: now try to match B */
8840             {
8841                 bool could_match = locinput <  loceol;
8842
8843                 /* If it could work, try it. */
8844                 if (ST.c1 != CHRTEST_VOID && could_match) {
8845                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8846                     {
8847                         could_match =  memEQ(locinput, ST.c1_utf8,
8848                                              UTF8_SAFE_SKIP(locinput,
8849                                                             reginfo->strend))
8850                                     || memEQ(locinput, ST.c2_utf8,
8851                                              UTF8_SAFE_SKIP(locinput,
8852                                                             reginfo->strend));
8853                     }
8854                     else {
8855                         could_match =   UCHARAT(locinput) == ST.c1
8856                                      || UCHARAT(locinput) == ST.c2;
8857                     }
8858                 }
8859                 if (ST.c1 == CHRTEST_VOID || could_match) {
8860                     CURLY_SETPAREN(ST.paren, ST.count);
8861                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
8862                                     script_run_begin);
8863                     NOT_REACHED; /* NOTREACHED */
8864                 }
8865             }
8866             /* FALLTHROUGH */
8867
8868         case CURLY_B_max_fail:
8869             /* failed to find B in a greedy match */
8870
8871             REGCP_UNWIND(ST.cp);
8872             if (ST.paren) {
8873                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8874             }
8875             /*  back up. */
8876             if (--ST.count < ST.min)
8877                 sayNO;
8878             locinput = HOPc(locinput, -1);
8879             goto curly_try_B_max;
8880
8881 #undef ST
8882
8883         case END: /*  last op of main pattern  */
8884           fake_end:
8885             if (cur_eval) {
8886                 /* we've just finished A in /(??{A})B/; now continue with B */
8887                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8888                 st->u.eval.prev_rex = rex_sv;           /* inner */
8889
8890                 /* Save *all* the positions. */
8891                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8892                 rex_sv = CUR_EVAL.prev_rex;
8893                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8894                 SET_reg_curpm(rex_sv);
8895                 rex = ReANY(rex_sv);
8896                 rexi = RXi_GET(rex);
8897
8898                 st->u.eval.prev_curlyx = cur_curlyx;
8899                 cur_curlyx = CUR_EVAL.prev_curlyx;
8900
8901                 REGCP_SET(st->u.eval.lastcp);
8902
8903                 /* Restore parens of the outer rex without popping the
8904                  * savestack */
8905                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8906
8907                 st->u.eval.prev_eval = cur_eval;
8908                 cur_eval = CUR_EVAL.prev_eval;
8909                 DEBUG_EXECUTE_r(
8910                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8911                                       depth, cur_eval););
8912                 if ( nochange_depth )
8913                     nochange_depth--;
8914
8915                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8916
8917                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
8918                                     st->u.eval.prev_eval->u.eval.B,
8919                                     locinput, loceol, script_run_begin);
8920             }
8921
8922             if (locinput < reginfo->till) {
8923                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8924                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8925                                       PL_colors[4],
8926                                       (long)(locinput - startpos),
8927                                       (long)(reginfo->till - startpos),
8928                                       PL_colors[5]));
8929
8930                 sayNO_SILENT;           /* Cannot match: too short. */
8931             }
8932             sayYES;                     /* Success! */
8933
8934         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8935             DEBUG_EXECUTE_r(
8936             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8937                 depth, PL_colors[4], PL_colors[5]));
8938             sayYES;                     /* Success! */
8939
8940 #undef  ST
8941 #define ST st->u.ifmatch
8942
8943         case SUSPEND:   /* (?>A) */
8944             ST.wanted = 1;
8945             ST.start = locinput;
8946             ST.end = loceol;
8947             ST.count = 1;
8948             goto do_ifmatch;
8949
8950         case UNLESSM:   /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
8951             ST.wanted = 0;
8952             goto ifmatch_trivial_fail_test;
8953
8954         case IFMATCH:   /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
8955             ST.wanted = 1;
8956           ifmatch_trivial_fail_test:
8957             ST.count = scan->next_off + 1; /* next_off repurposed to be
8958                                               lookbehind count, requires
8959                                               non-zero flags */
8960             if (! scan->flags) {    /* 'flags' zero means lookahed */
8961
8962                 /* Lookahead starts here and ends at the normal place */
8963                 ST.start = locinput;
8964                 ST.end = loceol;
8965             }
8966             else {
8967                 PERL_UINT_FAST8_T back_count = scan->flags;
8968                 char * s;
8969
8970                 /* Lookbehind can look beyond the current position */
8971                 ST.end = loceol;
8972
8973                 /* ... and starts at the first place in the input that is in
8974                  * the range of the possible start positions */
8975                 for (; ST.count > 0; ST.count--, back_count--) {
8976                     s = HOPBACKc(locinput, back_count);
8977                     if (s) {
8978                         ST.start = s;
8979                         goto do_ifmatch;
8980                     }
8981                 }
8982
8983                 /* If the lookbehind doesn't start in the actual string, is a
8984                  * trivial match failure */
8985                 if (logical) {
8986                     logical = 0;
8987                     sw = 1 - cBOOL(ST.wanted);
8988                 }
8989                 else if (ST.wanted)
8990                     sayNO;
8991
8992                 /* Here, we didn't want it to match, so is actually success */
8993                 next = scan + ARG(scan);
8994                 if (next == scan)
8995                     next = NULL;
8996                 break;
8997             }
8998
8999           do_ifmatch:
9000             ST.me = scan;
9001             ST.logical = logical;
9002             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9003
9004             /* execute body of (?...A) */
9005             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9006                                 ST.end, script_run_begin);
9007             NOT_REACHED; /* NOTREACHED */
9008
9009         {
9010             bool matched;
9011
9012         case IFMATCH_A_fail: /* body of (?...A) failed */
9013             if (! ST.logical && ST.count > 1) {
9014
9015                 /* It isn't a real failure until we've tried all starting
9016                  * positions.  Move to the next starting position and retry */
9017                 ST.count--;
9018                 ST.start = HOPc(ST.start, 1);
9019                 scan = ST.me;
9020                 logical = ST.logical;
9021                 goto do_ifmatch;
9022             }
9023
9024             /* Here, all starting positions have been tried. */
9025             matched = FALSE;
9026             goto ifmatch_done;
9027
9028         case IFMATCH_A: /* body of (?...A) succeeded */
9029             matched = TRUE;
9030           ifmatch_done:
9031             sw = matched == ST.wanted;
9032             if (! ST.logical && !sw) {
9033                 sayNO;
9034             }
9035
9036             if (OP(ST.me) != SUSPEND) {
9037                 /* restore old position except for (?>...) */
9038                 locinput = st->locinput;
9039                 loceol = st->loceol;
9040                 script_run_begin = st->sr0;
9041             }
9042             scan = ST.me + ARG(ST.me);
9043             if (scan == ST.me)
9044                 scan = NULL;
9045             continue; /* execute B */
9046         }
9047
9048 #undef ST
9049
9050         case LONGJMP: /*  alternative with many branches compiles to
9051                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9052             next = scan + ARG(scan);
9053             if (next == scan)
9054                 next = NULL;
9055             break;
9056
9057         case COMMIT:  /*  (*COMMIT)  */
9058             reginfo->cutpoint = loceol;
9059             /* FALLTHROUGH */
9060
9061         case PRUNE:   /*  (*PRUNE)   */
9062             if (scan->flags)
9063                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9064             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9065                             script_run_begin);
9066             NOT_REACHED; /* NOTREACHED */
9067
9068         case COMMIT_next_fail:
9069             no_final = 1;
9070             /* FALLTHROUGH */
9071             sayNO;
9072             NOT_REACHED; /* NOTREACHED */
9073
9074         case OPFAIL:   /* (*FAIL)  */
9075             if (scan->flags)
9076                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9077             if (logical) {
9078                 /* deal with (?(?!)X|Y) properly,
9079                  * make sure we trigger the no branch
9080                  * of the trailing IFTHEN structure*/
9081                 sw= 0;
9082                 break;
9083             } else {
9084                 sayNO;
9085             }
9086             NOT_REACHED; /* NOTREACHED */
9087
9088 #define ST st->u.mark
9089         case MARKPOINT: /*  (*MARK:foo)  */
9090             ST.prev_mark = mark_state;
9091             ST.mark_name = sv_commit = sv_yes_mark
9092                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9093             mark_state = st;
9094             ST.mark_loc = locinput;
9095             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9096                                 script_run_begin);
9097             NOT_REACHED; /* NOTREACHED */
9098
9099         case MARKPOINT_next:
9100             mark_state = ST.prev_mark;
9101             sayYES;
9102             NOT_REACHED; /* NOTREACHED */
9103
9104         case MARKPOINT_next_fail:
9105             if (popmark && sv_eq(ST.mark_name,popmark))
9106             {
9107                 if (ST.mark_loc > startpoint)
9108                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9109                 popmark = NULL; /* we found our mark */
9110                 sv_commit = ST.mark_name;
9111
9112                 DEBUG_EXECUTE_r({
9113                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9114                             depth,
9115                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9116                 });
9117             }
9118             mark_state = ST.prev_mark;
9119             sv_yes_mark = mark_state ?
9120                 mark_state->u.mark.mark_name : NULL;
9121             sayNO;
9122             NOT_REACHED; /* NOTREACHED */
9123
9124         case SKIP:  /*  (*SKIP)  */
9125             if (!scan->flags) {
9126                 /* (*SKIP) : if we fail we cut here*/
9127                 ST.mark_name = NULL;
9128                 ST.mark_loc = locinput;
9129                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9130                                 script_run_begin);
9131             } else {
9132                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9133                    otherwise do nothing.  Meaning we need to scan
9134                  */
9135                 regmatch_state *cur = mark_state;
9136                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9137
9138                 while (cur) {
9139                     if ( sv_eq( cur->u.mark.mark_name,
9140                                 find ) )
9141                     {
9142                         ST.mark_name = find;
9143                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9144                                          script_run_begin);
9145                     }
9146                     cur = cur->u.mark.prev_mark;
9147                 }
9148             }
9149             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9150             break;
9151
9152         case SKIP_next_fail:
9153             if (ST.mark_name) {
9154                 /* (*CUT:NAME) - Set up to search for the name as we
9155                    collapse the stack*/
9156                 popmark = ST.mark_name;
9157             } else {
9158                 /* (*CUT) - No name, we cut here.*/
9159                 if (ST.mark_loc > startpoint)
9160                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9161                 /* but we set sv_commit to latest mark_name if there
9162                    is one so they can test to see how things lead to this
9163                    cut */
9164                 if (mark_state)
9165                     sv_commit=mark_state->u.mark.mark_name;
9166             }
9167             no_final = 1;
9168             sayNO;
9169             NOT_REACHED; /* NOTREACHED */
9170 #undef ST
9171
9172         case LNBREAK: /* \R */
9173             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9174                 locinput += n;
9175             } else
9176                 sayNO;
9177             break;
9178
9179         default:
9180             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9181                           PTR2UV(scan), OP(scan));
9182             Perl_croak(aTHX_ "regexp memory corruption");
9183
9184         /* this is a point to jump to in order to increment
9185          * locinput by one character */
9186           increment_locinput:
9187             assert(!NEXTCHR_IS_EOS);
9188             if (utf8_target) {
9189                 locinput += PL_utf8skip[nextchr];
9190                 /* locinput is allowed to go 1 char off the end (signifying
9191                  * EOS), but not 2+ */
9192                 if (locinput >  loceol)
9193                     sayNO;
9194             }
9195             else
9196                 locinput++;
9197             break;
9198
9199         } /* end switch */
9200
9201         /* switch break jumps here */
9202         scan = next; /* prepare to execute the next op and ... */
9203         continue;    /* ... jump back to the top, reusing st */
9204         /* NOTREACHED */
9205
9206       push_yes_state:
9207         /* push a state that backtracks on success */
9208         st->u.yes.prev_yes_state = yes_state;
9209         yes_state = st;
9210         /* FALLTHROUGH */
9211       push_state:
9212         /* push a new regex state, then continue at scan  */
9213         {
9214             regmatch_state *newst;
9215             DECLARE_AND_GET_RE_DEBUG_FLAGS;
9216
9217             DEBUG_r( /* DEBUG_STACK_r */
9218               if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9219                 regmatch_state *cur = st;
9220                 regmatch_state *curyes = yes_state;
9221                 U32 i;
9222                 regmatch_slab *slab = PL_regmatch_slab;
9223                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9224                     if (cur < SLAB_FIRST(slab)) {
9225                         slab = slab->prev;
9226                         cur = SLAB_LAST(slab);
9227                     }
9228                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9229                         depth,
9230                         i ? "    " : "push",
9231                         depth - i, PL_reg_name[cur->resume_state],
9232                         (curyes == cur) ? "yes" : ""
9233                     );
9234                     if (curyes == cur)
9235                         curyes = cur->u.yes.prev_yes_state;
9236                 }
9237             } else {
9238                 DEBUG_STATE_pp("push")
9239             });
9240             depth++;
9241             st->locinput = locinput;
9242             st->loceol = loceol;
9243             st->sr0 = script_run_begin;
9244             newst = st+1;
9245             if (newst >  SLAB_LAST(PL_regmatch_slab))
9246                 newst = S_push_slab(aTHX);
9247             PL_regmatch_state = newst;
9248
9249             locinput = pushinput;
9250             loceol = pusheol;
9251             script_run_begin = pushsr0;
9252             st = newst;
9253             continue;
9254             /* NOTREACHED */
9255         }
9256     }
9257 #ifdef SOLARIS_BAD_OPTIMIZER
9258 #  undef PL_charclass
9259 #endif
9260
9261     /*
9262     * We get here only if there's trouble -- normally "case END" is
9263     * the terminating point.
9264     */
9265     Perl_croak(aTHX_ "corrupted regexp pointers");
9266     NOT_REACHED; /* NOTREACHED */
9267
9268   yes:
9269     if (yes_state) {
9270         /* we have successfully completed a subexpression, but we must now
9271          * pop to the state marked by yes_state and continue from there */
9272         assert(st != yes_state);
9273 #ifdef DEBUGGING
9274         while (st != yes_state) {
9275             st--;
9276             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9277                 PL_regmatch_slab = PL_regmatch_slab->prev;
9278                 st = SLAB_LAST(PL_regmatch_slab);
9279             }
9280             DEBUG_STATE_r({
9281                 if (no_final) {
9282                     DEBUG_STATE_pp("pop (no final)");
9283                 } else {
9284                     DEBUG_STATE_pp("pop (yes)");
9285                 }
9286             });
9287             depth--;
9288         }
9289 #else
9290         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9291             || yes_state > SLAB_LAST(PL_regmatch_slab))
9292         {
9293             /* not in this slab, pop slab */
9294             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9295             PL_regmatch_slab = PL_regmatch_slab->prev;
9296             st = SLAB_LAST(PL_regmatch_slab);
9297         }
9298         depth -= (st - yes_state);
9299 #endif
9300         st = yes_state;
9301         yes_state = st->u.yes.prev_yes_state;
9302         PL_regmatch_state = st;
9303
9304         if (no_final) {
9305             locinput= st->locinput;
9306             loceol= st->loceol;
9307             script_run_begin = st->sr0;
9308         }
9309         state_num = st->resume_state + no_final;
9310         goto reenter_switch;
9311     }
9312
9313     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9314                           PL_colors[4], PL_colors[5]));
9315
9316     if (reginfo->info_aux_eval) {
9317         /* each successfully executed (?{...}) block does the equivalent of
9318          *   local $^R = do {...}
9319          * When popping the save stack, all these locals would be undone;
9320          * bypass this by setting the outermost saved $^R to the latest
9321          * value */
9322         /* I dont know if this is needed or works properly now.
9323          * see code related to PL_replgv elsewhere in this file.
9324          * Yves
9325          */
9326         if (oreplsv != GvSV(PL_replgv)) {
9327             sv_setsv(oreplsv, GvSV(PL_replgv));
9328             SvSETMAGIC(oreplsv);
9329         }
9330     }
9331     result = 1;
9332     goto final_exit;
9333
9334   no:
9335     DEBUG_EXECUTE_r(
9336         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9337             depth,
9338             PL_colors[4], PL_colors[5])
9339         );
9340
9341   no_silent:
9342     if (no_final) {
9343         if (yes_state) {
9344             goto yes;
9345         } else {
9346             goto final_exit;
9347         }
9348     }
9349     if (depth) {
9350         /* there's a previous state to backtrack to */
9351         st--;
9352         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9353             PL_regmatch_slab = PL_regmatch_slab->prev;
9354             st = SLAB_LAST(PL_regmatch_slab);
9355         }
9356         PL_regmatch_state = st;
9357         locinput= st->locinput;
9358         loceol= st->loceol;
9359         script_run_begin = st->sr0;
9360
9361         DEBUG_STATE_pp("pop");
9362         depth--;
9363         if (yes_state == st)
9364             yes_state = st->u.yes.prev_yes_state;
9365
9366         state_num = st->resume_state + 1; /* failure = success + 1 */
9367         PERL_ASYNC_CHECK();
9368         goto reenter_switch;
9369     }
9370     result = 0;
9371
9372   final_exit:
9373     if (rex->intflags & PREGf_VERBARG_SEEN) {
9374         SV *sv_err = get_sv("REGERROR", 1);
9375         SV *sv_mrk = get_sv("REGMARK", 1);
9376         if (result) {
9377             sv_commit = &PL_sv_no;
9378             if (!sv_yes_mark)
9379                 sv_yes_mark = &PL_sv_yes;
9380         } else {
9381             if (!sv_commit)
9382                 sv_commit = &PL_sv_yes;
9383             sv_yes_mark = &PL_sv_no;
9384         }
9385         assert(sv_err);
9386         assert(sv_mrk);
9387         sv_setsv(sv_err, sv_commit);
9388         sv_setsv(sv_mrk, sv_yes_mark);
9389     }
9390
9391
9392     if (last_pushed_cv) {
9393         dSP;
9394         /* see "Some notes about MULTICALL" above */
9395         POP_MULTICALL;
9396         PERL_UNUSED_VAR(SP);
9397     }
9398     else
9399         LEAVE_SCOPE(orig_savestack_ix);
9400
9401     assert(!result ||  locinput - reginfo->strbeg >= 0);
9402     return result ?  locinput - reginfo->strbeg : -1;
9403 }
9404
9405 /*
9406  - regrepeat - repeatedly match something simple, report how many
9407  *
9408  * What 'simple' means is a node which can be the operand of a quantifier like
9409  * '+', or {1,3}
9410  *
9411  * startposp - pointer to a pointer to the start position.  This is updated
9412  *             to point to the byte following the highest successful
9413  *             match.
9414  * p         - the regnode to be repeatedly matched against.
9415  * loceol    - pointer to the end position beyond which we aren't supposed to
9416  *             look.
9417  * reginfo   - struct holding match state, such as utf8_target
9418  * max       - maximum number of things to match.
9419  * depth     - (for debugging) backtracking depth.
9420  */
9421 STATIC I32
9422 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9423             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9424 {
9425     dVAR;
9426     char *scan;     /* Pointer to current position in target string */
9427     I32 c;
9428     char *this_eol = loceol;   /* potentially adjusted version. */
9429     I32 hardcount = 0;  /* How many matches so far */
9430     bool utf8_target = reginfo->is_utf8_target;
9431     unsigned int to_complement = 0;  /* Invert the result? */
9432     UV utf8_flags = 0;
9433     _char_class_number classnum;
9434
9435     PERL_ARGS_ASSERT_REGREPEAT;
9436
9437     /* This routine is structured so that we switch on the input OP.  Each OP
9438      * case: statement contains a loop to repeatedly apply the OP, advancing
9439      * the input until it fails, or reaches the end of the input, or until it
9440      * reaches the upper limit of matches. */
9441
9442     scan = *startposp;
9443     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9444                                max */
9445         max = I32_MAX;
9446     else if (! utf8_target && this_eol - scan > max)
9447         this_eol = scan + max;
9448
9449     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9450      * to the maximum of how far we should go in it (leaving it set to the real
9451      * end, if the maximum permissible would take us beyond that).  This allows
9452      * us to make the loop exit condition that we haven't gone past <this_eol> to
9453      * also mean that we haven't exceeded the max permissible count, saving a
9454      * test each time through the loops.  But it assumes that the OP matches a
9455      * single byte, which is true for most of the OPs below when applied to a
9456      * non-UTF-8 target.  Those relatively few OPs that don't have this
9457      * characteristic will have to compensate.
9458      *
9459      * There is no adjustment for UTF-8 targets, as the number of bytes per
9460      * character varies.  OPs will have to test both that the count is less
9461      * than the max permissible (using <hardcount> to keep track), and that we
9462      * are still within the bounds of the string (using <this_eol>.  A few OPs
9463      * match a single byte no matter what the encoding.  They can omit the max
9464      * test if, for the UTF-8 case, they do the adjustment that was skipped
9465      * above.
9466      *
9467      * Thus, the code above sets things up for the common case; and exceptional
9468      * cases need extra work; the common case is to make sure <scan> doesn't
9469      * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9470      * count doesn't exceed the maximum permissible */
9471
9472     switch (OP(p)) {
9473     case REG_ANY:
9474         if (utf8_target) {
9475             while (scan < this_eol && hardcount < max && *scan != '\n') {
9476                 scan += UTF8SKIP(scan);
9477                 hardcount++;
9478             }
9479         } else {
9480             scan = (char *) memchr(scan, '\n', this_eol - scan);
9481             if (! scan) {
9482                 scan = this_eol;
9483             }
9484         }
9485         break;
9486     case SANY:
9487         if (utf8_target) {
9488             while (scan < this_eol && hardcount < max) {
9489                 scan += UTF8SKIP(scan);
9490                 hardcount++;
9491             }
9492         }
9493         else
9494             scan = this_eol;
9495         break;
9496
9497     case LEXACT_REQ8:
9498         if (! utf8_target) {
9499             break;
9500         }
9501         /* FALLTHROUGH */
9502
9503     case LEXACT:
9504       {
9505         U8 * string;
9506         Size_t str_len;
9507
9508         string = (U8 *) STRINGl(p);
9509         str_len = STR_LENl(p);
9510         goto join_short_long_exact;
9511
9512     case EXACTL:
9513         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9514         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9515             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9516         }
9517         goto do_exact;
9518
9519     case EXACT_REQ8:
9520         if (! utf8_target) {
9521             break;
9522         }
9523         /* FALLTHROUGH */
9524     case EXACT:
9525       do_exact:
9526         string = (U8 *) STRINGs(p);
9527         str_len = STR_LENs(p);
9528
9529       join_short_long_exact:
9530         assert(str_len == reginfo->is_utf8_pat ? UTF8SKIP(string) : 1);
9531
9532         c = *string;
9533
9534         /* Can use a simple find if the pattern char to match on is invariant
9535          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9536          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9537          * true iff it doesn't matter if the argument is in UTF-8 or not */
9538         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9539             if (utf8_target && this_eol - scan > max) {
9540                 /* We didn't adjust <this_eol> because is UTF-8, but ok to do so,
9541                  * since here, to match at all, 1 char == 1 byte */
9542                 this_eol = scan + max;
9543             }
9544             scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9545         }
9546         else if (reginfo->is_utf8_pat) {
9547             if (utf8_target) {
9548                 STRLEN scan_char_len;
9549
9550                 /* When both target and pattern are UTF-8, we have to do
9551                  * string EQ */
9552                 while (hardcount < max
9553                        && scan < this_eol
9554                        && (scan_char_len = UTF8SKIP(scan)) <= str_len
9555                        && memEQ(scan, string, scan_char_len))
9556                 {
9557                     scan += scan_char_len;
9558                     hardcount++;
9559                 }
9560             }
9561             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9562
9563                 /* Target isn't utf8; convert the character in the UTF-8
9564                  * pattern to non-UTF8, and do a simple find */
9565                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(string + 1));
9566                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9567             } /* else pattern char is above Latin1, can't possibly match the
9568                  non-UTF-8 target */
9569         }
9570         else {
9571
9572             /* Here, the string must be utf8; pattern isn't, and <c> is
9573              * different in utf8 than not, so can't compare them directly.
9574              * Outside the loop, find the two utf8 bytes that represent c, and
9575              * then look for those in sequence in the utf8 string */
9576             U8 high = UTF8_TWO_BYTE_HI(c);
9577             U8 low = UTF8_TWO_BYTE_LO(c);
9578
9579             while (hardcount < max
9580                     && scan + 1 < this_eol
9581                     && UCHARAT(scan) == high
9582                     && UCHARAT(scan + 1) == low)
9583             {
9584                 scan += 2;
9585                 hardcount++;
9586             }
9587         }
9588         break;
9589       }
9590
9591     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9592         assert(! reginfo->is_utf8_pat);
9593         /* FALLTHROUGH */
9594     case EXACTFAA:
9595         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9596         if (reginfo->is_utf8_pat || ! utf8_target) {
9597
9598             /* The possible presence of a MICRO SIGN in the pattern forbids us
9599              * to view a non-UTF-8 pattern as folded when there is a UTF-8
9600              * target.  */
9601             utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
9602         }
9603         goto do_exactf;
9604
9605     case EXACTFL:
9606         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9607         utf8_flags = FOLDEQ_LOCALE;
9608         goto do_exactf;
9609
9610     case EXACTF:   /* This node only generated for non-utf8 patterns */
9611         assert(! reginfo->is_utf8_pat);
9612         goto do_exactf;
9613
9614     case EXACTFLU8:
9615         if (! utf8_target) {
9616             break;
9617         }
9618         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9619                                     | FOLDEQ_S2_FOLDS_SANE;
9620         goto do_exactf;
9621
9622     case EXACTFU_REQ8:
9623         if (! utf8_target) {
9624             break;
9625         }
9626         assert(reginfo->is_utf8_pat);
9627         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9628         goto do_exactf;
9629
9630     case EXACTFU:
9631         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9632         /* FALLTHROUGH */
9633
9634     case EXACTFUP:
9635
9636       do_exactf: {
9637         int c1, c2;
9638         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9639
9640         assert(STR_LENs(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1);
9641
9642         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9643                                         reginfo))
9644         {
9645             if (c1 == CHRTEST_VOID) {
9646                 /* Use full Unicode fold matching */
9647                 char *tmpeol = loceol;
9648                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1;
9649                 while (hardcount < max
9650                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9651                                              STRINGs(p), NULL, pat_len,
9652                                              reginfo->is_utf8_pat, utf8_flags))
9653                 {
9654                     scan = tmpeol;
9655                     tmpeol = loceol;
9656                     hardcount++;
9657                 }
9658             }
9659             else if (utf8_target) {
9660                 if (c1 == c2) {
9661                     while (scan < this_eol
9662                            && hardcount < max
9663                            && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9664                                                                   loceol)))
9665                     {
9666                         scan += UTF8SKIP(c1_utf8);
9667                         hardcount++;
9668                     }
9669                 }
9670                 else {
9671                     while (scan < this_eol
9672                            && hardcount < max
9673                            && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9674                                                                      loceol))
9675                                || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
9676                                                                      loceol))))
9677                     {
9678                         scan += UTF8_SAFE_SKIP(scan, loceol);
9679                         hardcount++;
9680                     }
9681                 }
9682             }
9683             else if (c1 == c2) {
9684                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1);
9685             }
9686             else {
9687                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9688                  * a conditional each time through the loop if the characters
9689                  * differ only in a single bit, as is the usual situation */
9690                 U8 c1_c2_bits_differing = c1 ^ c2;
9691
9692                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9693                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9694
9695                     scan = (char *) find_span_end_mask((U8 *) scan,
9696                                                        (U8 *) this_eol,
9697                                                        c1 & c1_c2_mask,
9698                                                        c1_c2_mask);
9699                 }
9700                 else {
9701                     while (    scan < this_eol
9702                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9703                     {
9704                         scan++;
9705                     }
9706                 }
9707             }
9708         }
9709         break;
9710     }
9711     case ANYOFPOSIXL:
9712     case ANYOFL:
9713         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9714
9715         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9716             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9717         }
9718         /* FALLTHROUGH */
9719     case ANYOFD:
9720     case ANYOF:
9721         if (utf8_target) {
9722             while (hardcount < max
9723                    && scan < this_eol
9724                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
9725             {
9726                 scan += UTF8SKIP(scan);
9727                 hardcount++;
9728             }
9729         }
9730         else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
9731             while (scan < this_eol
9732                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9733                 scan++;
9734         }
9735         else {
9736             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9737                 scan++;
9738         }
9739         break;
9740
9741     case ANYOFM:
9742         if (utf8_target && this_eol - scan > max) {
9743
9744             /* We didn't adjust <this_eol> at the beginning of this routine
9745              * because is UTF-8, but it is actually ok to do so, since here, to
9746              * match, 1 char == 1 byte. */
9747             this_eol = scan + max;
9748         }
9749
9750         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9751         break;
9752
9753     case NANYOFM:
9754         if (utf8_target) {
9755             while (     hardcount < max
9756                    &&   scan < this_eol
9757                    &&  (*scan & FLAGS(p)) != ARG(p))
9758             {
9759                 scan += UTF8SKIP(scan);
9760                 hardcount++;
9761             }
9762         }
9763         else {
9764             scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9765         }
9766         break;
9767
9768     case ANYOFH:
9769         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9770             while (  hardcount < max
9771                    && scan < this_eol
9772                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
9773                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9774             {
9775                 scan += UTF8SKIP(scan);
9776                 hardcount++;
9777             }
9778         }
9779         break;
9780
9781     case ANYOFHb:
9782         if (utf8_target) {  /* ANYOFHb only can match UTF-8 targets */
9783
9784             /* we know the first byte must be the FLAGS field */
9785             while (   hardcount < max
9786                    && scan < this_eol
9787                    && (U8) *scan == ANYOF_FLAGS(p)
9788                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
9789                                                               TRUE))
9790             {
9791                 scan += UTF8SKIP(scan);
9792                 hardcount++;
9793             }
9794         }
9795         break;
9796
9797     case ANYOFHr:
9798         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9799             while (  hardcount < max
9800                    && scan < this_eol
9801                    && inRANGE(NATIVE_UTF8_TO_I8(*scan),
9802                               LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
9803                               HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
9804                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
9805                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9806             {
9807                 scan += UTF8SKIP(scan);
9808                 hardcount++;
9809             }
9810         }
9811         break;
9812
9813     case ANYOFHs:
9814         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9815             while (   hardcount < max
9816                    && scan + FLAGS(p) < this_eol
9817                    && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
9818                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9819             {
9820                 scan += UTF8SKIP(scan);
9821                 hardcount++;
9822             }
9823         }
9824         break;
9825
9826     case ANYOFR:
9827         if (utf8_target) {
9828             while (   hardcount < max
9829                    && scan < this_eol
9830                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
9831                    && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
9832                                                 (U8 *) this_eol,
9833                                                 NULL),
9834                                   ANYOFRbase(p), ANYOFRdelta(p)))
9835             {
9836                 scan += UTF8SKIP(scan);
9837                 hardcount++;
9838             }
9839         }
9840         else {
9841             while (   hardcount < max
9842                    && scan < this_eol
9843                    && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
9844             {
9845                 scan++;
9846                 hardcount++;
9847             }
9848         }
9849         break;
9850
9851     case ANYOFRb:
9852         if (utf8_target) {
9853             while (   hardcount < max
9854                    && scan < this_eol
9855                    && (U8) *scan == ANYOF_FLAGS(p)
9856                    && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
9857                                                 (U8 *) this_eol,
9858                                                 NULL),
9859                                   ANYOFRbase(p), ANYOFRdelta(p)))
9860             {
9861                 scan += UTF8SKIP(scan);
9862                 hardcount++;
9863             }
9864         }
9865         else {
9866             while (   hardcount < max
9867                    && scan < this_eol
9868                    && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
9869             {
9870                 scan++;
9871                 hardcount++;
9872             }
9873         }
9874         break;
9875
9876     /* The argument (FLAGS) to all the POSIX node types is the class number */
9877
9878     case NPOSIXL:
9879         to_complement = 1;
9880         /* FALLTHROUGH */
9881
9882     case POSIXL:
9883         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9884         if (! utf8_target) {
9885             while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9886                                                                    *scan)))
9887             {
9888                 scan++;
9889             }
9890         } else {
9891             while (hardcount < max && scan < this_eol
9892                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9893                                                                   (U8 *) scan,
9894                                                                   (U8 *) this_eol)))
9895             {
9896                 scan += UTF8SKIP(scan);
9897                 hardcount++;
9898             }
9899         }
9900         break;
9901
9902     case POSIXD:
9903         if (utf8_target) {
9904             goto utf8_posix;
9905         }
9906         /* FALLTHROUGH */
9907
9908     case POSIXA:
9909         if (utf8_target && this_eol - scan > max) {
9910
9911             /* We didn't adjust <this_eol> at the beginning of this routine
9912              * because is UTF-8, but it is actually ok to do so, since here, to
9913              * match, 1 char == 1 byte. */
9914             this_eol = scan + max;
9915         }
9916         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9917             scan++;
9918         }
9919         break;
9920
9921     case NPOSIXD:
9922         if (utf8_target) {
9923             to_complement = 1;
9924             goto utf8_posix;
9925         }
9926         /* FALLTHROUGH */
9927
9928     case NPOSIXA:
9929         if (! utf8_target) {
9930             while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9931                 scan++;
9932             }
9933         }
9934         else {
9935
9936             /* The complement of something that matches only ASCII matches all
9937              * non-ASCII, plus everything in ASCII that isn't in the class. */
9938             while (hardcount < max && scan < this_eol
9939                    && (   ! isASCII_utf8_safe(scan, loceol)
9940                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9941             {
9942                 scan += UTF8SKIP(scan);
9943                 hardcount++;
9944             }
9945         }
9946         break;
9947
9948     case NPOSIXU:
9949         to_complement = 1;
9950         /* FALLTHROUGH */
9951
9952     case POSIXU:
9953         if (! utf8_target) {
9954             while (scan < this_eol && to_complement
9955                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9956             {
9957                 scan++;
9958             }
9959         }
9960         else {
9961           utf8_posix:
9962             classnum = (_char_class_number) FLAGS(p);
9963             switch (classnum) {
9964                 default:
9965                     while (   hardcount < max && scan < this_eol
9966                            && to_complement ^ cBOOL(_invlist_contains_cp(
9967                                               PL_XPosix_ptrs[classnum],
9968                                               utf8_to_uvchr_buf((U8 *) scan,
9969                                                                 (U8 *) this_eol,
9970                                                                 NULL))))
9971                     {
9972                         scan += UTF8SKIP(scan);
9973                         hardcount++;
9974                     }
9975                     break;
9976
9977                     /* For the classes below, the knowledge of how to handle
9978                      * every code point is compiled in to Perl via a macro.
9979                      * This code is written for making the loops as tight as
9980                      * possible.  It could be refactored to save space instead.
9981                      * */
9982
9983                 case _CC_ENUM_SPACE:
9984                     while (hardcount < max
9985                            && scan < this_eol
9986                            && (to_complement
9987                                ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
9988                     {
9989                         scan += UTF8SKIP(scan);
9990                         hardcount++;
9991                     }
9992                     break;
9993                 case _CC_ENUM_BLANK:
9994                     while (hardcount < max
9995                            && scan < this_eol
9996                            && (to_complement
9997                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
9998                     {
9999                         scan += UTF8SKIP(scan);
10000                         hardcount++;
10001                     }
10002                     break;
10003                 case _CC_ENUM_XDIGIT:
10004                     while (hardcount < max
10005                            && scan < this_eol
10006                            && (to_complement
10007                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10008                     {
10009                         scan += UTF8SKIP(scan);
10010                         hardcount++;
10011                     }
10012                     break;
10013                 case _CC_ENUM_VERTSPACE:
10014                     while (hardcount < max
10015                            && scan < this_eol
10016                            && (to_complement
10017                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10018                     {
10019                         scan += UTF8SKIP(scan);
10020                         hardcount++;
10021                     }
10022                     break;
10023                 case _CC_ENUM_CNTRL:
10024                     while (hardcount < max
10025                            && scan < this_eol
10026                            && (to_complement
10027                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10028                     {
10029                         scan += UTF8SKIP(scan);
10030                         hardcount++;
10031                     }
10032                     break;
10033             }
10034         }
10035         break;
10036
10037     case LNBREAK:
10038         if (utf8_target) {
10039             while (hardcount < max && scan < this_eol &&
10040                     (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
10041                 scan += c;
10042                 hardcount++;
10043             }
10044         } else {
10045             /* LNBREAK can match one or two latin chars, which is ok, but we
10046              * have to use hardcount in this situation, and throw away the
10047              * adjustment to <this_eol> done before the switch statement */
10048             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
10049                 scan+=c;
10050                 hardcount++;
10051             }
10052         }
10053         break;
10054
10055     case BOUNDL:
10056     case NBOUNDL:
10057         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10058         /* FALLTHROUGH */
10059     case BOUND:
10060     case BOUNDA:
10061     case BOUNDU:
10062     case EOS:
10063     case GPOS:
10064     case KEEPS:
10065     case NBOUND:
10066     case NBOUNDA:
10067     case NBOUNDU:
10068     case OPFAIL:
10069     case SBOL:
10070     case SEOL:
10071         /* These are all 0 width, so match right here or not at all. */
10072         break;
10073
10074     default:
10075         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10076         NOT_REACHED; /* NOTREACHED */
10077
10078     }
10079
10080     if (hardcount)
10081         c = hardcount;
10082     else
10083         c = scan - *startposp;
10084     *startposp = scan;
10085
10086     DEBUG_r({
10087         DECLARE_AND_GET_RE_DEBUG_FLAGS;
10088         DEBUG_EXECUTE_r({
10089             SV * const prop = sv_newmortal();
10090             regprop(prog, prop, p, reginfo, NULL);
10091             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
10092                         depth, SvPVX_const(prop),(IV)c,(IV)max);
10093         });
10094     });
10095
10096     return(c);
10097 }
10098
10099 /*
10100  - reginclass - determine if a character falls into a character class
10101
10102   n is the ANYOF-type regnode
10103   p is the target string
10104   p_end points to one byte beyond the end of the target string
10105   utf8_target tells whether p is in UTF-8.
10106
10107   Returns true if matched; false otherwise.
10108
10109   Note that this can be a synthetic start class, a combination of various
10110   nodes, so things you think might be mutually exclusive, such as locale,
10111   aren't.  It can match both locale and non-locale
10112
10113  */
10114
10115 STATIC bool
10116 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10117 {
10118     dVAR;
10119     const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10120                         ? 0
10121                         : ANYOF_FLAGS(n);
10122     bool match = FALSE;
10123     UV c = *p;
10124
10125     PERL_ARGS_ASSERT_REGINCLASS;
10126
10127     /* If c is not already the code point, get it.  Note that
10128      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10129     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10130         STRLEN c_len = 0;
10131         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10132         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10133         if (c_len == (STRLEN)-1) {
10134             _force_out_malformed_utf8_message(p, p_end,
10135                                               utf8n_flags,
10136                                               1 /* 1 means die */ );
10137             NOT_REACHED; /* NOTREACHED */
10138         }
10139         if (     c > 255
10140             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10141             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10142         {
10143             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10144         }
10145     }
10146
10147     /* If this character is potentially in the bitmap, check it */
10148     if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10149         if (ANYOF_BITMAP_TEST(n, c))
10150             match = TRUE;
10151         else if ((flags
10152                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10153                   && OP(n) == ANYOFD
10154                   && ! utf8_target
10155                   && ! isASCII(c))
10156         {
10157             match = TRUE;
10158         }
10159         else if (flags & ANYOF_LOCALE_FLAGS) {
10160             if (  (flags & ANYOFL_FOLD)
10161                 && c < sizeof(PL_fold_locale)
10162                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10163             {
10164                 match = TRUE;
10165             }
10166             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
10167                      && c <= U8_MAX  /* param to isFOO_lc() */
10168             ) {
10169
10170                 /* The data structure is arranged so bits 0, 2, 4, ... are set
10171                  * if the class includes the Posix character class given by
10172                  * bit/2; and 1, 3, 5, ... are set if the class includes the
10173                  * complemented Posix class given by int(bit/2).  So we loop
10174                  * through the bits, each time changing whether we complement
10175                  * the result or not.  Suppose for the sake of illustration
10176                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
10177                  * is set, it means there is a match for this ANYOF node if the
10178                  * character is in the class given by the expression (0 / 2 = 0
10179                  * = \w).  If it is in that class, isFOO_lc() will return 1,
10180                  * and since 'to_complement' is 0, the result will stay TRUE,
10181                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
10182                  * bit 1 is 1.  That means there is a match if the character
10183                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
10184                  * but will on bit 1.  On the second iteration 'to_complement'
10185                  * will be 1, so the exclusive or will reverse things, so we
10186                  * are testing for \W.  On the third iteration, 'to_complement'
10187                  * will be 0, and we would be testing for \s; the fourth
10188                  * iteration would test for \S, etc.
10189                  *
10190                  * Note that this code assumes that all the classes are closed
10191                  * under folding.  For example, if a character matches \w, then
10192                  * its fold does too; and vice versa.  This should be true for
10193                  * any well-behaved locale for all the currently defined Posix
10194                  * classes, except for :lower: and :upper:, which are handled
10195                  * by the pseudo-class :cased: which matches if either of the
10196                  * other two does.  To get rid of this assumption, an outer
10197                  * loop could be used below to iterate over both the source
10198                  * character, and its fold (if different) */
10199
10200                 int count = 0;
10201                 int to_complement = 0;
10202
10203                 while (count < ANYOF_MAX) {
10204                     if (ANYOF_POSIXL_TEST(n, count)
10205                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
10206                     {
10207                         match = TRUE;
10208                         break;
10209                     }
10210                     count++;
10211                     to_complement ^= 1;
10212                 }
10213             }
10214         }
10215     }
10216
10217
10218     /* If the bitmap didn't (or couldn't) match, and something outside the
10219      * bitmap could match, try that. */
10220     if (!match) {
10221         if (c >= NUM_ANYOF_CODE_POINTS
10222             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10223         {
10224             match = TRUE;       /* Everything above the bitmap matches */
10225         }
10226             /* Here doesn't match everything above the bitmap.  If there is
10227              * some information available beyond the bitmap, we may find a
10228              * match in it.  If so, this is most likely because the code point
10229              * is outside the bitmap range.  But rarely, it could be because of
10230              * some other reason.  If so, various flags are set to indicate
10231              * this possibility.  On ANYOFD nodes, there may be matches that
10232              * happen only when the target string is UTF-8; or for other node
10233              * types, because runtime lookup is needed, regardless of the
10234              * UTF-8ness of the target string.  Finally, under /il, there may
10235              * be some matches only possible if the locale is a UTF-8 one. */
10236         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
10237                  && (   c >= NUM_ANYOF_CODE_POINTS
10238                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10239                          && (   UNLIKELY(OP(n) != ANYOFD)
10240                              || (utf8_target && ! isASCII_uni(c)
10241 #                               if NUM_ANYOF_CODE_POINTS > 256
10242                                                                  && c < 256
10243 #                               endif
10244                                 )))
10245                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10246                          && IN_UTF8_CTYPE_LOCALE)))
10247         {
10248             SV* only_utf8_locale = NULL;
10249             SV * const definition =
10250 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10251                 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10252                                             &only_utf8_locale, NULL);
10253 #else
10254                 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10255                                              &only_utf8_locale, NULL);
10256 #endif
10257             if (definition) {
10258                 U8 utf8_buffer[2];
10259                 U8 * utf8_p;
10260                 if (utf8_target) {
10261                     utf8_p = (U8 *) p;
10262                 } else { /* Convert to utf8 */
10263                     utf8_p = utf8_buffer;
10264                     append_utf8_from_native_byte(*p, &utf8_p);
10265                     utf8_p = utf8_buffer;
10266                 }
10267
10268                 /* Turkish locales have these hard-coded rules overriding
10269                  * normal ones */
10270                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
10271                     && isALPHA_FOLD_EQ(*p, 'i'))
10272                 {
10273                     if (*p == 'i') {
10274                         if (_invlist_contains_cp(definition,
10275                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10276                         {
10277                             match = TRUE;
10278                         }
10279                     }
10280                     else if (*p == 'I') {
10281                         if (_invlist_contains_cp(definition,
10282                                                 LATIN_SMALL_LETTER_DOTLESS_I))
10283                         {
10284                             match = TRUE;
10285                         }
10286                     }
10287                 }
10288                 else if (_invlist_contains_cp(definition, c)) {
10289                     match = TRUE;
10290                 }
10291             }
10292             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10293                 match = _invlist_contains_cp(only_utf8_locale, c);
10294             }
10295         }
10296
10297         /* In a Turkic locale under folding, hard-code the I i case pair
10298          * matches */
10299         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10300             && ! match
10301             &&   (flags & ANYOFL_FOLD)
10302             &&   utf8_target)
10303         {
10304             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10305                 if (ANYOF_BITMAP_TEST(n, 'i')) {
10306                     match = TRUE;
10307                 }
10308             }
10309             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10310                 if (ANYOF_BITMAP_TEST(n, 'I')) {
10311                     match = TRUE;
10312                 }
10313             }
10314         }
10315
10316         if (UNICODE_IS_SUPER(c)
10317             && (flags
10318                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10319             && OP(n) != ANYOFD
10320             && ckWARN_d(WARN_NON_UNICODE))
10321         {
10322             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10323                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10324         }
10325     }
10326
10327 #if ANYOF_INVERT != 1
10328     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10329      * use it, don't */
10330 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10331 #endif
10332
10333     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10334     return (flags & ANYOF_INVERT) ^ match;
10335 }
10336
10337 STATIC U8 *
10338 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10339 {
10340     /* return the position 'off' UTF-8 characters away from 's', forward if
10341      * 'off' >= 0, backwards if negative.  But don't go outside of position
10342      * 'lim', which better be < s  if off < 0 */
10343
10344     PERL_ARGS_ASSERT_REGHOP3;
10345
10346     if (off >= 0) {
10347         while (off-- && s < lim) {
10348             /* XXX could check well-formedness here */
10349             U8 *new_s = s + UTF8SKIP(s);
10350             if (new_s > lim) /* lim may be in the middle of a long character */
10351                 return s;
10352             s = new_s;
10353         }
10354     }
10355     else {
10356         while (off++ && s > lim) {
10357             s--;
10358             if (UTF8_IS_CONTINUED(*s)) {
10359                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10360                     s--;
10361                 if (! UTF8_IS_START(*s)) {
10362                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10363                 }
10364             }
10365             /* XXX could check well-formedness here */
10366         }
10367     }
10368     return s;
10369 }
10370
10371 STATIC U8 *
10372 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10373 {
10374     PERL_ARGS_ASSERT_REGHOP4;
10375
10376     if (off >= 0) {
10377         while (off-- && s < rlim) {
10378             /* XXX could check well-formedness here */
10379             s += UTF8SKIP(s);
10380         }
10381     }
10382     else {
10383         while (off++ && s > llim) {
10384             s--;
10385             if (UTF8_IS_CONTINUED(*s)) {
10386                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10387                     s--;
10388                 if (! UTF8_IS_START(*s)) {
10389                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10390                 }
10391             }
10392             /* XXX could check well-formedness here */
10393         }
10394     }
10395     return s;
10396 }
10397
10398 /* like reghop3, but returns NULL on overrun, rather than returning last
10399  * char pos */
10400
10401 STATIC U8 *
10402 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10403 {
10404     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10405
10406     if (off >= 0) {
10407         while (off-- && s < lim) {
10408             /* XXX could check well-formedness here */
10409             s += UTF8SKIP(s);
10410         }
10411         if (off >= 0)
10412             return NULL;
10413     }
10414     else {
10415         while (off++ && s > lim) {
10416             s--;
10417             if (UTF8_IS_CONTINUED(*s)) {
10418                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10419                     s--;
10420                 if (! UTF8_IS_START(*s)) {
10421                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10422                 }
10423             }
10424             /* XXX could check well-formedness here */
10425         }
10426         if (off <= 0)
10427             return NULL;
10428     }
10429     return s;
10430 }
10431
10432
10433 /* when executing a regex that may have (?{}), extra stuff needs setting
10434    up that will be visible to the called code, even before the current
10435    match has finished. In particular:
10436
10437    * $_ is localised to the SV currently being matched;
10438    * pos($_) is created if necessary, ready to be updated on each call-out
10439      to code;
10440    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10441      isn't set until the current pattern is successfully finished), so that
10442      $1 etc of the match-so-far can be seen;
10443    * save the old values of subbeg etc of the current regex, and  set then
10444      to the current string (again, this is normally only done at the end
10445      of execution)
10446 */
10447
10448 static void
10449 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10450 {
10451     MAGIC *mg;
10452     regexp *const rex = ReANY(reginfo->prog);
10453     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10454
10455     eval_state->rex = rex;
10456     eval_state->sv  = reginfo->sv;
10457
10458     if (reginfo->sv) {
10459         /* Make $_ available to executed code. */
10460         if (reginfo->sv != DEFSV) {
10461             SAVE_DEFSV;
10462             DEFSV_set(reginfo->sv);
10463         }
10464         /* will be dec'd by S_cleanup_regmatch_info_aux */
10465         SvREFCNT_inc_NN(reginfo->sv);
10466
10467         if (!(mg = mg_find_mglob(reginfo->sv))) {
10468             /* prepare for quick setting of pos */
10469             mg = sv_magicext_mglob(reginfo->sv);
10470             mg->mg_len = -1;
10471         }
10472         eval_state->pos_magic = mg;
10473         eval_state->pos       = mg->mg_len;
10474         eval_state->pos_flags = mg->mg_flags;
10475     }
10476     else
10477         eval_state->pos_magic = NULL;
10478
10479     if (!PL_reg_curpm) {
10480         /* PL_reg_curpm is a fake PMOP that we can attach the current
10481          * regex to and point PL_curpm at, so that $1 et al are visible
10482          * within a /(?{})/. It's just allocated once per interpreter the
10483          * first time its needed */
10484         Newxz(PL_reg_curpm, 1, PMOP);
10485 #ifdef USE_ITHREADS
10486         {
10487             SV* const repointer = &PL_sv_undef;
10488             /* this regexp is also owned by the new PL_reg_curpm, which
10489                will try to free it.  */
10490             av_push(PL_regex_padav, repointer);
10491             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10492             PL_regex_pad = AvARRAY(PL_regex_padav);
10493         }
10494 #endif
10495     }
10496     SET_reg_curpm(reginfo->prog);
10497     eval_state->curpm = PL_curpm;
10498     PL_curpm_under = PL_curpm;
10499     PL_curpm = PL_reg_curpm;
10500     if (RXp_MATCH_COPIED(rex)) {
10501         /*  Here is a serious problem: we cannot rewrite subbeg,
10502             since it may be needed if this match fails.  Thus
10503             $` inside (?{}) could fail... */
10504         eval_state->subbeg     = rex->subbeg;
10505         eval_state->sublen     = rex->sublen;
10506         eval_state->suboffset  = rex->suboffset;
10507         eval_state->subcoffset = rex->subcoffset;
10508 #ifdef PERL_ANY_COW
10509         eval_state->saved_copy = rex->saved_copy;
10510 #endif
10511         RXp_MATCH_COPIED_off(rex);
10512     }
10513     else
10514         eval_state->subbeg = NULL;
10515     rex->subbeg = (char *)reginfo->strbeg;
10516     rex->suboffset = 0;
10517     rex->subcoffset = 0;
10518     rex->sublen = reginfo->strend - reginfo->strbeg;
10519 }
10520
10521
10522 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10523
10524 static void
10525 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10526 {
10527     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10528     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10529     regmatch_slab *s;
10530
10531     Safefree(aux->poscache);
10532
10533     if (eval_state) {
10534
10535         /* undo the effects of S_setup_eval_state() */
10536
10537         if (eval_state->subbeg) {
10538             regexp * const rex = eval_state->rex;
10539             rex->subbeg     = eval_state->subbeg;
10540             rex->sublen     = eval_state->sublen;
10541             rex->suboffset  = eval_state->suboffset;
10542             rex->subcoffset = eval_state->subcoffset;
10543 #ifdef PERL_ANY_COW
10544             rex->saved_copy = eval_state->saved_copy;
10545 #endif
10546             RXp_MATCH_COPIED_on(rex);
10547         }
10548         if (eval_state->pos_magic)
10549         {
10550             eval_state->pos_magic->mg_len = eval_state->pos;
10551             eval_state->pos_magic->mg_flags =
10552                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10553                | (eval_state->pos_flags & MGf_BYTES);
10554         }
10555
10556         PL_curpm = eval_state->curpm;
10557         SvREFCNT_dec(eval_state->sv);
10558     }
10559
10560     PL_regmatch_state = aux->old_regmatch_state;
10561     PL_regmatch_slab  = aux->old_regmatch_slab;
10562
10563     /* free all slabs above current one - this must be the last action
10564      * of this function, as aux and eval_state are allocated within
10565      * slabs and may be freed here */
10566
10567     s = PL_regmatch_slab->next;
10568     if (s) {
10569         PL_regmatch_slab->next = NULL;
10570         while (s) {
10571             regmatch_slab * const osl = s;
10572             s = s->next;
10573             Safefree(osl);
10574         }
10575     }
10576 }
10577
10578
10579 STATIC void
10580 S_to_utf8_substr(pTHX_ regexp *prog)
10581 {
10582     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10583      * on the converted value */
10584
10585     int i = 1;
10586
10587     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10588
10589     do {
10590         if (prog->substrs->data[i].substr
10591             && !prog->substrs->data[i].utf8_substr) {
10592             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10593             prog->substrs->data[i].utf8_substr = sv;
10594             sv_utf8_upgrade(sv);
10595             if (SvVALID(prog->substrs->data[i].substr)) {
10596                 if (SvTAIL(prog->substrs->data[i].substr)) {
10597                     /* Trim the trailing \n that fbm_compile added last
10598                        time.  */
10599                     SvCUR_set(sv, SvCUR(sv) - 1);
10600                     /* Whilst this makes the SV technically "invalid" (as its
10601                        buffer is no longer followed by "\0") when fbm_compile()
10602                        adds the "\n" back, a "\0" is restored.  */
10603                     fbm_compile(sv, FBMcf_TAIL);
10604                 } else
10605                     fbm_compile(sv, 0);
10606             }
10607             if (prog->substrs->data[i].substr == prog->check_substr)
10608                 prog->check_utf8 = sv;
10609         }
10610     } while (i--);
10611 }
10612
10613 STATIC bool
10614 S_to_byte_substr(pTHX_ regexp *prog)
10615 {
10616     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10617      * on the converted value; returns FALSE if can't be converted. */
10618
10619     int i = 1;
10620
10621     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10622
10623     do {
10624         if (prog->substrs->data[i].utf8_substr
10625             && !prog->substrs->data[i].substr) {
10626             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10627             if (! sv_utf8_downgrade(sv, TRUE)) {
10628                 SvREFCNT_dec_NN(sv);
10629                 return FALSE;
10630             }
10631             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10632                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10633                     /* Trim the trailing \n that fbm_compile added last
10634                         time.  */
10635                     SvCUR_set(sv, SvCUR(sv) - 1);
10636                     fbm_compile(sv, FBMcf_TAIL);
10637                 } else
10638                     fbm_compile(sv, 0);
10639             }
10640             prog->substrs->data[i].substr = sv;
10641             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10642                 prog->check_substr = sv;
10643         }
10644     } while (i--);
10645
10646     return TRUE;
10647 }
10648
10649 #ifndef PERL_IN_XSUB_RE
10650
10651 bool
10652 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10653 {
10654     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10655      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10656      * the larger string bounded by 'strbeg' and 'strend'.
10657      *
10658      * 'cp' needs to be assigned (if not, a future version of the Unicode
10659      * Standard could make it something that combines with adjacent characters,
10660      * so code using it would then break), and there has to be a GCB break
10661      * before and after the character. */
10662
10663     dVAR;
10664
10665     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10666     const U8 * prev_cp_start;
10667
10668     PERL_ARGS_ASSERT_IS_GRAPHEME;
10669
10670     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10671         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10672     {
10673         /* These are considered graphemes */
10674         return TRUE;
10675     }
10676
10677     /* Otherwise, unassigned code points are forbidden */
10678     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10679                                     _invlist_search(PL_Assigned_invlist, cp))))
10680     {
10681         return FALSE;
10682     }
10683
10684     cp_gcb_val = getGCB_VAL_CP(cp);
10685
10686     /* Find the GCB value of the previous code point in the input */
10687     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10688     if (UNLIKELY(prev_cp_start == s)) {
10689         prev_cp_gcb_val = GCB_EDGE;
10690     }
10691     else {
10692         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10693     }
10694
10695     /* And check that is a grapheme boundary */
10696     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10697                 TRUE /* is UTF-8 encoded */ ))
10698     {
10699         return FALSE;
10700     }
10701
10702     /* Similarly verify there is a break between the current character and the
10703      * following one */
10704     s += UTF8SKIP(s);
10705     if (s >= strend) {
10706         next_cp_gcb_val = GCB_EDGE;
10707     }
10708     else {
10709         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10710     }
10711
10712     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10713 }
10714
10715 /*
10716 =head1 Unicode Support
10717
10718 =for apidoc isSCRIPT_RUN
10719
10720 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10721 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10722 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10723 two degenerate cases given below, this function returns TRUE iff all code
10724 points in it come from any combination of three "scripts" given by the Unicode
10725 "Script Extensions" property: Common, Inherited, and possibly one other.
10726 Additionally all decimal digits must come from the same consecutive sequence of
10727 10.
10728
10729 For example, if all the characters in the sequence are Greek, or Common, or
10730 Inherited, this function will return TRUE, provided any decimal digits in it
10731 are from the same block of digits in Common.  (These are the ASCII digits
10732 "0".."9" and additionally a block for full width forms of these, and several
10733 others used in mathematical notation.)   For scripts (unlike Greek) that have
10734 their own digits defined this will accept either digits from that set or from
10735 one of the Common digit sets, but not a combination of the two.  Some scripts,
10736 such as Arabic, have more than one set of digits.  All digits must come from
10737 the same set for this function to return TRUE.
10738
10739 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10740 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10741 C<SCX_INVALID> if the function returns FALSE.
10742
10743 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10744 will be C<SCX_INVALID>.
10745
10746 If the sequence contains a single code point which is unassigned to a character
10747 in the version of Unicode being used, the function will return TRUE, and the
10748 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10749 in the input sequence will result in the function treating the input as not
10750 being a script run.
10751
10752 The returned script will be C<SCX_Inherited> iff all the code points in it are
10753 from the Inherited script.
10754
10755 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10756 it are from the Inherited or Common scripts.
10757
10758 =cut
10759
10760 */
10761
10762 bool
10763 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10764 {
10765     /* Basically, it looks at each character in the sequence to see if the
10766      * above conditions are met; if not it fails.  It uses an inversion map to
10767      * find the enum corresponding to the script of each character.  But this
10768      * is complicated by the fact that a few code points can be in any of
10769      * several scripts.  The data has been constructed so that there are
10770      * additional enum values (all negative) for these situations.  The
10771      * absolute value of those is an index into another table which contains
10772      * pointers to auxiliary tables for each such situation.  Each aux array
10773      * lists all the scripts for the given situation.  There is another,
10774      * parallel, table that gives the number of entries in each aux table.
10775      * These are all defined in charclass_invlists.h */
10776
10777     /* XXX Here are the additional things UTS 39 says could be done:
10778      *
10779      * Forbid sequences of the same nonspacing mark
10780      *
10781      * Check to see that all the characters are in the sets of exemplar
10782      * characters for at least one language in the Unicode Common Locale Data
10783      * Repository [CLDR]. */
10784
10785     dVAR;
10786
10787     /* Things that match /\d/u */
10788     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10789     UV * decimals_array = invlist_array(decimals_invlist);
10790
10791     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10792      * not currently known) */
10793     UV zero_of_run = 0;
10794
10795     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10796     SCX_enum script_of_char = SCX_INVALID;
10797
10798     /* If the script remains not fully determined from iteration to iteration,
10799      * this is the current intersection of the possiblities.  */
10800     SCX_enum * intersection = NULL;
10801     PERL_UINT_FAST8_T intersection_len = 0;
10802
10803     bool retval = TRUE;
10804     SCX_enum * ret_script = NULL;
10805
10806     assert(send >= s);
10807
10808     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10809
10810     /* All code points in 0..255 are either Common or Latin, so must be a
10811      * script run.  We can return immediately unless we need to know which
10812      * script it is. */
10813     if (! utf8_target && LIKELY(send > s)) {
10814         if (ret_script == NULL) {
10815             return TRUE;
10816         }
10817
10818         /* If any character is Latin, the run is Latin */
10819         while (s < send) {
10820             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10821                 *ret_script = SCX_Latin;
10822                 return TRUE;
10823             }
10824         }
10825
10826         /* Here, all are Common */
10827         *ret_script = SCX_Common;
10828         return TRUE;
10829     }
10830
10831     /* Look at each character in the sequence */
10832     while (s < send) {
10833         /* If the current character being examined is a digit, this is the code
10834          * point of the zero for its sequence of 10 */
10835         UV zero_of_char;
10836
10837         UV cp;
10838
10839         /* The code allows all scripts to use the ASCII digits.  This is
10840          * because they are in the Common script.  Hence any ASCII ones found
10841          * are ok, unless and until a digit from another set has already been
10842          * encountered.  digit ranges in Common are not similarly blessed) */
10843         if (UNLIKELY(isDIGIT(*s))) {
10844             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10845                 retval = FALSE;
10846                 break;
10847             }
10848             if (zero_of_run) {
10849                 if (zero_of_run != '0') {
10850                     retval = FALSE;
10851                     break;
10852                 }
10853             }
10854             else {
10855                 zero_of_run = '0';
10856             }
10857             s++;
10858             continue;
10859         }
10860
10861         /* Here, isn't an ASCII digit.  Find the code point of the character */
10862         if (! UTF8_IS_INVARIANT(*s)) {
10863             Size_t len;
10864             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10865             s += len;
10866         }
10867         else {
10868             cp = *(s++);
10869         }
10870
10871         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10872          * digit in that script.  We can skip the rest of this code for this
10873          * character. */
10874         if (UNLIKELY(   zero_of_run
10875                      && cp >= zero_of_run
10876                      && cp - zero_of_run <= 9))
10877         {
10878             continue;
10879         }
10880
10881         /* Find the character's script.  The correct values are hard-coded here
10882          * for small-enough code points. */
10883         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10884                                unlikely to change */
10885             if (       cp > 255
10886                 || (   isALPHA_L1(cp)
10887                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10888             {
10889                 script_of_char = SCX_Latin;
10890             }
10891             else {
10892                 script_of_char = SCX_Common;
10893             }
10894         }
10895         else {
10896             script_of_char = _Perl_SCX_invmap[
10897                                        _invlist_search(PL_SCX_invlist, cp)];
10898         }
10899
10900         /* We arbitrarily accept a single unassigned character, but not in
10901          * combination with anything else, and not a run of them. */
10902         if (   UNLIKELY(script_of_run == SCX_Unknown)
10903             || UNLIKELY(   script_of_run != SCX_INVALID
10904                         && script_of_char == SCX_Unknown))
10905         {
10906             retval = FALSE;
10907             break;
10908         }
10909
10910         /* For the first character, or the run is inherited, the run's script
10911          * is set to the char's */
10912         if (   UNLIKELY(script_of_run == SCX_INVALID)
10913             || UNLIKELY(script_of_run == SCX_Inherited))
10914         {
10915             script_of_run = script_of_char;
10916         }
10917
10918         /* For the character's script to be Unknown, it must be the first
10919          * character in the sequence (for otherwise a test above would have
10920          * prevented us from reaching here), and we have set the run's script
10921          * to it.  Nothing further to be done for this character */
10922         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10923             continue;
10924         }
10925
10926         /* We accept 'inherited' script characters currently even at the
10927          * beginning.  (We know that no characters in Inherited are digits, or
10928          * we'd have to check for that) */
10929         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10930             continue;
10931         }
10932
10933         /* If the run so far is Common, and the new character isn't, change the
10934          * run's script to that of this character */
10935         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10936             script_of_run = script_of_char;
10937         }
10938
10939         /* Now we can see if the script of the new character is the same as
10940          * that of the run */
10941         if (LIKELY(script_of_char == script_of_run)) {
10942             /* By far the most common case */
10943             goto scripts_match;
10944         }
10945
10946         /* Here, the script of the run isn't Common.  But characters in Common
10947          * match any script */
10948         if (script_of_char == SCX_Common) {
10949             goto scripts_match;
10950         }
10951
10952 #ifndef HAS_SCX_AUX_TABLES
10953
10954         /* Too early a Unicode version to have a code point belonging to more
10955          * than one script, so, if the scripts don't exactly match, fail */
10956         PERL_UNUSED_VAR(intersection_len);
10957         retval = FALSE;
10958         break;
10959
10960 #else
10961
10962         /* Here there is no exact match between the character's script and the
10963          * run's.  And we've handled the special cases of scripts Unknown,
10964          * Inherited, and Common.
10965          *
10966          * Negative script numbers signify that the value may be any of several
10967          * scripts, and we need to look at auxiliary information to make our
10968          * deterimination.  But if both are non-negative, we can fail now */
10969         if (LIKELY(script_of_char >= 0)) {
10970             const SCX_enum * search_in;
10971             PERL_UINT_FAST8_T search_in_len;
10972             PERL_UINT_FAST8_T i;
10973
10974             if (LIKELY(script_of_run >= 0)) {
10975                 retval = FALSE;
10976                 break;
10977             }
10978
10979             /* Use the previously constructed set of possible scripts, if any.
10980              * */
10981             if (intersection) {
10982                 search_in = intersection;
10983                 search_in_len = intersection_len;
10984             }
10985             else {
10986                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10987                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10988             }
10989
10990             for (i = 0; i < search_in_len; i++) {
10991                 if (search_in[i] == script_of_char) {
10992                     script_of_run = script_of_char;
10993                     goto scripts_match;
10994                 }
10995             }
10996
10997             retval = FALSE;
10998             break;
10999         }
11000         else if (LIKELY(script_of_run >= 0)) {
11001             /* script of character could be one of several, but run is a single
11002              * script */
11003             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11004             const PERL_UINT_FAST8_T search_in_len
11005                                      = SCX_AUX_TABLE_lengths[-script_of_char];
11006             PERL_UINT_FAST8_T i;
11007
11008             for (i = 0; i < search_in_len; i++) {
11009                 if (search_in[i] == script_of_run) {
11010                     script_of_char = script_of_run;
11011                     goto scripts_match;
11012                 }
11013             }
11014
11015             retval = FALSE;
11016             break;
11017         }
11018         else {
11019             /* Both run and char could be in one of several scripts.  If the
11020              * intersection is empty, then this character isn't in this script
11021              * run.  Otherwise, we need to calculate the intersection to use
11022              * for future iterations of the loop, unless we are already at the
11023              * final character */
11024             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11025             const PERL_UINT_FAST8_T char_len
11026                                       = SCX_AUX_TABLE_lengths[-script_of_char];
11027             const SCX_enum * search_run;
11028             PERL_UINT_FAST8_T run_len;
11029
11030             SCX_enum * new_overlap = NULL;
11031             PERL_UINT_FAST8_T i, j;
11032
11033             if (intersection) {
11034                 search_run = intersection;
11035                 run_len = intersection_len;
11036             }
11037             else {
11038                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11039                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11040             }
11041
11042             intersection_len = 0;
11043
11044             for (i = 0; i < run_len; i++) {
11045                 for (j = 0; j < char_len; j++) {
11046                     if (search_run[i] == search_char[j]) {
11047
11048                         /* Here, the script at i,j matches.  That means this
11049                          * character is in the run.  But continue on to find
11050                          * the complete intersection, for the next loop
11051                          * iteration, and for the digit check after it.
11052                          *
11053                          * On the first found common script, we malloc space
11054                          * for the intersection list for the worst case of the
11055                          * intersection, which is the minimum of the number of
11056                          * scripts remaining in each set. */
11057                         if (intersection_len == 0) {
11058                             Newx(new_overlap,
11059                                  MIN(run_len - i, char_len - j),
11060                                  SCX_enum);
11061                         }
11062                         new_overlap[intersection_len++] = search_run[i];
11063                     }
11064                 }
11065             }
11066
11067             /* Here we've looked through everything.  If they have no scripts
11068              * in common, not a run */
11069             if (intersection_len == 0) {
11070                 retval = FALSE;
11071                 break;
11072             }
11073
11074             /* If there is only a single script in common, set to that.
11075              * Otherwise, use the intersection going forward */
11076             Safefree(intersection);
11077             intersection = NULL;
11078             if (intersection_len == 1) {
11079                 script_of_run = script_of_char = new_overlap[0];
11080                 Safefree(new_overlap);
11081                 new_overlap = NULL;
11082             }
11083             else {
11084                 intersection = new_overlap;
11085             }
11086         }
11087
11088 #endif
11089
11090   scripts_match:
11091
11092         /* Here, the script of the character is compatible with that of the
11093          * run.  That means that in most cases, it continues the script run.
11094          * Either it and the run match exactly, or one or both can be in any of
11095          * several scripts, and the intersection is not empty.  However, if the
11096          * character is a decimal digit, it could still mean failure if it is
11097          * from the wrong sequence of 10.  So, we need to look at if it's a
11098          * digit.  We've already handled the 10 decimal digits, and the next
11099          * lowest one is this one: */
11100         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11101             continue;   /* Not a digit; this character is part of the run */
11102         }
11103
11104         /* If we have a definitive '0' for the script of this character, we
11105          * know that for this to be a digit, it must be in the range of +0..+9
11106          * of that zero. */
11107         if (   script_of_char >= 0
11108             && (zero_of_char = script_zeros[script_of_char]))
11109         {
11110             if (   cp < zero_of_char
11111                 || cp > zero_of_char + 9)
11112             {
11113                 continue;   /* Not a digit; this character is part of the run
11114                              */
11115             }
11116
11117         }
11118         else {  /* Need to look up if this character is a digit or not */
11119             SSize_t index_of_zero_of_char;
11120             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11121             if (     UNLIKELY(index_of_zero_of_char < 0)
11122                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11123             {
11124                 continue;   /* Not a digit; this character is part of the run.
11125                              */
11126             }
11127
11128             zero_of_char = decimals_array[index_of_zero_of_char];
11129         }
11130
11131         /* Here, the character is a decimal digit, and the zero of its sequence
11132          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
11133          * they better be the same. */
11134         if (zero_of_run) {
11135             if (zero_of_run != zero_of_char) {
11136                 retval = FALSE;
11137                 break;
11138             }
11139         }
11140         else {  /* Otherwise we now have a zero for this run */
11141             zero_of_run = zero_of_char;
11142         }
11143     } /* end of looping through CLOSESR text */
11144
11145     Safefree(intersection);
11146
11147     if (ret_script != NULL) {
11148         if (retval) {
11149             *ret_script = script_of_run;
11150         }
11151         else {
11152             *ret_script = SCX_INVALID;
11153         }
11154     }
11155
11156     return retval;
11157 }
11158
11159 #endif /* ifndef PERL_IN_XSUB_RE */
11160
11161 /*
11162  * ex: set ts=8 sts=4 sw=4 et:
11163  */