regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "inline_invlist.c"
  84 #include "unicode_constants.h"
  85
  86 #ifdef DEBUGGING
  87 /* At least one required character in the target string is expressible only in
  88  * UTF-8. */
  89 static const char* const non_utf8_target_but_utf8_required
  90                 = "Can't match, because target string needs to be in UTF-8\n";
  91 #endif
  92
  93 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  94     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  95     goto target; \
  96 } STMT_END
  97
  98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 /* Valid only for non-utf8 strings: avoids the reginclass
 105  * call if there are no complications: i.e., if everything matchable is
 106  * straight forward in the bitmap */
 107 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 108                                               : ANYOF_BITMAP_TEST(p,*(c)))
 109
 110 /*
 111  * Forwards.
 112  */
 113
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 116
 117 #define HOPc(pos,off) \
 118         (char *)(reginfo->is_utf8_target \
 119             ? reghop3((U8*)pos, off, \
 120                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 121             : (U8*)(pos + off))
 122
 123 #define HOPBACKc(pos, off) \
 124         (char*)(reginfo->is_utf8_target \
 125             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 126             : (pos - off >= reginfo->strbeg)    \
 127                 ? (U8*)pos - off                \
 128                 : NULL)
 129
 130 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 131 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 132
 133 /* lim must be +ve. Returns NULL on overshoot */
 134 #define HOPMAYBE3(pos,off,lim) \
 135         (reginfo->is_utf8_target                        \
 136             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 137             : ((U8*)pos + off <= lim)                   \
 138                 ? (U8*)pos + off                        \
 139                 : NULL)
 140
 141 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 142  * off must be >=0; args should be vars rather than expressions */
 143 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 144     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 145     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 146
 147 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 148     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 149     : (U8*)(pos + off))
 150 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 151
 152 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 153 #define NEXTCHR_IS_EOS (nextchr < 0)
 154
 155 #define SET_nextchr \
 156     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 157
 158 #define SET_locinput(p) \
 159     locinput = (p);  \
 160     SET_nextchr
 161
 162
 163 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 164         if (!swash_ptr) {                                                     \
 165             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 166             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 167                                          1, 0, invlist, &flags);              \
 168             assert(swash_ptr);                                                \
 169         }                                                                     \
 170     } STMT_END
 171
 172 /* If in debug mode, we test that a known character properly matches */
 173 #ifdef DEBUGGING
 174 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 175                                           property_name,                      \
 176                                           invlist,                            \
 177                                           utf8_char_in_property)              \
 178         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 179         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 180 #else
 181 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 182                                           property_name,                      \
 183                                           invlist,                            \
 184                                           utf8_char_in_property)              \
 185         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 186 #endif
 187
 188 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 189                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 190                                         "",                                   \
 191                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 192                                         LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 193
 194 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 195     STMT_START {                                                              \
 196         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 197                                        "_X_regular_begin",                    \
 198                                        NULL,                                  \
 199                                        LATIN_CAPITAL_LETTER_SHARP_S_UTF8);    \
 200         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 201                                        "_X_extend",                           \
 202                                        NULL,                                  \
 203                                        COMBINING_GRAVE_ACCENT_UTF8);          \
 204     } STMT_END
 205
 206 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 207 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 208
 209 /* for use after a quantifier and before an EXACT-like node -- japhy */
 210 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 211  *
 212  * NOTE that *nothing* that affects backtracking should be in here, specifically
 213  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 214  * node that is in between two EXACT like nodes when ascertaining what the required
 215  * "follow" character is. This should probably be moved to regex compile time
 216  * although it may be done at run time beause of the REF possibility - more
 217  * investigation required. -- demerphq
 218 */
 219 #define JUMPABLE(rn) (                                                             \
 220     OP(rn) == OPEN ||                                                              \
 221     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 222     OP(rn) == EVAL ||                                                              \
 223     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 224     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 225     OP(rn) == KEEPS ||                                                             \
 226     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 227 )
 228 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 229
 230 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 231
 232 #if 0
 233 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 234    we don't need this definition.  XXX These are now out-of-sync*/
 235 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 236 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 237 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 238
 239 #else
 240 /* ... so we use this as its faster. */
 241 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 242 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 243 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 244 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 245
 246 #endif
 247
 248 /*
 249   Search for mandatory following text node; for lookahead, the text must
 250   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 251 */
 252 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 253     while (JUMPABLE(rn)) { \
 254         const OPCODE type = OP(rn); \
 255         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 256             rn = NEXTOPER(NEXTOPER(rn)); \
 257         else if (type == PLUS) \
 258             rn = NEXTOPER(rn); \
 259         else if (type == IFMATCH) \
 260             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 261         else rn += NEXT_OFF(rn); \
 262     } \
 263 } STMT_END
 264
 265 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 266  * These are for the pre-composed Hangul syllables, which are all in a
 267  * contiguous block and arranged there in such a way so as to facilitate
 268  * alorithmic determination of their characteristics.  As such, they don't need
 269  * a swash, but can be determined by simple arithmetic.  Almost all are
 270  * GCB=LVT, but every 28th one is a GCB=LV */
 271 #define SBASE 0xAC00    /* Start of block */
 272 #define SCount 11172    /* Length of block */
 273 #define TCount 28
 274
 275 #define SLAB_FIRST(s) (&(s)->states[0])
 276 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 277
 278 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 279 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 280 static regmatch_state * S_push_slab(pTHX);
 281
 282 #define REGCP_PAREN_ELEMS 3
 283 #define REGCP_OTHER_ELEMS 3
 284 #define REGCP_FRAME_ELEMS 1
 285 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 286  * are needed for the regexp context stack bookkeeping. */
 287
 288 STATIC CHECKPOINT
 289 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 290 {
 291     const int retval = PL_savestack_ix;
 292     const int paren_elems_to_push =
 293                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 294     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 295     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 296     I32 p;
 297     GET_RE_DEBUG_FLAGS_DECL;
 298
 299     PERL_ARGS_ASSERT_REGCPPUSH;
 300
 301     if (paren_elems_to_push < 0)
 302         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 303                    (int)paren_elems_to_push, (int)maxopenparen,
 304                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 305
 306     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 307         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 308                    " out of range (%lu-%ld)",
 309                    total_elems,
 310                    (unsigned long)maxopenparen,
 311                    (long)parenfloor);
 312
 313     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 314
 315     DEBUG_BUFFERS_r(
 316         if ((int)maxopenparen > (int)parenfloor)
 317             PerlIO_printf(Perl_debug_log,
 318                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 319                 PTR2UV(rex),
 320                 PTR2UV(rex->offs)
 321             );
 322     );
 323     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 324 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 325         SSPUSHIV(rex->offs[p].end);
 326         SSPUSHIV(rex->offs[p].start);
 327         SSPUSHINT(rex->offs[p].start_tmp);
 328         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 329             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 330             (UV)p,
 331             (IV)rex->offs[p].start,
 332             (IV)rex->offs[p].start_tmp,
 333             (IV)rex->offs[p].end
 334         ));
 335     }
 336 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 337     SSPUSHINT(maxopenparen);
 338     SSPUSHINT(rex->lastparen);
 339     SSPUSHINT(rex->lastcloseparen);
 340     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 341
 342     return retval;
 343 }
 344
 345 /* These are needed since we do not localize EVAL nodes: */
 346 #define REGCP_SET(cp)                                           \
 347     DEBUG_STATE_r(                                              \
 348             PerlIO_printf(Perl_debug_log,                       \
 349                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 350                 (IV)PL_savestack_ix));                          \
 351     cp = PL_savestack_ix
 352
 353 #define REGCP_UNWIND(cp)                                        \
 354     DEBUG_STATE_r(                                              \
 355         if (cp != PL_savestack_ix)                              \
 356             PerlIO_printf(Perl_debug_log,                       \
 357                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 358                 (IV)(cp), (IV)PL_savestack_ix));                \
 359     regcpblow(cp)
 360
 361 #define UNWIND_PAREN(lp, lcp)               \
 362     for (n = rex->lastparen; n > lp; n--)   \
 363         rex->offs[n].end = -1;              \
 364     rex->lastparen = n;                     \
 365     rex->lastcloseparen = lcp;
 366
 367
 368 STATIC void
 369 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 370 {
 371     UV i;
 372     U32 paren;
 373     GET_RE_DEBUG_FLAGS_DECL;
 374
 375     PERL_ARGS_ASSERT_REGCPPOP;
 376
 377     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 378     i = SSPOPUV;
 379     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 380     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 381     rex->lastcloseparen = SSPOPINT;
 382     rex->lastparen = SSPOPINT;
 383     *maxopenparen_p = SSPOPINT;
 384
 385     i -= REGCP_OTHER_ELEMS;
 386     /* Now restore the parentheses context. */
 387     DEBUG_BUFFERS_r(
 388         if (i || rex->lastparen + 1 <= rex->nparens)
 389             PerlIO_printf(Perl_debug_log,
 390                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 391                 PTR2UV(rex),
 392                 PTR2UV(rex->offs)
 393             );
 394     );
 395     paren = *maxopenparen_p;
 396     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 397         SSize_t tmps;
 398         rex->offs[paren].start_tmp = SSPOPINT;
 399         rex->offs[paren].start = SSPOPIV;
 400         tmps = SSPOPIV;
 401         if (paren <= rex->lastparen)
 402             rex->offs[paren].end = tmps;
 403         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 404             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 405             (UV)paren,
 406             (IV)rex->offs[paren].start,
 407             (IV)rex->offs[paren].start_tmp,
 408             (IV)rex->offs[paren].end,
 409             (paren > rex->lastparen ? "(skipped)" : ""));
 410         );
 411         paren--;
 412     }
 413 #if 1
 414     /* It would seem that the similar code in regtry()
 415      * already takes care of this, and in fact it is in
 416      * a better location to since this code can #if 0-ed out
 417      * but the code in regtry() is needed or otherwise tests
 418      * requiring null fields (pat.t#187 and split.t#{13,14}
 419      * (as of patchlevel 7877)  will fail.  Then again,
 420      * this code seems to be necessary or otherwise
 421      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 422      * --jhi updated by dapm */
 423     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 424         if (i > *maxopenparen_p)
 425             rex->offs[i].start = -1;
 426         rex->offs[i].end = -1;
 427         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 428             "    \\%"UVuf": %s   ..-1 undeffing\n",
 429             (UV)i,
 430             (i > *maxopenparen_p) ? "-1" : "  "
 431         ));
 432     }
 433 #endif
 434 }
 435
 436 /* restore the parens and associated vars at savestack position ix,
 437  * but without popping the stack */
 438
 439 STATIC void
 440 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 441 {
 442     I32 tmpix = PL_savestack_ix;
 443     PL_savestack_ix = ix;
 444     regcppop(rex, maxopenparen_p);
 445     PL_savestack_ix = tmpix;
 446 }
 447
 448 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 449
 450 STATIC bool
 451 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 452 {
 453     /* Returns a boolean as to whether or not 'character' is a member of the
 454      * Posix character class given by 'classnum' that should be equivalent to a
 455      * value in the typedef '_char_class_number'.
 456      *
 457      * Ideally this could be replaced by a just an array of function pointers
 458      * to the C library functions that implement the macros this calls.
 459      * However, to compile, the precise function signatures are required, and
 460      * these may vary from platform to to platform.  To avoid having to figure
 461      * out what those all are on each platform, I (khw) am using this method,
 462      * which adds an extra layer of function call overhead (unless the C
 463      * optimizer strips it away).  But we don't particularly care about
 464      * performance with locales anyway. */
 465
 466     switch ((_char_class_number) classnum) {
 467         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 468         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 469         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 470         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 471         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 472                                         || isUPPER_LC(character);
 473         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 474         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 475         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 476         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 477         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 478         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 479         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 480         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 481         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 482         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 483         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 484         default:    /* VERTSPACE should never occur in locales */
 485             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 486     }
 487
 488     NOT_REACHED; /* NOTREACHED */
 489     return FALSE;
 490 }
 491
 492 STATIC bool
 493 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 494 {
 495     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 496      * 'character' is a member of the Posix character class given by 'classnum'
 497      * that should be equivalent to a value in the typedef
 498      * '_char_class_number'.
 499      *
 500      * This just calls isFOO_lc on the code point for the character if it is in
 501      * the range 0-255.  Outside that range, all characters use Unicode
 502      * rules, ignoring any locale.  So use the Unicode function if this class
 503      * requires a swash, and use the Unicode macro otherwise. */
 504
 505     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 506
 507     if (UTF8_IS_INVARIANT(*character)) {
 508         return isFOO_lc(classnum, *character);
 509     }
 510     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 511         return isFOO_lc(classnum,
 512                         TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 513     }
 514
 515     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 516
 517     if (classnum < _FIRST_NON_SWASH_CC) {
 518
 519         /* Initialize the swash unless done already */
 520         if (! PL_utf8_swash_ptrs[classnum]) {
 521             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 522             PL_utf8_swash_ptrs[classnum] =
 523                     _core_swash_init("utf8",
 524                                      "",
 525                                      &PL_sv_undef, 1, 0,
 526                                      PL_XPosix_ptrs[classnum], &flags);
 527         }
 528
 529         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 530                                  character,
 531                                  TRUE /* is UTF */ ));
 532     }
 533
 534     switch ((_char_class_number) classnum) {
 535         case _CC_ENUM_SPACE:
 536         case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 537
 538         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 539         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 540         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 541         default:                 break;
 542     }
 543
 544     return FALSE; /* Things like CNTRL are always below 256 */
 545 }
 546
 547 /*
 548  * pregexec and friends
 549  */
 550
 551 #ifndef PERL_IN_XSUB_RE
 552 /*
 553  - pregexec - match a regexp against a string
 554  */
 555 I32
 556 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 557          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 558 /* stringarg: the point in the string at which to begin matching */
 559 /* strend:    pointer to null at end of string */
 560 /* strbeg:    real beginning of string */
 561 /* minend:    end of match must be >= minend bytes after stringarg. */
 562 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 563  *            itself is accessed via the pointers above */
 564 /* nosave:    For optimizations. */
 565 {
 566     PERL_ARGS_ASSERT_PREGEXEC;
 567
 568     return
 569         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 570                       nosave ? 0 : REXEC_COPY_STR);
 571 }
 572 #endif
 573
 574
 575
 576 /* re_intuit_start():
 577  *
 578  * Based on some optimiser hints, try to find the earliest position in the
 579  * string where the regex could match.
 580  *
 581  *   rx:     the regex to match against
 582  *   sv:     the SV being matched: only used for utf8 flag; the string
 583  *           itself is accessed via the pointers below. Note that on
 584  *           something like an overloaded SV, SvPOK(sv) may be false
 585  *           and the string pointers may point to something unrelated to
 586  *           the SV itself.
 587  *   strbeg: real beginning of string
 588  *   strpos: the point in the string at which to begin matching
 589  *   strend: pointer to the byte following the last char of the string
 590  *   flags   currently unused; set to 0
 591  *   data:   currently unused; set to NULL
 592  *
 593  * The basic idea of re_intuit_start() is to use some known information
 594  * about the pattern, namely:
 595  *
 596  *   a) the longest known anchored substring (i.e. one that's at a
 597  *      constant offset from the beginning of the pattern; but not
 598  *      necessarily at a fixed offset from the beginning of the
 599  *      string);
 600  *   b) the longest floating substring (i.e. one that's not at a constant
 601  *      offset from the beginning of the pattern);
 602  *   c) Whether the pattern is anchored to the string; either
 603  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 604  *      or anchored to pos(): /\G/;
 605  *   d) A start class: a real or synthetic character class which
 606  *      represents which characters are legal at the start of the pattern;
 607  *
 608  * to either quickly reject the match, or to find the earliest position
 609  * within the string at which the pattern might match, thus avoiding
 610  * running the full NFA engine at those earlier locations, only to
 611  * eventually fail and retry further along.
 612  *
 613  * Returns NULL if the pattern can't match, or returns the address within
 614  * the string which is the earliest place the match could occur.
 615  *
 616  * The longest of the anchored and floating substrings is called 'check'
 617  * and is checked first. The other is called 'other' and is checked
 618  * second. The 'other' substring may not be present.  For example,
 619  *
 620  *    /(abc|xyz)ABC\d{0,3}DEFG/
 621  *
 622  * will have
 623  *
 624  *   check substr (float)    = "DEFG", offset 6..9 chars
 625  *   other substr (anchored) = "ABC",  offset 3..3 chars
 626  *   stclass = [ax]
 627  *
 628  * Be aware that during the course of this function, sometimes 'anchored'
 629  * refers to a substring being anchored relative to the start of the
 630  * pattern, and sometimes to the pattern itself being anchored relative to
 631  * the string. For example:
 632  *
 633  *   /\dabc/:   "abc" is anchored to the pattern;
 634  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 635  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 636  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 637  *                    but the pattern is anchored to the string.
 638  */
 639
 640 char *
 641 Perl_re_intuit_start(pTHX_
 642                     REGEXP * const rx,
 643                     SV *sv,
 644                     const char * const strbeg,
 645                     char *strpos,
 646                     char *strend,
 647                     const U32 flags,
 648                     re_scream_pos_data *data)
 649 {
 650     struct regexp *const prog = ReANY(rx);
 651     SSize_t start_shift = prog->check_offset_min;
 652     /* Should be nonnegative! */
 653     SSize_t end_shift   = 0;
 654     /* current lowest pos in string where the regex can start matching */
 655     char *rx_origin = strpos;
 656     SV *check;
 657     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 658     U8   other_ix = 1 - prog->substrs->check_ix;
 659     bool ml_anch = 0;
 660     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 661     char *check_at = NULL;              /* check substr found at this pos */
 662     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 663     RXi_GET_DECL(prog,progi);
 664     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 665     regmatch_info *const reginfo = &reginfo_buf;
 666     GET_RE_DEBUG_FLAGS_DECL;
 667
 668     PERL_ARGS_ASSERT_RE_INTUIT_START;
 669     PERL_UNUSED_ARG(flags);
 670     PERL_UNUSED_ARG(data);
 671
 672     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 673                 "Intuit: trying to determine minimum start position...\n"));
 674
 675     /* for now, assume that all substr offsets are positive. If at some point
 676      * in the future someone wants to do clever things with look-behind and
 677      * -ve offsets, they'll need to fix up any code in this function
 678      * which uses these offsets. See the thread beginning
 679      * <20140113145929.GF27210@iabyn.com>
 680      */
 681     assert(prog->substrs->data[0].min_offset >= 0);
 682     assert(prog->substrs->data[0].max_offset >= 0);
 683     assert(prog->substrs->data[1].min_offset >= 0);
 684     assert(prog->substrs->data[1].max_offset >= 0);
 685     assert(prog->substrs->data[2].min_offset >= 0);
 686     assert(prog->substrs->data[2].max_offset >= 0);
 687
 688     /* for now, assume that if both present, that the floating substring
 689      * doesn't start before the anchored substring.
 690      * If you break this assumption (e.g. doing better optimisations
 691      * with lookahead/behind), then you'll need to audit the code in this
 692      * function carefully first
 693      */
 694     assert(
 695             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 696               && (prog->float_utf8    || prog->float_substr))
 697            || (prog->float_min_offset >= prog->anchored_offset));
 698
 699     /* byte rather than char calculation for efficiency. It fails
 700      * to quickly reject some cases that can't match, but will reject
 701      * them later after doing full char arithmetic */
 702     if (prog->minlen > strend - strpos) {
 703         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 704                               "  String too short...\n"));
 705         goto fail;
 706     }
 707
 708     RX_MATCH_UTF8_set(rx,utf8_target);
 709     reginfo->is_utf8_target = cBOOL(utf8_target);
 710     reginfo->info_aux = NULL;
 711     reginfo->strbeg = strbeg;
 712     reginfo->strend = strend;
 713     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 714     reginfo->intuit = 1;
 715     /* not actually used within intuit, but zero for safety anyway */
 716     reginfo->poscache_maxiter = 0;
 717
 718     if (utf8_target) {
 719         if (!prog->check_utf8 && prog->check_substr)
 720             to_utf8_substr(prog);
 721         check = prog->check_utf8;
 722     } else {
 723         if (!prog->check_substr && prog->check_utf8) {
 724             if (! to_byte_substr(prog)) {
 725                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 726             }
 727         }
 728         check = prog->check_substr;
 729     }
 730
 731     /* dump the various substring data */
 732     DEBUG_OPTIMISE_MORE_r({
 733         int i;
 734         for (i=0; i<=2; i++) {
 735             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 736                                   : prog->substrs->data[i].substr);
 737             if (!sv)
 738                 continue;
 739
 740             PerlIO_printf(Perl_debug_log,
 741                 "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 742                 " useful=%"IVdf" utf8=%d [%s]\n",
 743                 i,
 744                 (IV)prog->substrs->data[i].min_offset,
 745                 (IV)prog->substrs->data[i].max_offset,
 746                 (IV)prog->substrs->data[i].end_shift,
 747                 BmUSEFUL(sv),
 748                 utf8_target ? 1 : 0,
 749                 SvPEEK(sv));
 750         }
 751     });
 752
 753     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 754
 755         /* ml_anch: check after \n?
 756          *
 757          * A note about IMPLICIT: on an un-anchored pattern beginning
 758          * with /.*.../, these flags will have been added by the
 759          * compiler:
 760          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 761          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 762          */
 763         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 764                    && !(prog->intflags & PREGf_IMPLICIT);
 765
 766         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 767             /* we are only allowed to match at BOS or \G */
 768
 769             /* trivially reject if there's a BOS anchor and we're not at BOS.
 770              *
 771              * Note that we don't try to do a similar quick reject for
 772              * \G, since generally the caller will have calculated strpos
 773              * based on pos() and gofs, so the string is already correctly
 774              * anchored by definition; and handling the exceptions would
 775              * be too fiddly (e.g. REXEC_IGNOREPOS).
 776              */
 777             if (   strpos != strbeg
 778                 && (prog->intflags & PREGf_ANCH_SBOL))
 779             {
 780                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 781                                 "  Not at start...\n"));
 782                 goto fail;
 783             }
 784
 785             /* in the presence of an anchor, the anchored (relative to the
 786              * start of the regex) substr must also be anchored relative
 787              * to strpos. So quickly reject if substr isn't found there.
 788              * This works for \G too, because the caller will already have
 789              * subtracted gofs from pos, and gofs is the offset from the
 790              * \G to the start of the regex. For example, in /.abc\Gdef/,
 791              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 792              * caller will have set strpos=pos()-4; we look for the substr
 793              * at position pos()-4+1, which lines up with the "a" */
 794
 795             if (prog->check_offset_min == prog->check_offset_max
 796                 && !(prog->intflags & PREGf_CANY_SEEN))
 797             {
 798                 /* Substring at constant offset from beg-of-str... */
 799                 SSize_t slen = SvCUR(check);
 800                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 801
 802                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 803                     "  Looking for check substr at fixed offset %"IVdf"...\n",
 804                     (IV)prog->check_offset_min));
 805
 806                 if (SvTAIL(check)) {
 807                     /* In this case, the regex is anchored at the end too.
 808                      * Unless it's a multiline match, the lengths must match
 809                      * exactly, give or take a \n.  NB: slen >= 1 since
 810                      * the last char of check is \n */
 811                     if (!multiline
 812                         && (   strend - s > slen
 813                             || strend - s < slen - 1
 814                             || (strend - s == slen && strend[-1] != '\n')))
 815                     {
 816                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 817                                             "  String too long...\n"));
 818                         goto fail_finish;
 819                     }
 820                     /* Now should match s[0..slen-2] */
 821                     slen--;
 822                 }
 823                 if (slen && (*SvPVX_const(check) != *s
 824                     || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 825                 {
 826                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 827                                     "  String not equal...\n"));
 828                     goto fail_finish;
 829                 }
 830
 831                 check_at = s;
 832                 goto success_at_start;
 833             }
 834         }
 835     }
 836
 837     end_shift = prog->check_end_shift;
 838
 839 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 840     if (end_shift < 0)
 841         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 842                    (IV)end_shift, RX_PRECOMP(prog));
 843 #endif
 844
 845   restart:
 846
 847     /* This is the (re)entry point of the main loop in this function.
 848      * The goal of this loop is to:
 849      * 1) find the "check" substring in the region rx_origin..strend
 850      *    (adjusted by start_shift / end_shift). If not found, reject
 851      *    immediately.
 852      * 2) If it exists, look for the "other" substr too if defined; for
 853      *    example, if the check substr maps to the anchored substr, then
 854      *    check the floating substr, and vice-versa. If not found, go
 855      *    back to (1) with rx_origin suitably incremented.
 856      * 3) If we find an rx_origin position that doesn't contradict
 857      *    either of the substrings, then check the possible additional
 858      *    constraints on rx_origin of /^.../m or a known start class.
 859      *    If these fail, then depending on which constraints fail, jump
 860      *    back to here, or to various other re-entry points further along
 861      *    that skip some of the first steps.
 862      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 863      *    substring. If the start position was determined to be at the
 864      *    beginning of the string  - so, not rejected, but not optimised,
 865      *    since we have to run regmatch from position 0 - decrement the
 866      *    BmUSEFUL() count. Otherwise increment it.
 867      */
 868
 869
 870     /* first, look for the 'check' substring */
 871
 872     {
 873         U8* start_point;
 874         U8* end_point;
 875
 876         DEBUG_OPTIMISE_MORE_r({
 877             PerlIO_printf(Perl_debug_log,
 878                 "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 879                 " Start shift: %"IVdf" End shift %"IVdf
 880                 " Real end Shift: %"IVdf"\n",
 881                 (IV)(rx_origin - strpos),
 882                 (IV)prog->check_offset_min,
 883                 (IV)start_shift,
 884                 (IV)end_shift,
 885                 (IV)prog->check_end_shift);
 886         });
 887
 888         if (prog->intflags & PREGf_CANY_SEEN) {
 889             start_point= (U8*)(rx_origin + start_shift);
 890             end_point= (U8*)(strend - end_shift);
 891             if (start_point > end_point)
 892                 goto fail_finish;
 893         } else {
 894             end_point = HOP3(strend, -end_shift, strbeg);
 895             start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 896             if (!start_point)
 897                 goto fail_finish;
 898         }
 899
 900
 901         /* If the regex is absolutely anchored to either the start of the
 902          * string (SBOL) or to pos() (ANCH_GPOS), then
 903          * check_offset_max represents an upper bound on the string where
 904          * the substr could start. For the ANCH_GPOS case, we assume that
 905          * the caller of intuit will have already set strpos to
 906          * pos()-gofs, so in this case strpos + offset_max will still be
 907          * an upper bound on the substr.
 908          */
 909         if (!ml_anch
 910             && prog->intflags & PREGf_ANCH
 911             && prog->check_offset_max != SSize_t_MAX)
 912         {
 913             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 914             const char * const anchor =
 915                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 916
 917             /* do a bytes rather than chars comparison. It's conservative;
 918              * so it skips doing the HOP if the result can't possibly end
 919              * up earlier than the old value of end_point.
 920              */
 921             if ((char*)end_point - anchor > prog->check_offset_max) {
 922                 end_point = HOP3lim((U8*)anchor,
 923                                 prog->check_offset_max,
 924                                 end_point -len)
 925                             + len;
 926             }
 927         }
 928
 929         DEBUG_OPTIMISE_MORE_r({
 930             PerlIO_printf(Perl_debug_log, "  fbm_instr len=%d str=<%.*s>\n",
 931                 (int)(end_point - start_point),
 932                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 933                 start_point);
 934         });
 935
 936         check_at = fbm_instr( start_point, end_point,
 937                       check, multiline ? FBMrf_MULTILINE : 0);
 938
 939         /* Update the count-of-usability, remove useless subpatterns,
 940             unshift s.  */
 941
 942         DEBUG_EXECUTE_r({
 943             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 944                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 945             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 946                               (check_at ? "Found" : "Did not find"),
 947                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 948                     ? "anchored" : "floating"),
 949                 quoted,
 950                 RE_SV_TAIL(check),
 951                 (check_at ? " at offset " : "...\n") );
 952         });
 953
 954         if (!check_at)
 955             goto fail_finish;
 956         /* Finish the diagnostic message */
 957         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
 958
 959         /* set rx_origin to the minimum position where the regex could start
 960          * matching, given the constraint of the just-matched check substring.
 961          * But don't set it lower than previously.
 962          */
 963
 964         if (check_at - rx_origin > prog->check_offset_max)
 965             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 966     }
 967
 968
 969     /* now look for the 'other' substring if defined */
 970
 971     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 972                     : prog->substrs->data[other_ix].substr)
 973     {
 974         /* Take into account the "other" substring. */
 975         char *last, *last1;
 976         char *s;
 977         SV* must;
 978         struct reg_substr_datum *other;
 979
 980       do_other_substr:
 981         other = &prog->substrs->data[other_ix];
 982
 983         /* if "other" is anchored:
 984          * we've previously found a floating substr starting at check_at.
 985          * This means that the regex origin must lie somewhere
 986          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 987          * and max:                 HOP3(check_at, -check_offset_min)
 988          * (except that min will be >= strpos)
 989          * So the fixed  substr must lie somewhere between
 990          *  HOP3(min, anchored_offset)
 991          *  HOP3(max, anchored_offset) + SvCUR(substr)
 992          */
 993
 994         /* if "other" is floating
 995          * Calculate last1, the absolute latest point where the
 996          * floating substr could start in the string, ignoring any
 997          * constraints from the earlier fixed match. It is calculated
 998          * as follows:
 999          *
1000          * strend - prog->minlen (in chars) is the absolute latest
1001          * position within the string where the origin of the regex
1002          * could appear. The latest start point for the floating
1003          * substr is float_min_offset(*) on from the start of the
1004          * regex.  last1 simply combines thee two offsets.
1005          *
1006          * (*) You might think the latest start point should be
1007          * float_max_offset from the regex origin, and technically
1008          * you'd be correct. However, consider
1009          *    /a\d{2,4}bcd\w/
1010          * Here, float min, max are 3,5 and minlen is 7.
1011          * This can match either
1012          *    /a\d\dbcd\w/
1013          *    /a\d\d\dbcd\w/
1014          *    /a\d\d\d\dbcd\w/
1015          * In the first case, the regex matches minlen chars; in the
1016          * second, minlen+1, in the third, minlen+2.
1017          * In the first case, the floating offset is 3 (which equals
1018          * float_min), in the second, 4, and in the third, 5 (which
1019          * equals float_max). In all cases, the floating string bcd
1020          * can never start more than 4 chars from the end of the
1021          * string, which equals minlen - float_min. As the substring
1022          * starts to match more than float_min from the start of the
1023          * regex, it makes the regex match more than minlen chars,
1024          * and the two cancel each other out. So we can always use
1025          * float_min - minlen, rather than float_max - minlen for the
1026          * latest position in the string.
1027          *
1028          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1029          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1030          */
1031
1032         assert(prog->minlen >= other->min_offset);
1033         last1 = HOP3c(strend,
1034                         other->min_offset - prog->minlen, strbeg);
1035
1036         if (other_ix) {/* i.e. if (other-is-float) */
1037             /* last is the latest point where the floating substr could
1038              * start, *given* any constraints from the earlier fixed
1039              * match. This constraint is that the floating string starts
1040              * <= float_max_offset chars from the regex origin (rx_origin).
1041              * If this value is less than last1, use it instead.
1042              */
1043             assert(rx_origin <= last1);
1044             last =
1045                 /* this condition handles the offset==infinity case, and
1046                  * is a short-cut otherwise. Although it's comparing a
1047                  * byte offset to a char length, it does so in a safe way,
1048                  * since 1 char always occupies 1 or more bytes,
1049                  * so if a string range is  (last1 - rx_origin) bytes,
1050                  * it will be less than or equal to  (last1 - rx_origin)
1051                  * chars; meaning it errs towards doing the accurate HOP3
1052                  * rather than just using last1 as a short-cut */
1053                 (last1 - rx_origin) < other->max_offset
1054                     ? last1
1055                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1056         }
1057         else {
1058             assert(strpos + start_shift <= check_at);
1059             last = HOP4c(check_at, other->min_offset - start_shift,
1060                         strbeg, strend);
1061         }
1062
1063         s = HOP3c(rx_origin, other->min_offset, strend);
1064         if (s < other_last)     /* These positions already checked */
1065             s = other_last;
1066
1067         must = utf8_target ? other->utf8_substr : other->substr;
1068         assert(SvPOK(must));
1069         s = fbm_instr(
1070             (unsigned char*)s,
1071             (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1072             must,
1073             multiline ? FBMrf_MULTILINE : 0
1074         );
1075         DEBUG_EXECUTE_r({
1076             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1077                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1078             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1079                 s ? "Found" : "Contradicts",
1080                 other_ix ? "floating" : "anchored",
1081                 quoted, RE_SV_TAIL(must));
1082         });
1083
1084
1085         if (!s) {
1086             /* last1 is latest possible substr location. If we didn't
1087              * find it before there, we never will */
1088             if (last >= last1) {
1089                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1090                                         ", giving up...\n"));
1091                 goto fail_finish;
1092             }
1093
1094             /* try to find the check substr again at a later
1095              * position. Maybe next time we'll find the "other" substr
1096              * in range too */
1097             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1098                 ", trying %s at offset %ld...\n",
1099                 (other_ix ? "floating" : "anchored"),
1100                 (long)(HOP3c(check_at, 1, strend) - strpos)));
1101
1102             other_last = HOP3c(last, 1, strend) /* highest failure */;
1103             rx_origin =
1104                 other_ix /* i.e. if other-is-float */
1105                     ? HOP3c(rx_origin, 1, strend)
1106                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1107             goto restart;
1108         }
1109         else {
1110             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1111                   (long)(s - strpos)));
1112
1113             if (other_ix) { /* if (other-is-float) */
1114                 /* other_last is set to s, not s+1, since its possible for
1115                  * a floating substr to fail first time, then succeed
1116                  * second time at the same floating position; e.g.:
1117                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1118                  * The first time round, anchored and float match at
1119                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1120                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1121                  */
1122                 other_last = s;
1123             }
1124             else {
1125                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1126                 other_last = HOP3c(s, 1, strend);
1127             }
1128         }
1129     }
1130     else {
1131         DEBUG_OPTIMISE_MORE_r(
1132             PerlIO_printf(Perl_debug_log,
1133                 "  Check-only match: offset min:%"IVdf" max:%"IVdf
1134                 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1135                 " strend-strpos:%"IVdf"\n",
1136                 (IV)prog->check_offset_min,
1137                 (IV)prog->check_offset_max,
1138                 (IV)(check_at-strpos),
1139                 (IV)(rx_origin-strpos),
1140                 (IV)(rx_origin-check_at),
1141                 (IV)(strend-strpos)
1142             )
1143         );
1144     }
1145
1146   postprocess_substr_matches:
1147
1148     /* handle the extra constraint of /^.../m if present */
1149
1150     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1151         char *s;
1152
1153         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1154                         "  looking for /^/m anchor"));
1155
1156         /* we have failed the constraint of a \n before rx_origin.
1157          * Find the next \n, if any, even if it's beyond the current
1158          * anchored and/or floating substrings. Whether we should be
1159          * scanning ahead for the next \n or the next substr is debatable.
1160          * On the one hand you'd expect rare substrings to appear less
1161          * often than \n's. On the other hand, searching for \n means
1162          * we're effectively flipping been check_substr and "\n" on each
1163          * iteration as the current "rarest" string candidate, which
1164          * means for example that we'll quickly reject the whole string if
1165          * hasn't got a \n, rather than trying every substr position
1166          * first
1167          */
1168
1169         s = HOP3c(strend, - prog->minlen, strpos);
1170         if (s <= rx_origin ||
1171             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1172         {
1173             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1174                             "  Did not find /%s^%s/m...\n",
1175                             PL_colors[0], PL_colors[1]));
1176             goto fail_finish;
1177         }
1178
1179         /* earliest possible origin is 1 char after the \n.
1180          * (since *rx_origin == '\n', it's safe to ++ here rather than
1181          * HOP(rx_origin, 1)) */
1182         rx_origin++;
1183
1184         if (prog->substrs->check_ix == 0  /* check is anchored */
1185             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1186         {
1187             /* Position contradicts check-string; either because
1188              * check was anchored (and thus has no wiggle room),
1189              * or check was float and rx_origin is above the float range */
1190             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1191                 "  Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1192                 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1193             goto restart;
1194         }
1195
1196         /* if we get here, the check substr must have been float,
1197          * is in range, and we may or may not have had an anchored
1198          * "other" substr which still contradicts */
1199         assert(prog->substrs->check_ix); /* check is float */
1200
1201         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1202             /* whoops, the anchored "other" substr exists, so we still
1203              * contradict. On the other hand, the float "check" substr
1204              * didn't contradict, so just retry the anchored "other"
1205              * substr */
1206             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1207                 "  Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1208                 PL_colors[0], PL_colors[1],
1209                 (long)(rx_origin - strpos),
1210                 (long)(rx_origin - strpos + prog->anchored_offset)));
1211             goto do_other_substr;
1212         }
1213
1214         /* success: we don't contradict the found floating substring
1215          * (and there's no anchored substr). */
1216         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1217             "  Found /%s^%s/m at offset %ld...\n",
1218             PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1219     }
1220     else {
1221         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1222             "  (multiline anchor test skipped)\n"));
1223     }
1224
1225   success_at_start:
1226
1227
1228     /* if we have a starting character class, then test that extra constraint.
1229      * (trie stclasses are too expensive to use here, we are better off to
1230      * leave it to regmatch itself) */
1231
1232     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1233         const U8* const str = (U8*)STRING(progi->regstclass);
1234
1235         /* XXX this value could be pre-computed */
1236         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1237                     ?  (reginfo->is_utf8_pat
1238                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1239                         : STR_LEN(progi->regstclass))
1240                     : 1);
1241         char * endpos;
1242         char *s;
1243         /* latest pos that a matching float substr constrains rx start to */
1244         char *rx_max_float = NULL;
1245
1246         /* if the current rx_origin is anchored, either by satisfying an
1247          * anchored substring constraint, or a /^.../m constraint, then we
1248          * can reject the current origin if the start class isn't found
1249          * at the current position. If we have a float-only match, then
1250          * rx_origin is constrained to a range; so look for the start class
1251          * in that range. if neither, then look for the start class in the
1252          * whole rest of the string */
1253
1254         /* XXX DAPM it's not clear what the minlen test is for, and why
1255          * it's not used in the floating case. Nothing in the test suite
1256          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1257          * Here are some old comments, which may or may not be correct:
1258          *
1259          *   minlen == 0 is possible if regstclass is \b or \B,
1260          *   and the fixed substr is ''$.
1261          *   Since minlen is already taken into account, rx_origin+1 is
1262          *   before strend; accidentally, minlen >= 1 guaranties no false
1263          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1264          *   0) below assumes that regstclass does not come from lookahead...
1265          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1266          *   This leaves EXACTF-ish only, which are dealt with in
1267          *   find_byclass().
1268          */
1269
1270         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1271             endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1272         else if (prog->float_substr || prog->float_utf8) {
1273             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1274             endpos= HOP3c(rx_max_float, cl_l, strend);
1275         }
1276         else
1277             endpos= strend;
1278
1279         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1280             "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1281             " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1282               (IV)start_shift, (IV)(check_at - strbeg),
1283               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1284
1285         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1286                             reginfo);
1287         if (!s) {
1288             if (endpos == strend) {
1289                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1290                                 "  Could not match STCLASS...\n") );
1291                 goto fail;
1292             }
1293             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1294                                "  This position contradicts STCLASS...\n") );
1295             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1296                         && !(prog->intflags & PREGf_IMPLICIT))
1297                 goto fail;
1298
1299             /* Contradict one of substrings */
1300             if (prog->anchored_substr || prog->anchored_utf8) {
1301                 if (prog->substrs->check_ix == 1) { /* check is float */
1302                     /* Have both, check_string is floating */
1303                     assert(rx_origin + start_shift <= check_at);
1304                     if (rx_origin + start_shift != check_at) {
1305                         /* not at latest position float substr could match:
1306                          * Recheck anchored substring, but not floating.
1307                          * The condition above is in bytes rather than
1308                          * chars for efficiency. It's conservative, in
1309                          * that it errs on the side of doing 'goto
1310                          * do_other_substr', where a more accurate
1311                          * char-based calculation will be done */
1312                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1313                                   "  Looking for anchored substr starting at offset %ld...\n",
1314                                   (long)(other_last - strpos)) );
1315                         goto do_other_substr;
1316                     }
1317                 }
1318             }
1319             else {
1320                 /* float-only */
1321
1322                 if (ml_anch) {
1323                     /* In the presence of ml_anch, we might be able to
1324                      * find another \n without breaking the current float
1325                      * constraint. */
1326
1327                     /* strictly speaking this should be HOP3c(..., 1, ...),
1328                      * but since we goto a block of code that's going to
1329                      * search for the next \n if any, its safe here */
1330                     rx_origin++;
1331                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1332                               "  Looking for /%s^%s/m starting at offset %ld...\n",
1333                               PL_colors[0], PL_colors[1],
1334                               (long)(rx_origin - strpos)) );
1335                     goto postprocess_substr_matches;
1336                 }
1337
1338                 /* strictly speaking this can never be true; but might
1339                  * be if we ever allow intuit without substrings */
1340                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1341                     goto fail;
1342
1343                 rx_origin = rx_max_float;
1344             }
1345
1346             /* at this point, any matching substrings have been
1347              * contradicted. Start again... */
1348
1349             rx_origin = HOP3c(rx_origin, 1, strend);
1350
1351             /* uses bytes rather than char calculations for efficiency.
1352              * It's conservative: it errs on the side of doing 'goto restart',
1353              * where there is code that does a proper char-based test */
1354             if (rx_origin + start_shift + end_shift > strend) {
1355                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1356                                        "  Could not match STCLASS...\n") );
1357                 goto fail;
1358             }
1359             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1360                 "  Looking for %s substr starting at offset %ld...\n",
1361                 (prog->substrs->check_ix ? "floating" : "anchored"),
1362                 (long)(rx_origin + start_shift - strpos)) );
1363             goto restart;
1364         }
1365
1366         /* Success !!! */
1367
1368         if (rx_origin != s) {
1369             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1370                         "  By STCLASS: moving %ld --> %ld\n",
1371                                   (long)(rx_origin - strpos), (long)(s - strpos))
1372                    );
1373         }
1374         else {
1375             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1376                                   "  Does not contradict STCLASS...\n");
1377                    );
1378         }
1379     }
1380
1381     /* Decide whether using the substrings helped */
1382
1383     if (rx_origin != strpos) {
1384         /* Fixed substring is found far enough so that the match
1385            cannot start at strpos. */
1386
1387         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1388         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1389     }
1390     else {
1391         /* The found rx_origin position does not prohibit matching at
1392          * strpos, so calling intuit didn't gain us anything. Decrement
1393          * the BmUSEFUL() count on the check substring, and if we reach
1394          * zero, free it.  */
1395         if (!(prog->intflags & PREGf_NAUGHTY)
1396             && (utf8_target ? (
1397                 prog->check_utf8                /* Could be deleted already */
1398                 && --BmUSEFUL(prog->check_utf8) < 0
1399                 && (prog->check_utf8 == prog->float_utf8)
1400             ) : (
1401                 prog->check_substr              /* Could be deleted already */
1402                 && --BmUSEFUL(prog->check_substr) < 0
1403                 && (prog->check_substr == prog->float_substr)
1404             )))
1405         {
1406             /* If flags & SOMETHING - do not do it many times on the same match */
1407             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1408             /* XXX Does the destruction order has to change with utf8_target? */
1409             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1410             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1411             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1412             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1413             check = NULL;                       /* abort */
1414             /* XXXX This is a remnant of the old implementation.  It
1415                     looks wasteful, since now INTUIT can use many
1416                     other heuristics. */
1417             prog->extflags &= ~RXf_USE_INTUIT;
1418         }
1419     }
1420
1421     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1422             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1423              PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1424
1425     return rx_origin;
1426
1427   fail_finish:                          /* Substring not found */
1428     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1429         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1430   fail:
1431     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1432                           PL_colors[4], PL_colors[5]));
1433     return NULL;
1434 }
1435
1436
1437 #define DECL_TRIE_TYPE(scan) \
1438     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1439                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1440                  trie_utf8l, trie_flu8 }                                            \
1441                     trie_type = ((scan->flags == EXACT)                             \
1442                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1443                                  : (scan->flags == EXACTL)                          \
1444                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1445                                     : (scan->flags == EXACTFA)                      \
1446                                       ? (utf8_target                                \
1447                                          ? trie_utf8_exactfa_fold                   \
1448                                          : trie_latin_utf8_exactfa_fold)            \
1449                                       : (scan->flags == EXACTFLU8                   \
1450                                          ? trie_flu8                                \
1451                                          : (utf8_target                             \
1452                                            ? trie_utf8_fold                         \
1453                                            :   trie_latin_utf8_fold)))
1454
1455 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1456 STMT_START {                                                                        \
1457     STRLEN skiplen;                                                                 \
1458     U8 flags = FOLD_FLAGS_FULL;                                                     \
1459     switch (trie_type) {                                                            \
1460     case trie_flu8:                                                                 \
1461         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1462         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1463             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1464         }                                                                           \
1465         goto do_trie_utf8_fold;                                                     \
1466     case trie_utf8_exactfa_fold:                                                    \
1467         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1468         /* FALLTHROUGH */                                                           \
1469     case trie_utf8_fold:                                                            \
1470       do_trie_utf8_fold:                                                            \
1471         if ( foldlen>0 ) {                                                          \
1472             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1473             foldlen -= len;                                                         \
1474             uscan += len;                                                           \
1475             len=0;                                                                  \
1476         } else {                                                                    \
1477             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1478             len = UTF8SKIP(uc);                                                     \
1479             skiplen = UNISKIP( uvc );                                               \
1480             foldlen -= skiplen;                                                     \
1481             uscan = foldbuf + skiplen;                                              \
1482         }                                                                           \
1483         break;                                                                      \
1484     case trie_latin_utf8_exactfa_fold:                                              \
1485         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1486         /* FALLTHROUGH */                                                           \
1487     case trie_latin_utf8_fold:                                                      \
1488         if ( foldlen>0 ) {                                                          \
1489             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1490             foldlen -= len;                                                         \
1491             uscan += len;                                                           \
1492             len=0;                                                                  \
1493         } else {                                                                    \
1494             len = 1;                                                                \
1495             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1496             skiplen = UNISKIP( uvc );                                               \
1497             foldlen -= skiplen;                                                     \
1498             uscan = foldbuf + skiplen;                                              \
1499         }                                                                           \
1500         break;                                                                      \
1501     case trie_utf8l:                                                                \
1502         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1503         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1504             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1505         }                                                                           \
1506         /* FALLTHROUGH */                                                           \
1507     case trie_utf8:                                                                 \
1508         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1509         break;                                                                      \
1510     case trie_plain:                                                                \
1511         uvc = (UV)*uc;                                                              \
1512         len = 1;                                                                    \
1513     }                                                                               \
1514     if (uvc < 256) {                                                                \
1515         charid = trie->charmap[ uvc ];                                              \
1516     }                                                                               \
1517     else {                                                                          \
1518         charid = 0;                                                                 \
1519         if (widecharmap) {                                                          \
1520             SV** const svpp = hv_fetch(widecharmap,                                 \
1521                         (char*)&uvc, sizeof(UV), 0);                                \
1522             if (svpp)                                                               \
1523                 charid = (U16)SvIV(*svpp);                                          \
1524         }                                                                           \
1525     }                                                                               \
1526 } STMT_END
1527
1528 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1529     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1530                 startpos, doutf8)
1531
1532 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1533 STMT_START {                                              \
1534     while (s <= e) {                                      \
1535         if ( (COND)                                       \
1536              && (ln == 1 || folder(s, pat_string, ln))    \
1537              && (reginfo->intuit || regtry(reginfo, &s)) )\
1538             goto got_it;                                  \
1539         s++;                                              \
1540     }                                                     \
1541 } STMT_END
1542
1543 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1544 STMT_START {                                          \
1545     while (s < strend) {                              \
1546         CODE                                          \
1547         s += UTF8SKIP(s);                             \
1548     }                                                 \
1549 } STMT_END
1550
1551 #define REXEC_FBC_SCAN(CODE)                          \
1552 STMT_START {                                          \
1553     while (s < strend) {                              \
1554         CODE                                          \
1555         s++;                                          \
1556     }                                                 \
1557 } STMT_END
1558
1559 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1560 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1561     if (COND) {                                                \
1562         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1563             goto got_it;                                       \
1564         else                                                   \
1565             tmp = doevery;                                     \
1566     }                                                          \
1567     else                                                       \
1568         tmp = 1;                                               \
1569 )
1570
1571 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1572 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1573     if (COND) {                                                \
1574         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1575             goto got_it;                                       \
1576         else                                                   \
1577             tmp = doevery;                                     \
1578     }                                                          \
1579     else                                                       \
1580         tmp = 1;                                               \
1581 )
1582
1583 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1584     if (utf8_target) {                                         \
1585         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1586     }                                                          \
1587     else {                                                     \
1588         REXEC_FBC_CLASS_SCAN(COND);                            \
1589     }
1590
1591 /* The three macros below are slightly different versions of the same logic.
1592  *
1593  * The first is for /a and /aa when the target string is UTF-8.  This can only
1594  * match ascii, but it must advance based on UTF-8.   The other two handle the
1595  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1596  * for the boundary (or non-boundary) between a word and non-word character.
1597  * The utf8 and non-utf8 cases have the same logic, but the details must be
1598  * different.  Find the "wordness" of the character just prior to this one, and
1599  * compare it with the wordness of this one.  If they differ, we have a
1600  * boundary.  At the beginning of the string, pretend that the previous
1601  * character was a new-line.
1602  *
1603  * All these macros uncleanly have side-effects with each other and outside
1604  * variables.  So far it's been too much trouble to clean-up
1605  *
1606  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1607  *               a word character or not.
1608  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1609  *               word/non-word
1610  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1611  *
1612  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1613  * are looking for a boundary or for a non-boundary.  If we are looking for a
1614  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1615  * see if this tentative match actually works, and if so, to quit the loop
1616  * here.  And vice-versa if we are looking for a non-boundary.
1617  *
1618  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1619  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1620  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1621  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1622  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1623  * complement.  But in that branch we complement tmp, meaning that at the
1624  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1625  * which means at the top of the loop in the next iteration, it is
1626  * TEST_NON_UTF8(s-1) */
1627 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1628     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1629     tmp = TEST_NON_UTF8(tmp);                                                  \
1630     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1631         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1632             tmp = !tmp;                                                        \
1633             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1634         }                                                                      \
1635         else {                                                                 \
1636             IF_FAIL;                                                           \
1637         }                                                                      \
1638     );                                                                         \
1639
1640 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1641  * TEST_UTF8 is a macro that for the same input code points returns identically
1642  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1643 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1644     if (s == reginfo->strbeg) {                                                \
1645         tmp = '\n';                                                            \
1646     }                                                                          \
1647     else { /* Back-up to the start of the previous character */                \
1648         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1649         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1650                                                        0, UTF8_ALLOW_DEFAULT); \
1651     }                                                                          \
1652     tmp = TEST_UV(tmp);                                                        \
1653     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1654     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1655         if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1656             tmp = !tmp;                                                        \
1657             IF_SUCCESS;                                                        \
1658         }                                                                      \
1659         else {                                                                 \
1660             IF_FAIL;                                                           \
1661         }                                                                      \
1662     );
1663
1664 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1665  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1666  * macros below */
1667 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1668     if (utf8_target) {                                                         \
1669         UTF8_CODE                                                              \
1670     }                                                                          \
1671     else {  /* Not utf8 */                                                     \
1672         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1673         tmp = TEST_NON_UTF8(tmp);                                              \
1674         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1675             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1676                 IF_SUCCESS;                                                    \
1677                 tmp = !tmp;                                                    \
1678             }                                                                  \
1679             else {                                                             \
1680                 IF_FAIL;                                                       \
1681             }                                                                  \
1682         );                                                                     \
1683     }                                                                          \
1684     /* Here, things have been set up by the previous code so that tmp is the   \
1685      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1686      * utf8ness of the target).  We also have to check if this matches against \
1687      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1688      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1689      * string */                                                               \
1690     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1691         IF_SUCCESS;                                                            \
1692     }                                                                          \
1693     else {                                                                     \
1694         IF_FAIL;                                                               \
1695     }
1696
1697 /* This is the macro to use when we want to see if something that looks like it
1698  * could match, actually does, and if so exits the loop */
1699 #define REXEC_FBC_TRYIT                            \
1700     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1701         goto got_it
1702
1703 /* The only difference between the BOUND and NBOUND cases is that
1704  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1705  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1706  * with the other one being empty (PLACEHOLDER is defined as empty).
1707  *
1708  * The TEST_FOO parameters are for operating on different forms of input, but
1709  * all should be ones that return identically for the same underlying code
1710  * points */
1711 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1712     FBC_BOUND_COMMON(                                                          \
1713           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1714           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1715
1716 #define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                         \
1717     FBC_BOUND_COMMON(                                                          \
1718             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1719             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1720
1721 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1722     FBC_BOUND_COMMON(                                                          \
1723           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1724           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1725
1726 #define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                        \
1727     FBC_BOUND_COMMON(                                                          \
1728             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1729             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1730
1731
1732 /* We know what class REx starts with.  Try to find this position... */
1733 /* if reginfo->intuit, its a dryrun */
1734 /* annoyingly all the vars in this routine have different names from their counterparts
1735    in regmatch. /grrr */
1736 STATIC char *
1737 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1738     const char *strend, regmatch_info *reginfo)
1739 {
1740     dVAR;
1741     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1742     char *pat_string;   /* The pattern's exactish string */
1743     char *pat_end;          /* ptr to end char of pat_string */
1744     re_fold_t folder;   /* Function for computing non-utf8 folds */
1745     const U8 *fold_array;   /* array for folding ords < 256 */
1746     STRLEN ln;
1747     STRLEN lnc;
1748     U8 c1;
1749     U8 c2;
1750     char *e;
1751     I32 tmp = 1;        /* Scratch variable? */
1752     const bool utf8_target = reginfo->is_utf8_target;
1753     UV utf8_fold_flags = 0;
1754     const bool is_utf8_pat = reginfo->is_utf8_pat;
1755     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1756                                    with a result inverts that result, as 0^1 =
1757                                    1 and 1^1 = 0 */
1758     _char_class_number classnum;
1759
1760     RXi_GET_DECL(prog,progi);
1761
1762     PERL_ARGS_ASSERT_FIND_BYCLASS;
1763
1764     /* We know what class it must start with. */
1765     switch (OP(c)) {
1766     case ANYOFL:
1767         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1768         /* FALLTHROUGH */
1769     case ANYOF:
1770         if (utf8_target) {
1771             REXEC_FBC_UTF8_CLASS_SCAN(
1772                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1773         }
1774         else {
1775             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1776         }
1777         break;
1778     case CANY:
1779         REXEC_FBC_SCAN(
1780             if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1781                 goto got_it;
1782             else
1783                 tmp = doevery;
1784         );
1785         break;
1786
1787     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1788         assert(! is_utf8_pat);
1789         /* FALLTHROUGH */
1790     case EXACTFA:
1791         if (is_utf8_pat || utf8_target) {
1792             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1793             goto do_exactf_utf8;
1794         }
1795         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1796         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1797         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1798
1799     case EXACTF:   /* This node only generated for non-utf8 patterns */
1800         assert(! is_utf8_pat);
1801         if (utf8_target) {
1802             utf8_fold_flags = 0;
1803             goto do_exactf_utf8;
1804         }
1805         fold_array = PL_fold;
1806         folder = foldEQ;
1807         goto do_exactf_non_utf8;
1808
1809     case EXACTFL:
1810         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1811         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1812             utf8_fold_flags = FOLDEQ_LOCALE;
1813             goto do_exactf_utf8;
1814         }
1815         fold_array = PL_fold_locale;
1816         folder = foldEQ_locale;
1817         goto do_exactf_non_utf8;
1818
1819     case EXACTFU_SS:
1820         if (is_utf8_pat) {
1821             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1822         }
1823         goto do_exactf_utf8;
1824
1825     case EXACTFLU8:
1826             if (! utf8_target) {    /* All code points in this node require
1827                                        UTF-8 to express.  */
1828                 break;
1829             }
1830             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1831                                              | FOLDEQ_S2_FOLDS_SANE;
1832             goto do_exactf_utf8;
1833
1834     case EXACTFU:
1835         if (is_utf8_pat || utf8_target) {
1836             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1837             goto do_exactf_utf8;
1838         }
1839
1840         /* Any 'ss' in the pattern should have been replaced by regcomp,
1841          * so we don't have to worry here about this single special case
1842          * in the Latin1 range */
1843         fold_array = PL_fold_latin1;
1844         folder = foldEQ_latin1;
1845
1846         /* FALLTHROUGH */
1847
1848       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1849                            are no glitches with fold-length differences
1850                            between the target string and pattern */
1851
1852         /* The idea in the non-utf8 EXACTF* cases is to first find the
1853          * first character of the EXACTF* node and then, if necessary,
1854          * case-insensitively compare the full text of the node.  c1 is the
1855          * first character.  c2 is its fold.  This logic will not work for
1856          * Unicode semantics and the german sharp ss, which hence should
1857          * not be compiled into a node that gets here. */
1858         pat_string = STRING(c);
1859         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1860
1861         /* We know that we have to match at least 'ln' bytes (which is the
1862          * same as characters, since not utf8).  If we have to match 3
1863          * characters, and there are only 2 availabe, we know without
1864          * trying that it will fail; so don't start a match past the
1865          * required minimum number from the far end */
1866         e = HOP3c(strend, -((SSize_t)ln), s);
1867
1868         if (reginfo->intuit && e < s) {
1869             e = s;                      /* Due to minlen logic of intuit() */
1870         }
1871
1872         c1 = *pat_string;
1873         c2 = fold_array[c1];
1874         if (c1 == c2) { /* If char and fold are the same */
1875             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1876         }
1877         else {
1878             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1879         }
1880         break;
1881
1882       do_exactf_utf8:
1883       {
1884         unsigned expansion;
1885
1886         /* If one of the operands is in utf8, we can't use the simpler folding
1887          * above, due to the fact that many different characters can have the
1888          * same fold, or portion of a fold, or different- length fold */
1889         pat_string = STRING(c);
1890         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1891         pat_end = pat_string + ln;
1892         lnc = is_utf8_pat       /* length to match in characters */
1893                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1894                 : ln;
1895
1896         /* We have 'lnc' characters to match in the pattern, but because of
1897          * multi-character folding, each character in the target can match
1898          * up to 3 characters (Unicode guarantees it will never exceed
1899          * this) if it is utf8-encoded; and up to 2 if not (based on the
1900          * fact that the Latin 1 folds are already determined, and the
1901          * only multi-char fold in that range is the sharp-s folding to
1902          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1903          * string character.  Adjust lnc accordingly, rounding up, so that
1904          * if we need to match at least 4+1/3 chars, that really is 5. */
1905         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1906         lnc = (lnc + expansion - 1) / expansion;
1907
1908         /* As in the non-UTF8 case, if we have to match 3 characters, and
1909          * only 2 are left, it's guaranteed to fail, so don't start a
1910          * match that would require us to go beyond the end of the string
1911          */
1912         e = HOP3c(strend, -((SSize_t)lnc), s);
1913
1914         if (reginfo->intuit && e < s) {
1915             e = s;                      /* Due to minlen logic of intuit() */
1916         }
1917
1918         /* XXX Note that we could recalculate e to stop the loop earlier,
1919          * as the worst case expansion above will rarely be met, and as we
1920          * go along we would usually find that e moves further to the left.
1921          * This would happen only after we reached the point in the loop
1922          * where if there were no expansion we should fail.  Unclear if
1923          * worth the expense */
1924
1925         while (s <= e) {
1926             char *my_strend= (char *)strend;
1927             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1928                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1929                 && (reginfo->intuit || regtry(reginfo, &s)) )
1930             {
1931                 goto got_it;
1932             }
1933             s += (utf8_target) ? UTF8SKIP(s) : 1;
1934         }
1935         break;
1936     }
1937
1938     case BOUNDL:
1939         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1940         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1941         break;
1942     case NBOUNDL:
1943         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1944         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1945         break;
1946     case BOUND:
1947         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1948         break;
1949     case BOUNDA:
1950         FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1951         break;
1952     case NBOUND:
1953         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1954         break;
1955     case NBOUNDA:
1956         FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1957         break;
1958     case BOUNDU:
1959         FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1960         break;
1961     case NBOUNDU:
1962         FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1963         break;
1964     case LNBREAK:
1965         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1966                         is_LNBREAK_latin1_safe(s, strend)
1967         );
1968         break;
1969
1970     /* The argument to all the POSIX node types is the class number to pass to
1971      * _generic_isCC() to build a mask for searching in PL_charclass[] */
1972
1973     case NPOSIXL:
1974         to_complement = 1;
1975         /* FALLTHROUGH */
1976
1977     case POSIXL:
1978         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1979         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1980                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1981         break;
1982
1983     case NPOSIXD:
1984         to_complement = 1;
1985         /* FALLTHROUGH */
1986
1987     case POSIXD:
1988         if (utf8_target) {
1989             goto posix_utf8;
1990         }
1991         goto posixa;
1992
1993     case NPOSIXA:
1994         if (utf8_target) {
1995             /* The complement of something that matches only ASCII matches all
1996              * non-ASCII, plus everything in ASCII that isn't in the class. */
1997             REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
1998                                       || ! _generic_isCC_A(*s, FLAGS(c)));
1999             break;
2000         }
2001
2002         to_complement = 1;
2003         /* FALLTHROUGH */
2004
2005     case POSIXA:
2006       posixa:
2007         /* Don't need to worry about utf8, as it can match only a single
2008          * byte invariant character. */
2009         REXEC_FBC_CLASS_SCAN(
2010                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2011         break;
2012
2013     case NPOSIXU:
2014         to_complement = 1;
2015         /* FALLTHROUGH */
2016
2017     case POSIXU:
2018         if (! utf8_target) {
2019             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2020                                                                     FLAGS(c))));
2021         }
2022         else {
2023
2024           posix_utf8:
2025             classnum = (_char_class_number) FLAGS(c);
2026             if (classnum < _FIRST_NON_SWASH_CC) {
2027                 while (s < strend) {
2028
2029                     /* We avoid loading in the swash as long as possible, but
2030                      * should we have to, we jump to a separate loop.  This
2031                      * extra 'if' statement is what keeps this code from being
2032                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2033                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2034                         goto found_above_latin1;
2035                     }
2036                     if ((UTF8_IS_INVARIANT(*s)
2037                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2038                                                                 classnum)))
2039                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
2040                             && to_complement ^ cBOOL(
2041                                 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2042                                                                       *(s + 1)),
2043                                               classnum))))
2044                     {
2045                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2046                             goto got_it;
2047                         else {
2048                             tmp = doevery;
2049                         }
2050                     }
2051                     else {
2052                         tmp = 1;
2053                     }
2054                     s += UTF8SKIP(s);
2055                 }
2056             }
2057             else switch (classnum) {    /* These classes are implemented as
2058                                            macros */
2059                 case _CC_ENUM_SPACE: /* XXX would require separate code if we
2060                                         revert the change of \v matching this */
2061                     /* FALLTHROUGH */
2062
2063                 case _CC_ENUM_PSXSPC:
2064                     REXEC_FBC_UTF8_CLASS_SCAN(
2065                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
2066                     break;
2067
2068                 case _CC_ENUM_BLANK:
2069                     REXEC_FBC_UTF8_CLASS_SCAN(
2070                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
2071                     break;
2072
2073                 case _CC_ENUM_XDIGIT:
2074                     REXEC_FBC_UTF8_CLASS_SCAN(
2075                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2076                     break;
2077
2078                 case _CC_ENUM_VERTSPACE:
2079                     REXEC_FBC_UTF8_CLASS_SCAN(
2080                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
2081                     break;
2082
2083                 case _CC_ENUM_CNTRL:
2084                     REXEC_FBC_UTF8_CLASS_SCAN(
2085                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
2086                     break;
2087
2088                 default:
2089                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2090                     NOT_REACHED; /* NOTREACHED */
2091             }
2092         }
2093         break;
2094
2095       found_above_latin1:   /* Here we have to load a swash to get the result
2096                                for the current code point */
2097         if (! PL_utf8_swash_ptrs[classnum]) {
2098             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2099             PL_utf8_swash_ptrs[classnum] =
2100                     _core_swash_init("utf8",
2101                                      "",
2102                                      &PL_sv_undef, 1, 0,
2103                                      PL_XPosix_ptrs[classnum], &flags);
2104         }
2105
2106         /* This is a copy of the loop above for swash classes, though using the
2107          * FBC macro instead of being expanded out.  Since we've loaded the
2108          * swash, we don't have to check for that each time through the loop */
2109         REXEC_FBC_UTF8_CLASS_SCAN(
2110                 to_complement ^ cBOOL(_generic_utf8(
2111                                       classnum,
2112                                       s,
2113                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2114                                                   (U8 *) s, TRUE))));
2115         break;
2116
2117     case AHOCORASICKC:
2118     case AHOCORASICK:
2119         {
2120             DECL_TRIE_TYPE(c);
2121             /* what trie are we using right now */
2122             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2123             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2124             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2125
2126             const char *last_start = strend - trie->minlen;
2127 #ifdef DEBUGGING
2128             const char *real_start = s;
2129 #endif
2130             STRLEN maxlen = trie->maxlen;
2131             SV *sv_points;
2132             U8 **points; /* map of where we were in the input string
2133                             when reading a given char. For ASCII this
2134                             is unnecessary overhead as the relationship
2135                             is always 1:1, but for Unicode, especially
2136                             case folded Unicode this is not true. */
2137             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2138             U8 *bitmap=NULL;
2139
2140
2141             GET_RE_DEBUG_FLAGS_DECL;
2142
2143             /* We can't just allocate points here. We need to wrap it in
2144              * an SV so it gets freed properly if there is a croak while
2145              * running the match */
2146             ENTER;
2147             SAVETMPS;
2148             sv_points=newSV(maxlen * sizeof(U8 *));
2149             SvCUR_set(sv_points,
2150                 maxlen * sizeof(U8 *));
2151             SvPOK_on(sv_points);
2152             sv_2mortal(sv_points);
2153             points=(U8**)SvPV_nolen(sv_points );
2154             if ( trie_type != trie_utf8_fold
2155                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2156             {
2157                 if (trie->bitmap)
2158                     bitmap=(U8*)trie->bitmap;
2159                 else
2160                     bitmap=(U8*)ANYOF_BITMAP(c);
2161             }
2162             /* this is the Aho-Corasick algorithm modified a touch
2163                to include special handling for long "unknown char" sequences.
2164                The basic idea being that we use AC as long as we are dealing
2165                with a possible matching char, when we encounter an unknown char
2166                (and we have not encountered an accepting state) we scan forward
2167                until we find a legal starting char.
2168                AC matching is basically that of trie matching, except that when
2169                we encounter a failing transition, we fall back to the current
2170                states "fail state", and try the current char again, a process
2171                we repeat until we reach the root state, state 1, or a legal
2172                transition. If we fail on the root state then we can either
2173                terminate if we have reached an accepting state previously, or
2174                restart the entire process from the beginning if we have not.
2175
2176              */
2177             while (s <= last_start) {
2178                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2179                 U8 *uc = (U8*)s;
2180                 U16 charid = 0;
2181                 U32 base = 1;
2182                 U32 state = 1;
2183                 UV uvc = 0;
2184                 STRLEN len = 0;
2185                 STRLEN foldlen = 0;
2186                 U8 *uscan = (U8*)NULL;
2187                 U8 *leftmost = NULL;
2188 #ifdef DEBUGGING
2189                 U32 accepted_word= 0;
2190 #endif
2191                 U32 pointpos = 0;
2192
2193                 while ( state && uc <= (U8*)strend ) {
2194                     int failed=0;
2195                     U32 word = aho->states[ state ].wordnum;
2196
2197                     if( state==1 ) {
2198                         if ( bitmap ) {
2199                             DEBUG_TRIE_EXECUTE_r(
2200                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2201                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2202                                         (char *)uc, utf8_target );
2203                                     PerlIO_printf( Perl_debug_log,
2204                                         " Scanning for legal start char...\n");
2205                                 }
2206                             );
2207                             if (utf8_target) {
2208                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2209                                     uc += UTF8SKIP(uc);
2210                                 }
2211                             } else {
2212                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2213                                     uc++;
2214                                 }
2215                             }
2216                             s= (char *)uc;
2217                         }
2218                         if (uc >(U8*)last_start) break;
2219                     }
2220
2221                     if ( word ) {
2222                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2223                         if (!leftmost || lpos < leftmost) {
2224                             DEBUG_r(accepted_word=word);
2225                             leftmost= lpos;
2226                         }
2227                         if (base==0) break;
2228
2229                     }
2230                     points[pointpos++ % maxlen]= uc;
2231                     if (foldlen || uc < (U8*)strend) {
2232                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2233                                          widecharmap, uc,
2234                                          uscan, len, uvc, charid, foldlen,
2235                                          foldbuf, uniflags);
2236                         DEBUG_TRIE_EXECUTE_r({
2237                             dump_exec_pos( (char *)uc, c, strend,
2238                                         real_start, s, utf8_target);
2239                             PerlIO_printf(Perl_debug_log,
2240                                 " Charid:%3u CP:%4"UVxf" ",
2241                                  charid, uvc);
2242                         });
2243                     }
2244                     else {
2245                         len = 0;
2246                         charid = 0;
2247                     }
2248
2249
2250                     do {
2251 #ifdef DEBUGGING
2252                         word = aho->states[ state ].wordnum;
2253 #endif
2254                         base = aho->states[ state ].trans.base;
2255
2256                         DEBUG_TRIE_EXECUTE_r({
2257                             if (failed)
2258                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2259                                     s,   utf8_target );
2260                             PerlIO_printf( Perl_debug_log,
2261                                 "%sState: %4"UVxf", word=%"UVxf,
2262                                 failed ? " Fail transition to " : "",
2263                                 (UV)state, (UV)word);
2264                         });
2265                         if ( base ) {
2266                             U32 tmp;
2267                             I32 offset;
2268                             if (charid &&
2269                                  ( ((offset = base + charid
2270                                     - 1 - trie->uniquecharcount)) >= 0)
2271                                  && ((U32)offset < trie->lasttrans)
2272                                  && trie->trans[offset].check == state
2273                                  && (tmp=trie->trans[offset].next))
2274                             {
2275                                 DEBUG_TRIE_EXECUTE_r(
2276                                     PerlIO_printf( Perl_debug_log," - legal\n"));
2277                                 state = tmp;
2278                                 break;
2279                             }
2280                             else {
2281                                 DEBUG_TRIE_EXECUTE_r(
2282                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2283                                 failed = 1;
2284                                 state = aho->fail[state];
2285                             }
2286                         }
2287                         else {
2288                             /* we must be accepting here */
2289                             DEBUG_TRIE_EXECUTE_r(
2290                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2291                             failed = 1;
2292                             break;
2293                         }
2294                     } while(state);
2295                     uc += len;
2296                     if (failed) {
2297                         if (leftmost)
2298                             break;
2299                         if (!state) state = 1;
2300                     }
2301                 }
2302                 if ( aho->states[ state ].wordnum ) {
2303                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2304                     if (!leftmost || lpos < leftmost) {
2305                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2306                         leftmost = lpos;
2307                     }
2308                 }
2309                 if (leftmost) {
2310                     s = (char*)leftmost;
2311                     DEBUG_TRIE_EXECUTE_r({
2312                         PerlIO_printf(
2313                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2314                             (UV)accepted_word, (IV)(s - real_start)
2315                         );
2316                     });
2317                     if (reginfo->intuit || regtry(reginfo, &s)) {
2318                         FREETMPS;
2319                         LEAVE;
2320                         goto got_it;
2321                     }
2322                     s = HOPc(s,1);
2323                     DEBUG_TRIE_EXECUTE_r({
2324                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2325                     });
2326                 } else {
2327                     DEBUG_TRIE_EXECUTE_r(
2328                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2329                     break;
2330                 }
2331             }
2332             FREETMPS;
2333             LEAVE;
2334         }
2335         break;
2336     default:
2337         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2338     }
2339     return 0;
2340   got_it:
2341     return s;
2342 }
2343
2344 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2345  * flags have same meanings as with regexec_flags() */
2346
2347 static void
2348 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2349                             char *strbeg,
2350                             char *strend,
2351                             SV *sv,
2352                             U32 flags,
2353                             bool utf8_target)
2354 {
2355     struct regexp *const prog = ReANY(rx);
2356
2357     if (flags & REXEC_COPY_STR) {
2358 #ifdef PERL_ANY_COW
2359         if (SvCANCOW(sv)) {
2360             if (DEBUG_C_TEST) {
2361                 PerlIO_printf(Perl_debug_log,
2362                               "Copy on write: regexp capture, type %d\n",
2363                               (int) SvTYPE(sv));
2364             }
2365             /* Create a new COW SV to share the match string and store
2366              * in saved_copy, unless the current COW SV in saved_copy
2367              * is valid and suitable for our purpose */
2368             if ((   prog->saved_copy
2369                  && SvIsCOW(prog->saved_copy)
2370                  && SvPOKp(prog->saved_copy)
2371                  && SvIsCOW(sv)
2372                  && SvPOKp(sv)
2373                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2374             {
2375                 /* just reuse saved_copy SV */
2376                 if (RXp_MATCH_COPIED(prog)) {
2377                     Safefree(prog->subbeg);
2378                     RXp_MATCH_COPIED_off(prog);
2379                 }
2380             }
2381             else {
2382                 /* create new COW SV to share string */
2383                 RX_MATCH_COPY_FREE(rx);
2384                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2385             }
2386             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2387             assert (SvPOKp(prog->saved_copy));
2388             prog->sublen  = strend - strbeg;
2389             prog->suboffset = 0;
2390             prog->subcoffset = 0;
2391         } else
2392 #endif
2393         {
2394             SSize_t min = 0;
2395             SSize_t max = strend - strbeg;
2396             SSize_t sublen;
2397
2398             if (    (flags & REXEC_COPY_SKIP_POST)
2399                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2400                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2401             ) { /* don't copy $' part of string */
2402                 U32 n = 0;
2403                 max = -1;
2404                 /* calculate the right-most part of the string covered
2405                  * by a capture. Due to look-ahead, this may be to
2406                  * the right of $&, so we have to scan all captures */
2407                 while (n <= prog->lastparen) {
2408                     if (prog->offs[n].end > max)
2409                         max = prog->offs[n].end;
2410                     n++;
2411                 }
2412                 if (max == -1)
2413                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2414                             ? prog->offs[0].start
2415                             : 0;
2416                 assert(max >= 0 && max <= strend - strbeg);
2417             }
2418
2419             if (    (flags & REXEC_COPY_SKIP_PRE)
2420                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2421                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2422             ) { /* don't copy $` part of string */
2423                 U32 n = 0;
2424                 min = max;
2425                 /* calculate the left-most part of the string covered
2426                  * by a capture. Due to look-behind, this may be to
2427                  * the left of $&, so we have to scan all captures */
2428                 while (min && n <= prog->lastparen) {
2429                     if (   prog->offs[n].start != -1
2430                         && prog->offs[n].start < min)
2431                     {
2432                         min = prog->offs[n].start;
2433                     }
2434                     n++;
2435                 }
2436                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2437                     && min >  prog->offs[0].end
2438                 )
2439                     min = prog->offs[0].end;
2440
2441             }
2442
2443             assert(min >= 0 && min <= max && min <= strend - strbeg);
2444             sublen = max - min;
2445
2446             if (RX_MATCH_COPIED(rx)) {
2447                 if (sublen > prog->sublen)
2448                     prog->subbeg =
2449                             (char*)saferealloc(prog->subbeg, sublen+1);
2450             }
2451             else
2452                 prog->subbeg = (char*)safemalloc(sublen+1);
2453             Copy(strbeg + min, prog->subbeg, sublen, char);
2454             prog->subbeg[sublen] = '\0';
2455             prog->suboffset = min;
2456             prog->sublen = sublen;
2457             RX_MATCH_COPIED_on(rx);
2458         }
2459         prog->subcoffset = prog->suboffset;
2460         if (prog->suboffset && utf8_target) {
2461             /* Convert byte offset to chars.
2462              * XXX ideally should only compute this if @-/@+
2463              * has been seen, a la PL_sawampersand ??? */
2464
2465             /* If there's a direct correspondence between the
2466              * string which we're matching and the original SV,
2467              * then we can use the utf8 len cache associated with
2468              * the SV. In particular, it means that under //g,
2469              * sv_pos_b2u() will use the previously cached
2470              * position to speed up working out the new length of
2471              * subcoffset, rather than counting from the start of
2472              * the string each time. This stops
2473              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2474              * from going quadratic */
2475             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2476                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2477                                                 SV_GMAGIC|SV_CONST_RETURN);
2478             else
2479                 prog->subcoffset = utf8_length((U8*)strbeg,
2480                                     (U8*)(strbeg+prog->suboffset));
2481         }
2482     }
2483     else {
2484         RX_MATCH_COPY_FREE(rx);
2485         prog->subbeg = strbeg;
2486         prog->suboffset = 0;
2487         prog->subcoffset = 0;
2488         prog->sublen = strend - strbeg;
2489     }
2490 }
2491
2492
2493
2494
2495 /*
2496  - regexec_flags - match a regexp against a string
2497  */
2498 I32
2499 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2500               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2501 /* stringarg: the point in the string at which to begin matching */
2502 /* strend:    pointer to null at end of string */
2503 /* strbeg:    real beginning of string */
2504 /* minend:    end of match must be >= minend bytes after stringarg. */
2505 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2506  *            itself is accessed via the pointers above */
2507 /* data:      May be used for some additional optimizations.
2508               Currently unused. */
2509 /* flags:     For optimizations. See REXEC_* in regexp.h */
2510
2511 {
2512     struct regexp *const prog = ReANY(rx);
2513     char *s;
2514     regnode *c;
2515     char *startpos;
2516     SSize_t minlen;             /* must match at least this many chars */
2517     SSize_t dontbother = 0;     /* how many characters not to try at end */
2518     const bool utf8_target = cBOOL(DO_UTF8(sv));
2519     I32 multiline;
2520     RXi_GET_DECL(prog,progi);
2521     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2522     regmatch_info *const reginfo = &reginfo_buf;
2523     regexp_paren_pair *swap = NULL;
2524     I32 oldsave;
2525     GET_RE_DEBUG_FLAGS_DECL;
2526
2527     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2528     PERL_UNUSED_ARG(data);
2529
2530     /* Be paranoid... */
2531     if (prog == NULL || stringarg == NULL) {
2532         Perl_croak(aTHX_ "NULL regexp parameter");
2533     }
2534
2535     DEBUG_EXECUTE_r(
2536         debug_start_match(rx, utf8_target, stringarg, strend,
2537         "Matching");
2538     );
2539
2540     startpos = stringarg;
2541
2542     if (prog->intflags & PREGf_GPOS_SEEN) {
2543         MAGIC *mg;
2544
2545         /* set reginfo->ganch, the position where \G can match */
2546
2547         reginfo->ganch =
2548             (flags & REXEC_IGNOREPOS)
2549             ? stringarg /* use start pos rather than pos() */
2550             : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2551               /* Defined pos(): */
2552             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2553             : strbeg; /* pos() not defined; use start of string */
2554
2555         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2556             "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2557
2558         /* in the presence of \G, we may need to start looking earlier in
2559          * the string than the suggested start point of stringarg:
2560          * if prog->gofs is set, then that's a known, fixed minimum
2561          * offset, such as
2562          * /..\G/:   gofs = 2
2563          * /ab|c\G/: gofs = 1
2564          * or if the minimum offset isn't known, then we have to go back
2565          * to the start of the string, e.g. /w+\G/
2566          */
2567
2568         if (prog->intflags & PREGf_ANCH_GPOS) {
2569             startpos  = reginfo->ganch - prog->gofs;
2570             if (startpos <
2571                 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2572             {
2573                 DEBUG_r(PerlIO_printf(Perl_debug_log,
2574                         "fail: ganch-gofs before earliest possible start\n"));
2575                 return 0;
2576             }
2577         }
2578         else if (prog->gofs) {
2579             if (startpos - prog->gofs < strbeg)
2580                 startpos = strbeg;
2581             else
2582                 startpos -= prog->gofs;
2583         }
2584         else if (prog->intflags & PREGf_GPOS_FLOAT)
2585             startpos = strbeg;
2586     }
2587
2588     minlen = prog->minlen;
2589     if ((startpos + minlen) > strend || startpos < strbeg) {
2590         DEBUG_r(PerlIO_printf(Perl_debug_log,
2591                     "Regex match can't succeed, so not even tried\n"));
2592         return 0;
2593     }
2594
2595     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2596      * which will call destuctors to reset PL_regmatch_state, free higher
2597      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2598      * regmatch_info_aux_eval */
2599
2600     oldsave = PL_savestack_ix;
2601
2602     s = startpos;
2603
2604     if ((prog->extflags & RXf_USE_INTUIT)
2605         && !(flags & REXEC_CHECKED))
2606     {
2607         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2608                                     flags, NULL);
2609         if (!s)
2610             return 0;
2611
2612         if (prog->extflags & RXf_CHECK_ALL) {
2613             /* we can match based purely on the result of INTUIT.
2614              * Set up captures etc just for $& and $-[0]
2615              * (an intuit-only match wont have $1,$2,..) */
2616             assert(!prog->nparens);
2617
2618             /* s/// doesn't like it if $& is earlier than where we asked it to
2619              * start searching (which can happen on something like /.\G/) */
2620             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2621                     && (s < stringarg))
2622             {
2623                 /* this should only be possible under \G */
2624                 assert(prog->intflags & PREGf_GPOS_SEEN);
2625                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2626                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2627                 goto phooey;
2628             }
2629
2630             /* match via INTUIT shouldn't have any captures.
2631              * Let @-, @+, $^N know */
2632             prog->lastparen = prog->lastcloseparen = 0;
2633             RX_MATCH_UTF8_set(rx, utf8_target);
2634             prog->offs[0].start = s - strbeg;
2635             prog->offs[0].end = utf8_target
2636                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2637                 : s - strbeg + prog->minlenret;
2638             if ( !(flags & REXEC_NOT_FIRST) )
2639                 S_reg_set_capture_string(aTHX_ rx,
2640                                         strbeg, strend,
2641                                         sv, flags, utf8_target);
2642
2643             return 1;
2644         }
2645     }
2646
2647     multiline = prog->extflags & RXf_PMf_MULTILINE;
2648
2649     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2650         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2651                               "String too short [regexec_flags]...\n"));
2652         goto phooey;
2653     }
2654
2655     /* Check validity of program. */
2656     if (UCHARAT(progi->program) != REG_MAGIC) {
2657         Perl_croak(aTHX_ "corrupted regexp program");
2658     }
2659
2660     RX_MATCH_TAINTED_off(rx);
2661     RX_MATCH_UTF8_set(rx, utf8_target);
2662
2663     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2664     reginfo->intuit = 0;
2665     reginfo->is_utf8_target = cBOOL(utf8_target);
2666     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2667     reginfo->warned = FALSE;
2668     reginfo->strbeg  = strbeg;
2669     reginfo->sv = sv;
2670     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2671     reginfo->strend = strend;
2672     /* see how far we have to get to not match where we matched before */
2673     reginfo->till = stringarg + minend;
2674
2675     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2676         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2677            S_cleanup_regmatch_info_aux has executed (registered by
2678            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2679            magic belonging to this SV.
2680            Not newSVsv, either, as it does not COW.
2681         */
2682         reginfo->sv = newSV(0);
2683         SvSetSV_nosteal(reginfo->sv, sv);
2684         SAVEFREESV(reginfo->sv);
2685     }
2686
2687     /* reserve next 2 or 3 slots in PL_regmatch_state:
2688      * slot N+0: may currently be in use: skip it
2689      * slot N+1: use for regmatch_info_aux struct
2690      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2691      * slot N+3: ready for use by regmatch()
2692      */
2693
2694     {
2695         regmatch_state *old_regmatch_state;
2696         regmatch_slab  *old_regmatch_slab;
2697         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2698
2699         /* on first ever match, allocate first slab */
2700         if (!PL_regmatch_slab) {
2701             Newx(PL_regmatch_slab, 1, regmatch_slab);
2702             PL_regmatch_slab->prev = NULL;
2703             PL_regmatch_slab->next = NULL;
2704             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2705         }
2706
2707         old_regmatch_state = PL_regmatch_state;
2708         old_regmatch_slab  = PL_regmatch_slab;
2709
2710         for (i=0; i <= max; i++) {
2711             if (i == 1)
2712                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2713             else if (i ==2)
2714                 reginfo->info_aux_eval =
2715                 reginfo->info_aux->info_aux_eval =
2716                             &(PL_regmatch_state->u.info_aux_eval);
2717
2718             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2719                 PL_regmatch_state = S_push_slab(aTHX);
2720         }
2721
2722         /* note initial PL_regmatch_state position; at end of match we'll
2723          * pop back to there and free any higher slabs */
2724
2725         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2726         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2727         reginfo->info_aux->poscache = NULL;
2728
2729         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2730
2731         if ((prog->extflags & RXf_EVAL_SEEN))
2732             S_setup_eval_state(aTHX_ reginfo);
2733         else
2734             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2735     }
2736
2737     /* If there is a "must appear" string, look for it. */
2738
2739     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2740         /* We have to be careful. If the previous successful match
2741            was from this regex we don't want a subsequent partially
2742            successful match to clobber the old results.
2743            So when we detect this possibility we add a swap buffer
2744            to the re, and switch the buffer each match. If we fail,
2745            we switch it back; otherwise we leave it swapped.
2746         */
2747         swap = prog->offs;
2748         /* do we need a save destructor here for eval dies? */
2749         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2750         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2751             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2752             PTR2UV(prog),
2753             PTR2UV(swap),
2754             PTR2UV(prog->offs)
2755         ));
2756     }
2757
2758     /* Simplest case:  anchored match need be tried only once. */
2759     /*  [unless only anchor is MBOL - implying multiline is set] */
2760     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
2761         if (s == startpos && regtry(reginfo, &s))
2762             goto got_it;
2763         else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2764         {
2765             char *end;
2766
2767             if (minlen)
2768                 dontbother = minlen - 1;
2769             end = HOP3c(strend, -dontbother, strbeg) - 1;
2770             /* for multiline we only have to try after newlines */
2771             if (prog->check_substr || prog->check_utf8) {
2772                 /* because of the goto we can not easily reuse the macros for bifurcating the
2773                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2774                 if (utf8_target) {
2775                     if (s == startpos)
2776                         goto after_try_utf8;
2777                     while (1) {
2778                         if (regtry(reginfo, &s)) {
2779                             goto got_it;
2780                         }
2781                       after_try_utf8:
2782                         if (s > end) {
2783                             goto phooey;
2784                         }
2785                         if (prog->extflags & RXf_USE_INTUIT) {
2786                             s = re_intuit_start(rx, sv, strbeg,
2787                                     s + UTF8SKIP(s), strend, flags, NULL);
2788                             if (!s) {
2789                                 goto phooey;
2790                             }
2791                         }
2792                         else {
2793                             s += UTF8SKIP(s);
2794                         }
2795                     }
2796                 } /* end search for check string in unicode */
2797                 else {
2798                     if (s == startpos) {
2799                         goto after_try_latin;
2800                     }
2801                     while (1) {
2802                         if (regtry(reginfo, &s)) {
2803                             goto got_it;
2804                         }
2805                       after_try_latin:
2806                         if (s > end) {
2807                             goto phooey;
2808                         }
2809                         if (prog->extflags & RXf_USE_INTUIT) {
2810                             s = re_intuit_start(rx, sv, strbeg,
2811                                         s + 1, strend, flags, NULL);
2812                             if (!s) {
2813                                 goto phooey;
2814                             }
2815                         }
2816                         else {
2817                             s++;
2818                         }
2819                     }
2820                 } /* end search for check string in latin*/
2821             } /* end search for check string */
2822             else { /* search for newline */
2823                 if (s > startpos) {
2824                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2825                     s--;
2826                 }
2827                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2828                 while (s <= end) { /* note it could be possible to match at the end of the string */
2829                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2830                         if (regtry(reginfo, &s))
2831                             goto got_it;
2832                     }
2833                 }
2834             } /* end search for newline */
2835         } /* end anchored/multiline check string search */
2836         goto phooey;
2837     } else if (prog->intflags & PREGf_ANCH_GPOS)
2838     {
2839         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2840         assert(prog->intflags & PREGf_GPOS_SEEN);
2841         /* For anchored \G, the only position it can match from is
2842          * (ganch-gofs); we already set startpos to this above; if intuit
2843          * moved us on from there, we can't possibly succeed */
2844         assert(startpos == reginfo->ganch - prog->gofs);
2845         if (s == startpos && regtry(reginfo, &s))
2846             goto got_it;
2847         goto phooey;
2848     }
2849
2850     /* Messy cases:  unanchored match. */
2851     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2852         /* we have /x+whatever/ */
2853         /* it must be a one character string (XXXX Except is_utf8_pat?) */
2854         char ch;
2855 #ifdef DEBUGGING
2856         int did_match = 0;
2857 #endif
2858         if (utf8_target) {
2859             if (! prog->anchored_utf8) {
2860                 to_utf8_substr(prog);
2861             }
2862             ch = SvPVX_const(prog->anchored_utf8)[0];
2863             REXEC_FBC_SCAN(
2864                 if (*s == ch) {
2865                     DEBUG_EXECUTE_r( did_match = 1 );
2866                     if (regtry(reginfo, &s)) goto got_it;
2867                     s += UTF8SKIP(s);
2868                     while (s < strend && *s == ch)
2869                         s += UTF8SKIP(s);
2870                 }
2871             );
2872
2873         }
2874         else {
2875             if (! prog->anchored_substr) {
2876                 if (! to_byte_substr(prog)) {
2877                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2878                 }
2879             }
2880             ch = SvPVX_const(prog->anchored_substr)[0];
2881             REXEC_FBC_SCAN(
2882                 if (*s == ch) {
2883                     DEBUG_EXECUTE_r( did_match = 1 );
2884                     if (regtry(reginfo, &s)) goto got_it;
2885                     s++;
2886                     while (s < strend && *s == ch)
2887                         s++;
2888                 }
2889             );
2890         }
2891         DEBUG_EXECUTE_r(if (!did_match)
2892                 PerlIO_printf(Perl_debug_log,
2893                                   "Did not find anchored character...\n")
2894                );
2895     }
2896     else if (prog->anchored_substr != NULL
2897               || prog->anchored_utf8 != NULL
2898               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2899                   && prog->float_max_offset < strend - s)) {
2900         SV *must;
2901         SSize_t back_max;
2902         SSize_t back_min;
2903         char *last;
2904         char *last1;            /* Last position checked before */
2905 #ifdef DEBUGGING
2906         int did_match = 0;
2907 #endif
2908         if (prog->anchored_substr || prog->anchored_utf8) {
2909             if (utf8_target) {
2910                 if (! prog->anchored_utf8) {
2911                     to_utf8_substr(prog);
2912                 }
2913                 must = prog->anchored_utf8;
2914             }
2915             else {
2916                 if (! prog->anchored_substr) {
2917                     if (! to_byte_substr(prog)) {
2918                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2919                     }
2920                 }
2921                 must = prog->anchored_substr;
2922             }
2923             back_max = back_min = prog->anchored_offset;
2924         } else {
2925             if (utf8_target) {
2926                 if (! prog->float_utf8) {
2927                     to_utf8_substr(prog);
2928                 }
2929                 must = prog->float_utf8;
2930             }
2931             else {
2932                 if (! prog->float_substr) {
2933                     if (! to_byte_substr(prog)) {
2934                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2935                     }
2936                 }
2937                 must = prog->float_substr;
2938             }
2939             back_max = prog->float_max_offset;
2940             back_min = prog->float_min_offset;
2941         }
2942
2943         if (back_min<0) {
2944             last = strend;
2945         } else {
2946             last = HOP3c(strend,        /* Cannot start after this */
2947                   -(SSize_t)(CHR_SVLEN(must)
2948                          - (SvTAIL(must) != 0) + back_min), strbeg);
2949         }
2950         if (s > reginfo->strbeg)
2951             last1 = HOPc(s, -1);
2952         else
2953             last1 = s - 1;      /* bogus */
2954
2955         /* XXXX check_substr already used to find "s", can optimize if
2956            check_substr==must. */
2957         dontbother = 0;
2958         strend = HOPc(strend, -dontbother);
2959         while ( (s <= last) &&
2960                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
2961                                   (unsigned char*)strend, must,
2962                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2963             DEBUG_EXECUTE_r( did_match = 1 );
2964             if (HOPc(s, -back_max) > last1) {
2965                 last1 = HOPc(s, -back_min);
2966                 s = HOPc(s, -back_max);
2967             }
2968             else {
2969                 char * const t = (last1 >= reginfo->strbeg)
2970                                     ? HOPc(last1, 1) : last1 + 1;
2971
2972                 last1 = HOPc(s, -back_min);
2973                 s = t;
2974             }
2975             if (utf8_target) {
2976                 while (s <= last1) {
2977                     if (regtry(reginfo, &s))
2978                         goto got_it;
2979                     if (s >= last1) {
2980                         s++; /* to break out of outer loop */
2981                         break;
2982                     }
2983                     s += UTF8SKIP(s);
2984                 }
2985             }
2986             else {
2987                 while (s <= last1) {
2988                     if (regtry(reginfo, &s))
2989                         goto got_it;
2990                     s++;
2991                 }
2992             }
2993         }
2994         DEBUG_EXECUTE_r(if (!did_match) {
2995             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2996                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2997             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2998                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2999                                ? "anchored" : "floating"),
3000                 quoted, RE_SV_TAIL(must));
3001         });
3002         goto phooey;
3003     }
3004     else if ( (c = progi->regstclass) ) {
3005         if (minlen) {
3006             const OPCODE op = OP(progi->regstclass);
3007             /* don't bother with what can't match */
3008             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
3009                 strend = HOPc(strend, -(minlen - 1));
3010         }
3011         DEBUG_EXECUTE_r({
3012             SV * const prop = sv_newmortal();
3013             regprop(prog, prop, c, reginfo, NULL);
3014             {
3015                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3016                     s,strend-s,60);
3017                 PerlIO_printf(Perl_debug_log,
3018                     "Matching stclass %.*s against %s (%d bytes)\n",
3019                     (int)SvCUR(prop), SvPVX_const(prop),
3020                      quoted, (int)(strend - s));
3021             }
3022         });
3023         if (find_byclass(prog, c, s, strend, reginfo))
3024             goto got_it;
3025         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
3026     }
3027     else {
3028         dontbother = 0;
3029         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3030             /* Trim the end. */
3031             char *last= NULL;
3032             SV* float_real;
3033             STRLEN len;
3034             const char *little;
3035
3036             if (utf8_target) {
3037                 if (! prog->float_utf8) {
3038                     to_utf8_substr(prog);
3039                 }
3040                 float_real = prog->float_utf8;
3041             }
3042             else {
3043                 if (! prog->float_substr) {
3044                     if (! to_byte_substr(prog)) {
3045                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3046                     }
3047                 }
3048                 float_real = prog->float_substr;
3049             }
3050
3051             little = SvPV_const(float_real, len);
3052             if (SvTAIL(float_real)) {
3053                     /* This means that float_real contains an artificial \n on
3054                      * the end due to the presence of something like this:
3055                      * /foo$/ where we can match both "foo" and "foo\n" at the
3056                      * end of the string.  So we have to compare the end of the
3057                      * string first against the float_real without the \n and
3058                      * then against the full float_real with the string.  We
3059                      * have to watch out for cases where the string might be
3060                      * smaller than the float_real or the float_real without
3061                      * the \n. */
3062                     char *checkpos= strend - len;
3063                     DEBUG_OPTIMISE_r(
3064                         PerlIO_printf(Perl_debug_log,
3065                             "%sChecking for float_real.%s\n",
3066                             PL_colors[4], PL_colors[5]));
3067                     if (checkpos + 1 < strbeg) {
3068                         /* can't match, even if we remove the trailing \n
3069                          * string is too short to match */
3070                         DEBUG_EXECUTE_r(
3071                             PerlIO_printf(Perl_debug_log,
3072                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3073                                 PL_colors[4], PL_colors[5]));
3074                         goto phooey;
3075                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3076                         /* can match, the end of the string matches without the
3077                          * "\n" */
3078                         last = checkpos + 1;
3079                     } else if (checkpos < strbeg) {
3080                         /* cant match, string is too short when the "\n" is
3081                          * included */
3082                         DEBUG_EXECUTE_r(
3083                             PerlIO_printf(Perl_debug_log,
3084                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3085                                 PL_colors[4], PL_colors[5]));
3086                         goto phooey;
3087                     } else if (!multiline) {
3088                         /* non multiline match, so compare with the "\n" at the
3089                          * end of the string */
3090                         if (memEQ(checkpos, little, len)) {
3091                             last= checkpos;
3092                         } else {
3093                             DEBUG_EXECUTE_r(
3094                                 PerlIO_printf(Perl_debug_log,
3095                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3096                                     PL_colors[4], PL_colors[5]));
3097                             goto phooey;
3098                         }
3099                     } else {
3100                         /* multiline match, so we have to search for a place
3101                          * where the full string is located */
3102                         goto find_last;
3103                     }
3104             } else {
3105                   find_last:
3106                     if (len)
3107                         last = rninstr(s, strend, little, little + len);
3108                     else
3109                         last = strend;  /* matching "$" */
3110             }
3111             if (!last) {
3112                 /* at one point this block contained a comment which was
3113                  * probably incorrect, which said that this was a "should not
3114                  * happen" case.  Even if it was true when it was written I am
3115                  * pretty sure it is not anymore, so I have removed the comment
3116                  * and replaced it with this one. Yves */
3117                 DEBUG_EXECUTE_r(
3118                     PerlIO_printf(Perl_debug_log,
3119                         "%sString does not contain required substring, cannot match.%s\n",
3120                         PL_colors[4], PL_colors[5]
3121                     ));
3122                 goto phooey;
3123             }
3124             dontbother = strend - last + prog->float_min_offset;
3125         }
3126         if (minlen && (dontbother < minlen))
3127             dontbother = minlen - 1;
3128         strend -= dontbother;              /* this one's always in bytes! */
3129         /* We don't know much -- general case. */
3130         if (utf8_target) {
3131             for (;;) {
3132                 if (regtry(reginfo, &s))
3133                     goto got_it;
3134                 if (s >= strend)
3135                     break;
3136                 s += UTF8SKIP(s);
3137             };
3138         }
3139         else {
3140             do {
3141                 if (regtry(reginfo, &s))
3142                     goto got_it;
3143             } while (s++ < strend);
3144         }
3145     }
3146
3147     /* Failure. */
3148     goto phooey;
3149
3150   got_it:
3151     /* s/// doesn't like it if $& is earlier than where we asked it to
3152      * start searching (which can happen on something like /.\G/) */
3153     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3154             && (prog->offs[0].start < stringarg - strbeg))
3155     {
3156         /* this should only be possible under \G */
3157         assert(prog->intflags & PREGf_GPOS_SEEN);
3158         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3159             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3160         goto phooey;
3161     }
3162
3163     DEBUG_BUFFERS_r(
3164         if (swap)
3165             PerlIO_printf(Perl_debug_log,
3166                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3167                 PTR2UV(prog),
3168                 PTR2UV(swap)
3169             );
3170     );
3171     Safefree(swap);
3172
3173     /* clean up; this will trigger destructors that will free all slabs
3174      * above the current one, and cleanup the regmatch_info_aux
3175      * and regmatch_info_aux_eval sructs */
3176
3177     LEAVE_SCOPE(oldsave);
3178
3179     if (RXp_PAREN_NAMES(prog))
3180         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3181
3182     /* make sure $`, $&, $', and $digit will work later */
3183     if ( !(flags & REXEC_NOT_FIRST) )
3184         S_reg_set_capture_string(aTHX_ rx,
3185                                     strbeg, reginfo->strend,
3186                                     sv, flags, utf8_target);
3187
3188     return 1;
3189
3190   phooey:
3191     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3192                           PL_colors[4], PL_colors[5]));
3193
3194     /* clean up; this will trigger destructors that will free all slabs
3195      * above the current one, and cleanup the regmatch_info_aux
3196      * and regmatch_info_aux_eval sructs */
3197
3198     LEAVE_SCOPE(oldsave);
3199
3200     if (swap) {
3201         /* we failed :-( roll it back */
3202         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3203             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3204             PTR2UV(prog),
3205             PTR2UV(prog->offs),
3206             PTR2UV(swap)
3207         ));
3208         Safefree(prog->offs);
3209         prog->offs = swap;
3210     }
3211     return 0;
3212 }
3213
3214
3215 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3216  * Do inc before dec, in case old and new rex are the same */
3217 #define SET_reg_curpm(Re2)                          \
3218     if (reginfo->info_aux_eval) {                   \
3219         (void)ReREFCNT_inc(Re2);                    \
3220         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3221         PM_SETRE((PL_reg_curpm), (Re2));            \
3222     }
3223
3224
3225 /*
3226  - regtry - try match at specific point
3227  */
3228 STATIC I32                      /* 0 failure, 1 success */
3229 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3230 {
3231     CHECKPOINT lastcp;
3232     REGEXP *const rx = reginfo->prog;
3233     regexp *const prog = ReANY(rx);
3234     SSize_t result;
3235     RXi_GET_DECL(prog,progi);
3236     GET_RE_DEBUG_FLAGS_DECL;
3237
3238     PERL_ARGS_ASSERT_REGTRY;
3239
3240     reginfo->cutpoint=NULL;
3241
3242     prog->offs[0].start = *startposp - reginfo->strbeg;
3243     prog->lastparen = 0;
3244     prog->lastcloseparen = 0;
3245
3246     /* XXXX What this code is doing here?!!!  There should be no need
3247        to do this again and again, prog->lastparen should take care of
3248        this!  --ilya*/
3249
3250     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3251      * Actually, the code in regcppop() (which Ilya may be meaning by
3252      * prog->lastparen), is not needed at all by the test suite
3253      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3254      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3255      * Meanwhile, this code *is* needed for the
3256      * above-mentioned test suite tests to succeed.  The common theme
3257      * on those tests seems to be returning null fields from matches.
3258      * --jhi updated by dapm */
3259 #if 1
3260     if (prog->nparens) {
3261         regexp_paren_pair *pp = prog->offs;
3262         I32 i;
3263         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3264             ++pp;
3265             pp->start = -1;
3266             pp->end = -1;
3267         }
3268     }
3269 #endif
3270     REGCP_SET(lastcp);
3271     result = regmatch(reginfo, *startposp, progi->program + 1);
3272     if (result != -1) {
3273         prog->offs[0].end = result;
3274         return 1;
3275     }
3276     if (reginfo->cutpoint)
3277         *startposp= reginfo->cutpoint;
3278     REGCP_UNWIND(lastcp);
3279     return 0;
3280 }
3281
3282
3283 #define sayYES goto yes
3284 #define sayNO goto no
3285 #define sayNO_SILENT goto no_silent
3286
3287 /* we dont use STMT_START/END here because it leads to
3288    "unreachable code" warnings, which are bogus, but distracting. */
3289 #define CACHEsayNO \
3290     if (ST.cache_mask) \
3291        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3292     sayNO
3293
3294 /* this is used to determine how far from the left messages like
3295    'failed...' are printed. It should be set such that messages
3296    are inline with the regop output that created them.
3297 */
3298 #define REPORT_CODE_OFF 32
3299
3300
3301 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3302 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3303 #define CHRTEST_NOT_A_CP_1 -999
3304 #define CHRTEST_NOT_A_CP_2 -998
3305
3306 /* grab a new slab and return the first slot in it */
3307
3308 STATIC regmatch_state *
3309 S_push_slab(pTHX)
3310 {
3311 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3312     dMY_CXT;
3313 #endif
3314     regmatch_slab *s = PL_regmatch_slab->next;
3315     if (!s) {
3316         Newx(s, 1, regmatch_slab);
3317         s->prev = PL_regmatch_slab;
3318         s->next = NULL;
3319         PL_regmatch_slab->next = s;
3320     }
3321     PL_regmatch_slab = s;
3322     return SLAB_FIRST(s);
3323 }
3324
3325
3326 /* push a new state then goto it */
3327
3328 #define PUSH_STATE_GOTO(state, node, input) \
3329     pushinput = input; \
3330     scan = node; \
3331     st->resume_state = state; \
3332     goto push_state;
3333
3334 /* push a new state with success backtracking, then goto it */
3335
3336 #define PUSH_YES_STATE_GOTO(state, node, input) \
3337     pushinput = input; \
3338     scan = node; \
3339     st->resume_state = state; \
3340     goto push_yes_state;
3341
3342
3343
3344
3345 /*
3346
3347 regmatch() - main matching routine
3348
3349 This is basically one big switch statement in a loop. We execute an op,
3350 set 'next' to point the next op, and continue. If we come to a point which
3351 we may need to backtrack to on failure such as (A|B|C), we push a
3352 backtrack state onto the backtrack stack. On failure, we pop the top
3353 state, and re-enter the loop at the state indicated. If there are no more
3354 states to pop, we return failure.
3355
3356 Sometimes we also need to backtrack on success; for example /A+/, where
3357 after successfully matching one A, we need to go back and try to
3358 match another one; similarly for lookahead assertions: if the assertion
3359 completes successfully, we backtrack to the state just before the assertion
3360 and then carry on.  In these cases, the pushed state is marked as
3361 'backtrack on success too'. This marking is in fact done by a chain of
3362 pointers, each pointing to the previous 'yes' state. On success, we pop to
3363 the nearest yes state, discarding any intermediate failure-only states.
3364 Sometimes a yes state is pushed just to force some cleanup code to be
3365 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3366 it to free the inner regex.
3367
3368 Note that failure backtracking rewinds the cursor position, while
3369 success backtracking leaves it alone.
3370
3371 A pattern is complete when the END op is executed, while a subpattern
3372 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3373 ops trigger the "pop to last yes state if any, otherwise return true"
3374 behaviour.
3375
3376 A common convention in this function is to use A and B to refer to the two
3377 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3378 the subpattern to be matched possibly multiple times, while B is the entire
3379 rest of the pattern. Variable and state names reflect this convention.
3380
3381 The states in the main switch are the union of ops and failure/success of
3382 substates associated with with that op.  For example, IFMATCH is the op
3383 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3384 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3385 successfully matched A and IFMATCH_A_fail is a state saying that we have
3386 just failed to match A. Resume states always come in pairs. The backtrack
3387 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3388 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3389 on success or failure.
3390
3391 The struct that holds a backtracking state is actually a big union, with
3392 one variant for each major type of op. The variable st points to the
3393 top-most backtrack struct. To make the code clearer, within each
3394 block of code we #define ST to alias the relevant union.
3395
3396 Here's a concrete example of a (vastly oversimplified) IFMATCH
3397 implementation:
3398
3399     switch (state) {
3400     ....
3401
3402 #define ST st->u.ifmatch
3403
3404     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3405         ST.foo = ...; // some state we wish to save
3406         ...
3407         // push a yes backtrack state with a resume value of
3408         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3409         // first node of A:
3410         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3411         // NOTREACHED
3412
3413     case IFMATCH_A: // we have successfully executed A; now continue with B
3414         next = B;
3415         bar = ST.foo; // do something with the preserved value
3416         break;
3417
3418     case IFMATCH_A_fail: // A failed, so the assertion failed
3419         ...;   // do some housekeeping, then ...
3420         sayNO; // propagate the failure
3421
3422 #undef ST
3423
3424     ...
3425     }
3426
3427 For any old-timers reading this who are familiar with the old recursive
3428 approach, the code above is equivalent to:
3429
3430     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3431     {
3432         int foo = ...
3433         ...
3434         if (regmatch(A)) {
3435             next = B;
3436             bar = foo;
3437             break;
3438         }
3439         ...;   // do some housekeeping, then ...
3440         sayNO; // propagate the failure
3441     }
3442
3443 The topmost backtrack state, pointed to by st, is usually free. If you
3444 want to claim it, populate any ST.foo fields in it with values you wish to
3445 save, then do one of
3446
3447         PUSH_STATE_GOTO(resume_state, node, newinput);
3448         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3449
3450 which sets that backtrack state's resume value to 'resume_state', pushes a
3451 new free entry to the top of the backtrack stack, then goes to 'node'.
3452 On backtracking, the free slot is popped, and the saved state becomes the
3453 new free state. An ST.foo field in this new top state can be temporarily
3454 accessed to retrieve values, but once the main loop is re-entered, it
3455 becomes available for reuse.
3456
3457 Note that the depth of the backtrack stack constantly increases during the
3458 left-to-right execution of the pattern, rather than going up and down with
3459 the pattern nesting. For example the stack is at its maximum at Z at the
3460 end of the pattern, rather than at X in the following:
3461
3462     /(((X)+)+)+....(Y)+....Z/
3463
3464 The only exceptions to this are lookahead/behind assertions and the cut,
3465 (?>A), which pop all the backtrack states associated with A before
3466 continuing.
3467
3468 Backtrack state structs are allocated in slabs of about 4K in size.
3469 PL_regmatch_state and st always point to the currently active state,
3470 and PL_regmatch_slab points to the slab currently containing
3471 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3472 allocated, and is never freed until interpreter destruction. When the slab
3473 is full, a new one is allocated and chained to the end. At exit from
3474 regmatch(), slabs allocated since entry are freed.
3475
3476 */
3477
3478
3479 #define DEBUG_STATE_pp(pp)                                  \
3480     DEBUG_STATE_r({                                         \
3481         DUMP_EXEC_POS(locinput, scan, utf8_target);         \
3482         PerlIO_printf(Perl_debug_log,                       \
3483             "    %*s"pp" %s%s%s%s%s\n",                     \
3484             depth*2, "",                                    \
3485             PL_reg_name[st->resume_state],                  \
3486             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3487             ((st==yes_state) ? "Y" : ""),                   \
3488             ((st==mark_state) ? "M" : ""),                  \
3489             ((st==yes_state||st==mark_state) ? "]" : "")    \
3490         );                                                  \
3491     });
3492
3493
3494 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3495
3496 #ifdef DEBUGGING
3497
3498 STATIC void
3499 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3500     const char *start, const char *end, const char *blurb)
3501 {
3502     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3503
3504     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3505
3506     if (!PL_colorset)
3507             reginitcolors();
3508     {
3509         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3510             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3511
3512         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3513             start, end - start, 60);
3514
3515         PerlIO_printf(Perl_debug_log,
3516             "%s%s REx%s %s against %s\n",
3517                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3518
3519         if (utf8_target||utf8_pat)
3520             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3521                 utf8_pat ? "pattern" : "",
3522                 utf8_pat && utf8_target ? " and " : "",
3523                 utf8_target ? "string" : ""
3524             );
3525     }
3526 }
3527
3528 STATIC void
3529 S_dump_exec_pos(pTHX_ const char *locinput,
3530                       const regnode *scan,
3531                       const char *loc_regeol,
3532                       const char *loc_bostr,
3533                       const char *loc_reg_starttry,
3534                       const bool utf8_target)
3535 {
3536     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3537     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3538     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3539     /* The part of the string before starttry has one color
3540        (pref0_len chars), between starttry and current
3541        position another one (pref_len - pref0_len chars),
3542        after the current position the third one.
3543        We assume that pref0_len <= pref_len, otherwise we
3544        decrease pref0_len.  */
3545     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3546         ? (5 + taill) - l : locinput - loc_bostr;
3547     int pref0_len;
3548
3549     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3550
3551     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3552         pref_len++;
3553     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3554     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3555         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3556               ? (5 + taill) - pref_len : loc_regeol - locinput);
3557     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3558         l--;
3559     if (pref0_len < 0)
3560         pref0_len = 0;
3561     if (pref0_len > pref_len)
3562         pref0_len = pref_len;
3563     {
3564         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3565
3566         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3567             (locinput - pref_len),pref0_len, 60, 4, 5);
3568
3569         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3570                     (locinput - pref_len + pref0_len),
3571                     pref_len - pref0_len, 60, 2, 3);
3572
3573         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3574                     locinput, loc_regeol - locinput, 10, 0, 1);
3575
3576         const STRLEN tlen=len0+len1+len2;
3577         PerlIO_printf(Perl_debug_log,
3578                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3579                     (IV)(locinput - loc_bostr),
3580                     len0, s0,
3581                     len1, s1,
3582                     (docolor ? "" : "> <"),
3583                     len2, s2,
3584                     (int)(tlen > 19 ? 0 :  19 - tlen),
3585                     "");
3586     }
3587 }
3588
3589 #endif
3590
3591 /* reg_check_named_buff_matched()
3592  * Checks to see if a named buffer has matched. The data array of
3593  * buffer numbers corresponding to the buffer is expected to reside
3594  * in the regexp->data->data array in the slot stored in the ARG() of
3595  * node involved. Note that this routine doesn't actually care about the
3596  * name, that information is not preserved from compilation to execution.
3597  * Returns the index of the leftmost defined buffer with the given name
3598  * or 0 if non of the buffers matched.
3599  */
3600 STATIC I32
3601 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3602 {
3603     I32 n;
3604     RXi_GET_DECL(rex,rexi);
3605     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3606     I32 *nums=(I32*)SvPVX(sv_dat);
3607
3608     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3609
3610     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3611         if ((I32)rex->lastparen >= nums[n] &&
3612             rex->offs[nums[n]].end != -1)
3613         {
3614             return nums[n];
3615         }
3616     }
3617     return 0;
3618 }
3619
3620
3621 static bool
3622 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3623         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3624 {
3625     /* This function determines if there are one or two characters that match
3626      * the first character of the passed-in EXACTish node <text_node>, and if
3627      * so, returns them in the passed-in pointers.
3628      *
3629      * If it determines that no possible character in the target string can
3630      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3631      * the first character in <text_node> requires UTF-8 to represent, and the
3632      * target string isn't in UTF-8.)
3633      *
3634      * If there are more than two characters that could match the beginning of
3635      * <text_node>, or if more context is required to determine a match or not,
3636      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3637      *
3638      * The motiviation behind this function is to allow the caller to set up
3639      * tight loops for matching.  If <text_node> is of type EXACT, there is
3640      * only one possible character that can match its first character, and so
3641      * the situation is quite simple.  But things get much more complicated if
3642      * folding is involved.  It may be that the first character of an EXACTFish
3643      * node doesn't participate in any possible fold, e.g., punctuation, so it
3644      * can be matched only by itself.  The vast majority of characters that are
3645      * in folds match just two things, their lower and upper-case equivalents.
3646      * But not all are like that; some have multiple possible matches, or match
3647      * sequences of more than one character.  This function sorts all that out.
3648      *
3649      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3650      * loop of trying to match A*, we know we can't exit where the thing
3651      * following it isn't a B.  And something can't be a B unless it is the
3652      * beginning of B.  By putting a quick test for that beginning in a tight
3653      * loop, we can rule out things that can't possibly be B without having to
3654      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3655      * character, we can make a tight loop matching A*, using the outputs of
3656      * this function.
3657      *
3658      * If the target string to match isn't in UTF-8, and there aren't
3659      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3660      * the one or two possible octets (which are characters in this situation)
3661      * that can match.  In all cases, if there is only one character that can
3662      * match, *<c1p> and *<c2p> will be identical.
3663      *
3664      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3665      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3666      * can match the beginning of <text_node>.  They should be declared with at
3667      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3668      * undefined what these contain.)  If one or both of the buffers are
3669      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3670      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3671      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3672      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3673      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3674
3675     const bool utf8_target = reginfo->is_utf8_target;
3676
3677     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3678     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
3679     bool use_chrtest_void = FALSE;
3680     const bool is_utf8_pat = reginfo->is_utf8_pat;
3681
3682     /* Used when we have both utf8 input and utf8 output, to avoid converting
3683      * to/from code points */
3684     bool utf8_has_been_setup = FALSE;
3685
3686     dVAR;
3687
3688     U8 *pat = (U8*)STRING(text_node);
3689     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3690
3691     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
3692
3693         /* In an exact node, only one thing can be matched, that first
3694          * character.  If both the pat and the target are UTF-8, we can just
3695          * copy the input to the output, avoiding finding the code point of
3696          * that character */
3697         if (!is_utf8_pat) {
3698             c2 = c1 = *pat;
3699         }
3700         else if (utf8_target) {
3701             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3702             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3703             utf8_has_been_setup = TRUE;
3704         }
3705         else {
3706             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3707         }
3708     }
3709     else { /* an EXACTFish node */
3710         U8 *pat_end = pat + STR_LEN(text_node);
3711
3712         /* An EXACTFL node has at least some characters unfolded, because what
3713          * they match is not known until now.  So, now is the time to fold
3714          * the first few of them, as many as are needed to determine 'c1' and
3715          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3716          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3717          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3718          * need to fold as many characters as a single character can fold to,
3719          * so that later we can check if the first ones are such a multi-char
3720          * fold.  But, in such a pattern only locale-problematic characters
3721          * aren't folded, so we can skip this completely if the first character
3722          * in the node isn't one of the tricky ones */
3723         if (OP(text_node) == EXACTFL) {
3724
3725             if (! is_utf8_pat) {
3726                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3727                 {
3728                     folded[0] = folded[1] = 's';
3729                     pat = folded;
3730                     pat_end = folded + 2;
3731                 }
3732             }
3733             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3734                 U8 *s = pat;
3735                 U8 *d = folded;
3736                 int i;
3737
3738                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3739                     if (isASCII(*s)) {
3740                         *(d++) = (U8) toFOLD_LC(*s);
3741                         s++;
3742                     }
3743                     else {
3744                         STRLEN len;
3745                         _to_utf8_fold_flags(s,
3746                                             d,
3747                                             &len,
3748                                             FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3749                         d += len;
3750                         s += UTF8SKIP(s);
3751                     }
3752                 }
3753
3754                 pat = folded;
3755                 pat_end = d;
3756             }
3757         }
3758
3759         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3760              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
3761         {
3762             /* Multi-character folds require more context to sort out.  Also
3763              * PL_utf8_foldclosures used below doesn't handle them, so have to
3764              * be handled outside this routine */
3765             use_chrtest_void = TRUE;
3766         }
3767         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3768             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3769             if (c1 > 255) {
3770                 /* Load the folds hash, if not already done */
3771                 SV** listp;
3772                 if (! PL_utf8_foldclosures) {
3773                     _load_PL_utf8_foldclosures();
3774                 }
3775
3776                 /* The fold closures data structure is a hash with the keys
3777                  * being the UTF-8 of every character that is folded to, like
3778                  * 'k', and the values each an array of all code points that
3779                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
3780                  * Multi-character folds are not included */
3781                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3782                                         (char *) pat,
3783                                         UTF8SKIP(pat),
3784                                         FALSE))))
3785                 {
3786                     /* Not found in the hash, therefore there are no folds
3787                     * containing it, so there is only a single character that
3788                     * could match */
3789                     c2 = c1;
3790                 }
3791                 else {  /* Does participate in folds */
3792                     AV* list = (AV*) *listp;
3793                     if (av_tindex(list) != 1) {
3794
3795                         /* If there aren't exactly two folds to this, it is
3796                          * outside the scope of this function */
3797                         use_chrtest_void = TRUE;
3798                     }
3799                     else {  /* There are two.  Get them */
3800                         SV** c_p = av_fetch(list, 0, FALSE);
3801                         if (c_p == NULL) {
3802                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3803                         }
3804                         c1 = SvUV(*c_p);
3805
3806                         c_p = av_fetch(list, 1, FALSE);
3807                         if (c_p == NULL) {
3808                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3809                         }
3810                         c2 = SvUV(*c_p);
3811
3812                         /* Folds that cross the 255/256 boundary are forbidden
3813                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3814                          * one is ASCIII.  Since the pattern character is above
3815                          * 255, and its only other match is below 256, the only
3816                          * legal match will be to itself.  We have thrown away
3817                          * the original, so have to compute which is the one
3818                          * above 255. */
3819                         if ((c1 < 256) != (c2 < 256)) {
3820                             if ((OP(text_node) == EXACTFL
3821                                  && ! IN_UTF8_CTYPE_LOCALE)
3822                                 || ((OP(text_node) == EXACTFA
3823                                     || OP(text_node) == EXACTFA_NO_TRIE)
3824                                     && (isASCII(c1) || isASCII(c2))))
3825                             {
3826                                 if (c1 < 256) {
3827                                     c1 = c2;
3828                                 }
3829                                 else {
3830                                     c2 = c1;
3831                                 }
3832                             }
3833                         }
3834                     }
3835                 }
3836             }
3837             else /* Here, c1 is <= 255 */
3838                 if (utf8_target
3839                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3840                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3841                     && ((OP(text_node) != EXACTFA
3842                         && OP(text_node) != EXACTFA_NO_TRIE)
3843                         || ! isASCII(c1)))
3844             {
3845                 /* Here, there could be something above Latin1 in the target
3846                  * which folds to this character in the pattern.  All such
3847                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3848                  * than two characters involved in their folds, so are outside
3849                  * the scope of this function */
3850                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3851                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3852                 }
3853                 else {
3854                     use_chrtest_void = TRUE;
3855                 }
3856             }
3857             else { /* Here nothing above Latin1 can fold to the pattern
3858                       character */
3859                 switch (OP(text_node)) {
3860
3861                     case EXACTFL:   /* /l rules */
3862                         c2 = PL_fold_locale[c1];
3863                         break;
3864
3865                     case EXACTF:   /* This node only generated for non-utf8
3866                                     patterns */
3867                         assert(! is_utf8_pat);
3868                         if (! utf8_target) {    /* /d rules */
3869                             c2 = PL_fold[c1];
3870                             break;
3871                         }
3872                         /* FALLTHROUGH */
3873                         /* /u rules for all these.  This happens to work for
3874                         * EXACTFA as nothing in Latin1 folds to ASCII */
3875                     case EXACTFA_NO_TRIE:   /* This node only generated for
3876                                             non-utf8 patterns */
3877                         assert(! is_utf8_pat);
3878                         /* FALLTHROUGH */
3879                     case EXACTFA:
3880                     case EXACTFU_SS:
3881                     case EXACTFU:
3882                         c2 = PL_fold_latin1[c1];
3883                         break;
3884
3885                     default:
3886                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3887                         NOT_REACHED; /* NOTREACHED */
3888                 }
3889             }
3890         }
3891     }
3892
3893     /* Here have figured things out.  Set up the returns */
3894     if (use_chrtest_void) {
3895         *c2p = *c1p = CHRTEST_VOID;
3896     }
3897     else if (utf8_target) {
3898         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3899             uvchr_to_utf8(c1_utf8, c1);
3900             uvchr_to_utf8(c2_utf8, c2);
3901         }
3902
3903         /* Invariants are stored in both the utf8 and byte outputs; Use
3904          * negative numbers otherwise for the byte ones.  Make sure that the
3905          * byte ones are the same iff the utf8 ones are the same */
3906         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3907         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3908                 ? *c2_utf8
3909                 : (c1 == c2)
3910                   ? CHRTEST_NOT_A_CP_1
3911                   : CHRTEST_NOT_A_CP_2;
3912     }
3913     else if (c1 > 255) {
3914        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3915                            can represent */
3916            return FALSE;
3917        }
3918
3919        *c1p = *c2p = c2;    /* c2 is the only representable value */
3920     }
3921     else {  /* c1 is representable; see about c2 */
3922        *c1p = c1;
3923        *c2p = (c2 < 256) ? c2 : c1;
3924     }
3925
3926     return TRUE;
3927 }
3928
3929 /* returns -1 on failure, $+[0] on success */
3930 STATIC SSize_t
3931 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3932 {
3933 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3934     dMY_CXT;
3935 #endif
3936     dVAR;
3937     const bool utf8_target = reginfo->is_utf8_target;
3938     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3939     REGEXP *rex_sv = reginfo->prog;
3940     regexp *rex = ReANY(rex_sv);
3941     RXi_GET_DECL(rex,rexi);
3942     /* the current state. This is a cached copy of PL_regmatch_state */
3943     regmatch_state *st;
3944     /* cache heavy used fields of st in registers */
3945     regnode *scan;
3946     regnode *next;
3947     U32 n = 0;  /* general value; init to avoid compiler warning */
3948     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
3949     char *locinput = startpos;
3950     char *pushinput; /* where to continue after a PUSH */
3951     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3952
3953     bool result = 0;        /* return value of S_regmatch */
3954     int depth = 0;          /* depth of backtrack stack */
3955     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3956     const U32 max_nochange_depth =
3957         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3958         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3959     regmatch_state *yes_state = NULL; /* state to pop to on success of
3960                                                             subpattern */
3961     /* mark_state piggy backs on the yes_state logic so that when we unwind
3962        the stack on success we can update the mark_state as we go */
3963     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3964     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3965     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3966     U32 state_num;
3967     bool no_final = 0;      /* prevent failure from backtracking? */
3968     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3969     char *startpoint = locinput;
3970     SV *popmark = NULL;     /* are we looking for a mark? */
3971     SV *sv_commit = NULL;   /* last mark name seen in failure */
3972     SV *sv_yes_mark = NULL; /* last mark name we have seen
3973                                during a successful match */
3974     U32 lastopen = 0;       /* last open we saw */
3975     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3976     SV* const oreplsv = GvSVn(PL_replgv);
3977     /* these three flags are set by various ops to signal information to
3978      * the very next op. They have a useful lifetime of exactly one loop
3979      * iteration, and are not preserved or restored by state pushes/pops
3980      */
3981     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3982     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3983     int logical = 0;        /* the following EVAL is:
3984                                 0: (?{...})
3985                                 1: (?(?{...})X|Y)
3986                                 2: (??{...})
3987                                or the following IFMATCH/UNLESSM is:
3988                                 false: plain (?=foo)
3989                                 true:  used as a condition: (?(?=foo))
3990                             */
3991     PAD* last_pad = NULL;
3992     dMULTICALL;
3993     I32 gimme = G_SCALAR;
3994     CV *caller_cv = NULL;       /* who called us */
3995     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3996     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3997     U32 maxopenparen = 0;       /* max '(' index seen so far */
3998     int to_complement;  /* Invert the result? */
3999     _char_class_number classnum;
4000     bool is_utf8_pat = reginfo->is_utf8_pat;
4001
4002 #ifdef DEBUGGING
4003     GET_RE_DEBUG_FLAGS_DECL;
4004 #endif
4005
4006     /* protect against undef(*^R) */
4007     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
4008
4009     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
4010     multicall_oldcatch = 0;
4011     multicall_cv = NULL;
4012     cx = NULL;
4013     PERL_UNUSED_VAR(multicall_cop);
4014     PERL_UNUSED_VAR(newsp);
4015
4016
4017     PERL_ARGS_ASSERT_REGMATCH;
4018
4019     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
4020             PerlIO_printf(Perl_debug_log,"regmatch start\n");
4021     }));
4022
4023     st = PL_regmatch_state;
4024
4025     /* Note that nextchr is a byte even in UTF */
4026     SET_nextchr;
4027     scan = prog;
4028     while (scan != NULL) {
4029
4030         DEBUG_EXECUTE_r( {
4031             SV * const prop = sv_newmortal();
4032             regnode *rnext=regnext(scan);
4033             DUMP_EXEC_POS( locinput, scan, utf8_target );
4034             regprop(rex, prop, scan, reginfo, NULL);
4035
4036             PerlIO_printf(Perl_debug_log,
4037                     "%3"IVdf":%*s%s(%"IVdf")\n",
4038                     (IV)(scan - rexi->program), depth*2, "",
4039                     SvPVX_const(prop),
4040                     (PL_regkind[OP(scan)] == END || !rnext) ?
4041                         0 : (IV)(rnext - rexi->program));
4042         });
4043
4044         next = scan + NEXT_OFF(scan);
4045         if (next == scan)
4046             next = NULL;
4047         state_num = OP(scan);
4048
4049       reenter_switch:
4050         to_complement = 0;
4051
4052         SET_nextchr;
4053         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4054
4055         switch (state_num) {
4056         case SBOL: /*  /^../ and /\A../  */
4057             if (locinput == reginfo->strbeg)
4058                 break;
4059             sayNO;
4060
4061         case MBOL: /*  /^../m  */
4062             if (locinput == reginfo->strbeg ||
4063                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4064             {
4065                 break;
4066             }
4067             sayNO;
4068
4069         case GPOS: /*  \G  */
4070             if (locinput == reginfo->ganch)
4071                 break;
4072             sayNO;
4073
4074         case KEEPS: /*   \K  */
4075             /* update the startpoint */
4076             st->u.keeper.val = rex->offs[0].start;
4077             rex->offs[0].start = locinput - reginfo->strbeg;
4078             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4079             /* NOTREACHED */
4080             NOT_REACHED;
4081
4082         case KEEPS_next_fail:
4083             /* rollback the start point change */
4084             rex->offs[0].start = st->u.keeper.val;
4085             sayNO_SILENT;
4086             /* NOTREACHED */
4087             NOT_REACHED;
4088
4089         case MEOL: /* /..$/m  */
4090             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4091                 sayNO;
4092             break;
4093
4094         case SEOL: /* /..$/  */
4095             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4096                 sayNO;
4097             if (reginfo->strend - locinput > 1)
4098                 sayNO;
4099             break;
4100
4101         case EOS: /*  \z  */
4102             if (!NEXTCHR_IS_EOS)
4103                 sayNO;
4104             break;
4105
4106         case SANY: /*  /./s  */
4107             if (NEXTCHR_IS_EOS)
4108                 sayNO;
4109             goto increment_locinput;
4110
4111         case CANY: /*  \C  */
4112             if (NEXTCHR_IS_EOS)
4113                 sayNO;
4114             locinput++;
4115             break;
4116
4117         case REG_ANY: /*  /./  */
4118             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4119                 sayNO;
4120             goto increment_locinput;
4121
4122
4123 #undef  ST
4124 #define ST st->u.trie
4125         case TRIEC: /* (ab|cd) with known charclass */
4126             /* In this case the charclass data is available inline so
4127                we can fail fast without a lot of extra overhead.
4128              */
4129             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4130                 DEBUG_EXECUTE_r(
4131                     PerlIO_printf(Perl_debug_log,
4132                               "%*s  %sfailed to match trie start class...%s\n",
4133                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4134                 );
4135                 sayNO_SILENT;
4136                 /* NOTREACHED */
4137                 NOT_REACHED;
4138             }
4139             /* FALLTHROUGH */
4140         case TRIE:  /* (ab|cd)  */
4141             /* the basic plan of execution of the trie is:
4142              * At the beginning, run though all the states, and
4143              * find the longest-matching word. Also remember the position
4144              * of the shortest matching word. For example, this pattern:
4145              *    1  2 3 4    5
4146              *    ab|a|x|abcd|abc
4147              * when matched against the string "abcde", will generate
4148              * accept states for all words except 3, with the longest
4149              * matching word being 4, and the shortest being 2 (with
4150              * the position being after char 1 of the string).
4151              *
4152              * Then for each matching word, in word order (i.e. 1,2,4,5),
4153              * we run the remainder of the pattern; on each try setting
4154              * the current position to the character following the word,
4155              * returning to try the next word on failure.
4156              *
4157              * We avoid having to build a list of words at runtime by
4158              * using a compile-time structure, wordinfo[].prev, which
4159              * gives, for each word, the previous accepting word (if any).
4160              * In the case above it would contain the mappings 1->2, 2->0,
4161              * 3->0, 4->5, 5->1.  We can use this table to generate, from
4162              * the longest word (4 above), a list of all words, by
4163              * following the list of prev pointers; this gives us the
4164              * unordered list 4,5,1,2. Then given the current word we have
4165              * just tried, we can go through the list and find the
4166              * next-biggest word to try (so if we just failed on word 2,
4167              * the next in the list is 4).
4168              *
4169              * Since at runtime we don't record the matching position in
4170              * the string for each word, we have to work that out for
4171              * each word we're about to process. The wordinfo table holds
4172              * the character length of each word; given that we recorded
4173              * at the start: the position of the shortest word and its
4174              * length in chars, we just need to move the pointer the
4175              * difference between the two char lengths. Depending on
4176              * Unicode status and folding, that's cheap or expensive.
4177              *
4178              * This algorithm is optimised for the case where are only a
4179              * small number of accept states, i.e. 0,1, or maybe 2.
4180              * With lots of accepts states, and having to try all of them,
4181              * it becomes quadratic on number of accept states to find all
4182              * the next words.
4183              */
4184
4185             {
4186                 /* what type of TRIE am I? (utf8 makes this contextual) */
4187                 DECL_TRIE_TYPE(scan);
4188
4189                 /* what trie are we using right now */
4190                 reg_trie_data * const trie
4191                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
4192                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
4193                 U32 state = trie->startstate;
4194
4195                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
4196                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
4197                     if (utf8_target
4198                         && UTF8_IS_ABOVE_LATIN1(nextchr)
4199                         && scan->flags == EXACTL)
4200                     {
4201                         /* We only output for EXACTL, as we let the folder
4202                          * output this message for EXACTFLU8 to avoid
4203                          * duplication */
4204                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
4205                                                                reginfo->strend);
4206                     }
4207                 }
4208                 if (   trie->bitmap
4209                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4210                 {
4211                     if (trie->states[ state ].wordnum) {
4212                          DEBUG_EXECUTE_r(
4213                             PerlIO_printf(Perl_debug_log,
4214                                           "%*s  %smatched empty string...%s\n",
4215                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4216                         );
4217                         if (!trie->jump)
4218                             break;
4219                     } else {
4220                         DEBUG_EXECUTE_r(
4221                             PerlIO_printf(Perl_debug_log,
4222                                           "%*s  %sfailed to match trie start class...%s\n",
4223                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4224                         );
4225                         sayNO_SILENT;
4226                    }
4227                 }
4228
4229             {
4230                 U8 *uc = ( U8* )locinput;
4231
4232                 STRLEN len = 0;
4233                 STRLEN foldlen = 0;
4234                 U8 *uscan = (U8*)NULL;
4235                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
4236                 U32 charcount = 0; /* how many input chars we have matched */
4237                 U32 accepted = 0; /* have we seen any accepting states? */
4238
4239                 ST.jump = trie->jump;
4240                 ST.me = scan;
4241                 ST.firstpos = NULL;
4242                 ST.longfold = FALSE; /* char longer if folded => it's harder */
4243                 ST.nextword = 0;
4244
4245                 /* fully traverse the TRIE; note the position of the
4246                    shortest accept state and the wordnum of the longest
4247                    accept state */
4248
4249                 while ( state && uc <= (U8*)(reginfo->strend) ) {
4250                     U32 base = trie->states[ state ].trans.base;
4251                     UV uvc = 0;
4252                     U16 charid = 0;
4253                     U16 wordnum;
4254                     wordnum = trie->states[ state ].wordnum;
4255
4256                     if (wordnum) { /* it's an accept state */
4257                         if (!accepted) {
4258                             accepted = 1;
4259                             /* record first match position */
4260                             if (ST.longfold) {
4261                                 ST.firstpos = (U8*)locinput;
4262                                 ST.firstchars = 0;
4263                             }
4264                             else {
4265                                 ST.firstpos = uc;
4266                                 ST.firstchars = charcount;
4267                             }
4268                         }
4269                         if (!ST.nextword || wordnum < ST.nextword)
4270                             ST.nextword = wordnum;
4271                         ST.topword = wordnum;
4272                     }
4273
4274                     DEBUG_TRIE_EXECUTE_r({
4275                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
4276                                 PerlIO_printf( Perl_debug_log,
4277                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
4278                                     2+depth * 2, "", PL_colors[4],
4279                                     (UV)state, (accepted ? 'Y' : 'N'));
4280                     });
4281
4282                     /* read a char and goto next state */
4283                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
4284                         I32 offset;
4285                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4286                                              uscan, len, uvc, charid, foldlen,
4287                                              foldbuf, uniflags);
4288                         charcount++;
4289                         if (foldlen>0)
4290                             ST.longfold = TRUE;
4291                         if (charid &&
4292                              ( ((offset =
4293                               base + charid - 1 - trie->uniquecharcount)) >= 0)
4294
4295                              && ((U32)offset < trie->lasttrans)
4296                              && trie->trans[offset].check == state)
4297                         {
4298                             state = trie->trans[offset].next;
4299                         }
4300                         else {
4301                             state = 0;
4302                         }
4303                         uc += len;
4304
4305                     }
4306                     else {
4307                         state = 0;
4308                     }
4309                     DEBUG_TRIE_EXECUTE_r(
4310                         PerlIO_printf( Perl_debug_log,
4311                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
4312                             charid, uvc, (UV)state, PL_colors[5] );
4313                     );
4314                 }
4315                 if (!accepted)
4316                    sayNO;
4317
4318                 /* calculate total number of accept states */
4319                 {
4320                     U16 w = ST.topword;
4321                     accepted = 0;
4322                     while (w) {
4323                         w = trie->wordinfo[w].prev;
4324                         accepted++;
4325                     }
4326                     ST.accepted = accepted;
4327                 }
4328
4329                 DEBUG_EXECUTE_r(
4330                     PerlIO_printf( Perl_debug_log,
4331                         "%*s  %sgot %"IVdf" possible matches%s\n",
4332                         REPORT_CODE_OFF + depth * 2, "",
4333                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4334                 );
4335                 goto trie_first_try; /* jump into the fail handler */
4336             }}
4337             /* NOTREACHED */
4338             NOT_REACHED;
4339
4340         case TRIE_next_fail: /* we failed - try next alternative */
4341         {
4342             U8 *uc;
4343             if ( ST.jump) {
4344                 REGCP_UNWIND(ST.cp);
4345                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4346             }
4347             if (!--ST.accepted) {
4348                 DEBUG_EXECUTE_r({
4349                     PerlIO_printf( Perl_debug_log,
4350                         "%*s  %sTRIE failed...%s\n",
4351                         REPORT_CODE_OFF+depth*2, "",
4352                         PL_colors[4],
4353                         PL_colors[5] );
4354                 });
4355                 sayNO_SILENT;
4356             }
4357             {
4358                 /* Find next-highest word to process.  Note that this code
4359                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
4360                 U16 min = 0;
4361                 U16 word;
4362                 U16 const nextword = ST.nextword;
4363                 reg_trie_wordinfo * const wordinfo
4364                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4365                 for (word=ST.topword; word; word=wordinfo[word].prev) {
4366                     if (word > nextword && (!min || word < min))
4367                         min = word;
4368                 }
4369                 ST.nextword = min;
4370             }
4371
4372           trie_first_try:
4373             if (do_cutgroup) {
4374                 do_cutgroup = 0;
4375                 no_final = 0;
4376             }
4377
4378             if ( ST.jump) {
4379                 ST.lastparen = rex->lastparen;
4380                 ST.lastcloseparen = rex->lastcloseparen;
4381                 REGCP_SET(ST.cp);
4382             }
4383
4384             /* find start char of end of current word */
4385             {
4386                 U32 chars; /* how many chars to skip */
4387                 reg_trie_data * const trie
4388                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4389
4390                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4391                             >=  ST.firstchars);
4392                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4393                             - ST.firstchars;
4394                 uc = ST.firstpos;
4395
4396                 if (ST.longfold) {
4397                     /* the hard option - fold each char in turn and find
4398                      * its folded length (which may be different */
4399                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4400                     STRLEN foldlen;
4401                     STRLEN len;
4402                     UV uvc;
4403                     U8 *uscan;
4404
4405                     while (chars) {
4406                         if (utf8_target) {
4407                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4408                                                     uniflags);
4409                             uc += len;
4410                         }
4411                         else {
4412                             uvc = *uc;
4413                             uc++;
4414                         }
4415                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4416                         uscan = foldbuf;
4417                         while (foldlen) {
4418                             if (!--chars)
4419                                 break;
4420                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4421                                             uniflags);
4422                             uscan += len;
4423                             foldlen -= len;
4424                         }
4425                     }
4426                 }
4427                 else {
4428                     if (utf8_target)
4429                         while (chars--)
4430                             uc += UTF8SKIP(uc);
4431                     else
4432                         uc += chars;
4433                 }
4434             }
4435
4436             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4437                             ? ST.jump[ST.nextword]
4438                             : NEXT_OFF(ST.me));
4439
4440             DEBUG_EXECUTE_r({
4441                 PerlIO_printf( Perl_debug_log,
4442                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4443                     REPORT_CODE_OFF+depth*2, "",
4444                     PL_colors[4],
4445                     ST.nextword,
4446                     PL_colors[5]
4447                     );
4448             });
4449
4450             if (ST.accepted > 1 || has_cutgroup) {
4451                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4452                 /* NOTREACHED */
4453                 NOT_REACHED;
4454             }
4455             /* only one choice left - just continue */
4456             DEBUG_EXECUTE_r({
4457                 AV *const trie_words
4458                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4459                 SV ** const tmp = av_fetch( trie_words,
4460                     ST.nextword-1, 0 );
4461                 SV *sv= tmp ? sv_newmortal() : NULL;
4462
4463                 PerlIO_printf( Perl_debug_log,
4464                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4465                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4466                     ST.nextword,
4467                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4468                             PL_colors[0], PL_colors[1],
4469                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4470                         )
4471                     : "not compiled under -Dr",
4472                     PL_colors[5] );
4473             });
4474
4475             locinput = (char*)uc;
4476             continue; /* execute rest of RE */
4477             /* NOTREACHED */
4478         }
4479 #undef  ST
4480
4481         case EXACTL:             /*  /abc/l       */
4482             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
4483
4484             /* Complete checking would involve going through every character
4485              * matched by the string to see if any is above latin1.  But the
4486              * comparision otherwise might very well be a fast assembly
4487              * language routine, and I (khw) don't think slowing things down
4488              * just to check for this warning is worth it.  So this just checks
4489              * the first character */
4490             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
4491                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
4492             }
4493             /* FALLTHROUGH */
4494         case EXACT: {            /*  /abc/        */
4495             char *s = STRING(scan);
4496             ln = STR_LEN(scan);
4497             if (utf8_target != is_utf8_pat) {
4498                 /* The target and the pattern have differing utf8ness. */
4499                 char *l = locinput;
4500                 const char * const e = s + ln;
4501
4502                 if (utf8_target) {
4503                     /* The target is utf8, the pattern is not utf8.
4504                      * Above-Latin1 code points can't match the pattern;
4505                      * invariants match exactly, and the other Latin1 ones need
4506                      * to be downgraded to a single byte in order to do the
4507                      * comparison.  (If we could be confident that the target
4508                      * is not malformed, this could be refactored to have fewer
4509                      * tests by just assuming that if the first bytes match, it
4510                      * is an invariant, but there are tests in the test suite
4511                      * dealing with (??{...}) which violate this) */
4512                     while (s < e) {
4513                         if (l >= reginfo->strend
4514                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4515                         {
4516                             sayNO;
4517                         }
4518                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4519                             if (*l != *s) {
4520                                 sayNO;
4521                             }
4522                             l++;
4523                         }
4524                         else {
4525                             if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4526                             {
4527                                 sayNO;
4528                             }
4529                             l += 2;
4530                         }
4531                         s++;
4532                     }
4533                 }
4534                 else {
4535                     /* The target is not utf8, the pattern is utf8. */
4536                     while (s < e) {
4537                         if (l >= reginfo->strend
4538                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4539                         {
4540                             sayNO;
4541                         }
4542                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4543                             if (*s != *l) {
4544                                 sayNO;
4545                             }
4546                             s++;
4547                         }
4548                         else {
4549                             if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4550                             {
4551                                 sayNO;
4552                             }
4553                             s += 2;
4554                         }
4555                         l++;
4556                     }
4557                 }
4558                 locinput = l;
4559             }
4560             else {
4561                 /* The target and the pattern have the same utf8ness. */
4562                 /* Inline the first character, for speed. */
4563                 if (reginfo->strend - locinput < ln
4564                     || UCHARAT(s) != nextchr
4565                     || (ln > 1 && memNE(s, locinput, ln)))
4566                 {
4567                     sayNO;
4568                 }
4569                 locinput += ln;
4570             }
4571             break;
4572             }
4573
4574         case EXACTFL: {          /*  /abc/il      */
4575             re_fold_t folder;
4576             const U8 * fold_array;
4577             const char * s;
4578             U32 fold_utf8_flags;
4579
4580             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
4581             folder = foldEQ_locale;
4582             fold_array = PL_fold_locale;
4583             fold_utf8_flags = FOLDEQ_LOCALE;
4584             goto do_exactf;
4585
4586         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
4587                                       is effectively /u; hence to match, target
4588                                       must be UTF-8. */
4589             if (! utf8_target) {
4590                 sayNO;
4591             }
4592             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
4593                                              | FOLDEQ_S1_FOLDS_SANE;
4594             folder = foldEQ_latin1;
4595             fold_array = PL_fold_latin1;
4596             goto do_exactf;
4597
4598         case EXACTFU_SS:         /*  /\x{df}/iu   */
4599         case EXACTFU:            /*  /abc/iu      */
4600             folder = foldEQ_latin1;
4601             fold_array = PL_fold_latin1;
4602             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4603             goto do_exactf;
4604
4605         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
4606                                    patterns */
4607             assert(! is_utf8_pat);
4608             /* FALLTHROUGH */
4609         case EXACTFA:            /*  /abc/iaa     */
4610             folder = foldEQ_latin1;
4611             fold_array = PL_fold_latin1;
4612             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4613             goto do_exactf;
4614
4615         case EXACTF:             /*  /abc/i    This node only generated for
4616                                                non-utf8 patterns */
4617             assert(! is_utf8_pat);
4618             folder = foldEQ;
4619             fold_array = PL_fold;
4620             fold_utf8_flags = 0;
4621
4622           do_exactf:
4623             s = STRING(scan);
4624             ln = STR_LEN(scan);
4625
4626             if (utf8_target
4627                 || is_utf8_pat
4628                 || state_num == EXACTFU_SS
4629                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4630             {
4631               /* Either target or the pattern are utf8, or has the issue where
4632                * the fold lengths may differ. */
4633                 const char * const l = locinput;
4634                 char *e = reginfo->strend;
4635
4636                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4637                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4638                 {
4639                     sayNO;
4640                 }
4641                 locinput = e;
4642                 break;
4643             }
4644
4645             /* Neither the target nor the pattern are utf8 */
4646             if (UCHARAT(s) != nextchr
4647                 && !NEXTCHR_IS_EOS
4648                 && UCHARAT(s) != fold_array[nextchr])
4649             {
4650                 sayNO;
4651             }
4652             if (reginfo->strend - locinput < ln)
4653                 sayNO;
4654             if (ln > 1 && ! folder(s, locinput, ln))
4655                 sayNO;
4656             locinput += ln;
4657             break;
4658         }
4659
4660         /* XXX At that point regcomp.c would no longer * have to set the FLAGS fields of these */
4661         case NBOUNDL: /*  /\B/l  */
4662             to_complement = 1;
4663             /* FALLTHROUGH */
4664
4665         case BOUNDL:  /*  /\b/l  */
4666             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
4667             if (utf8_target) {
4668                 if (locinput == reginfo->strbeg)
4669                     ln = isWORDCHAR_LC('\n');
4670                 else {
4671                     ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
4672                                                         (U8*)(reginfo->strbeg)));
4673                 }
4674                 n = (NEXTCHR_IS_EOS)
4675                     ? isWORDCHAR_LC('\n')
4676                     : isWORDCHAR_LC_utf8((U8*)locinput);
4677             }
4678             else { /* Here the string isn't utf8 */
4679                 ln = (locinput == reginfo->strbeg)
4680                      ? isWORDCHAR_LC('\n')
4681                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
4682                 n = (NEXTCHR_IS_EOS)
4683                     ? isWORDCHAR_LC('\n')
4684                     : isWORDCHAR_LC(nextchr);
4685             }
4686             if (to_complement ^ (ln == n)) {
4687                 sayNO;
4688             }
4689             break;
4690
4691         case NBOUND:  /*  /\B/   */
4692             to_complement = 1;
4693             /* FALLTHROUGH */
4694
4695         case BOUND:   /*  /\b/   */
4696             if (utf8_target) {
4697                 goto bound_utf8;
4698             }
4699             goto bound_ascii_match_only;
4700
4701         case NBOUNDA: /*  /\B/a  */
4702             to_complement = 1;
4703             /* FALLTHROUGH */
4704
4705         case BOUNDA:  /*  /\b/a  */
4706
4707           bound_ascii_match_only:
4708             /* Here the string isn't utf8, or is utf8 and only ascii characters
4709              * are to match \w.  In the latter case looking at the byte just
4710              * prior to the current one may be just the final byte of a
4711              * multi-byte character.  This is ok.  There are two cases:
4712              * 1) it is a single byte character, and then the test is doing
4713              *    just what it's supposed to.
4714              * 2) it is a multi-byte character, in which case the final byte is
4715              *    never mistakable for ASCII, and so the test will say it is
4716              *    not a word character, which is the correct answer. */
4717             ln = (locinput == reginfo->strbeg)
4718                  ? isWORDCHAR_A('\n')
4719                  : isWORDCHAR_A(UCHARAT(locinput - 1));
4720             n = (NEXTCHR_IS_EOS)
4721                 ? isWORDCHAR_A('\n')
4722                 : isWORDCHAR_A(nextchr);
4723             if (to_complement ^ (ln == n)) {
4724                 sayNO;
4725             }
4726             break;
4727
4728         case NBOUNDU: /*  /\B/u  */
4729             to_complement = 1;
4730             /* FALLTHROUGH */
4731
4732         case BOUNDU:  /*  /\b/u  */
4733             if (utf8_target) {
4734
4735           bound_utf8:
4736                 ln = (locinput == reginfo->strbeg)
4737                      ? isWORDCHAR_L1('\n')
4738                      : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
4739                                                         (U8*)(reginfo->strbeg)));
4740                 n = (NEXTCHR_IS_EOS)
4741                     ? isWORDCHAR_L1('\n')
4742                     : isWORDCHAR_utf8((U8*)locinput);
4743             }
4744             else {
4745                 ln = (locinput == reginfo->strbeg)
4746                     ? isWORDCHAR_L1('\n')
4747                     : isWORDCHAR_L1(UCHARAT(locinput - 1));
4748                 n = (NEXTCHR_IS_EOS)
4749                     ? isWORDCHAR_L1('\n')
4750                     : isWORDCHAR_L1(nextchr);
4751
4752             }
4753
4754             if (to_complement ^ (ln == n)) {
4755                 sayNO;
4756             }
4757             break;
4758
4759         case ANYOFL:  /*  /[abc]/l      */
4760             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
4761             /* FALLTHROUGH */
4762         case ANYOF:  /*   /[abc]/       */
4763             if (NEXTCHR_IS_EOS)
4764                 sayNO;
4765             if (utf8_target) {
4766                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4767                                                                    utf8_target))
4768                     sayNO;
4769                 locinput += UTF8SKIP(locinput);
4770             }
4771             else {
4772                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4773                     sayNO;
4774                 locinput++;
4775             }
4776             break;
4777
4778         /* The argument (FLAGS) to all the POSIX node types is the class number
4779          * */
4780
4781         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4782             to_complement = 1;
4783             /* FALLTHROUGH */
4784
4785         case POSIXL:    /* \w or [:punct:] etc. under /l */
4786             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
4787             if (NEXTCHR_IS_EOS)
4788                 sayNO;
4789
4790             /* Use isFOO_lc() for characters within Latin1.  (Note that
4791              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4792              * wouldn't be invariant) */
4793             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4794                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4795                     sayNO;
4796                 }
4797             }
4798             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4799                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4800                                            (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4801                                                             *(locinput + 1))))))
4802                 {
4803                     sayNO;
4804                 }
4805             }
4806             else { /* Here, must be an above Latin-1 code point */
4807                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
4808                 goto utf8_posix_above_latin1;
4809             }
4810
4811             /* Here, must be utf8 */
4812             locinput += UTF8SKIP(locinput);
4813             break;
4814
4815         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4816             to_complement = 1;
4817             /* FALLTHROUGH */
4818
4819         case POSIXD:    /* \w or [:punct:] etc. under /d */
4820             if (utf8_target) {
4821                 goto utf8_posix;
4822             }
4823             goto posixa;
4824
4825         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4826
4827             if (NEXTCHR_IS_EOS) {
4828                 sayNO;
4829             }
4830
4831             /* All UTF-8 variants match */
4832             if (! UTF8_IS_INVARIANT(nextchr)) {
4833                 goto increment_locinput;
4834             }
4835
4836             to_complement = 1;
4837             /* FALLTHROUGH */
4838
4839         case POSIXA:    /* \w or [:punct:] etc. under /a */
4840
4841           posixa:
4842             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4843              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4844              * character is a single byte */
4845
4846             if (NEXTCHR_IS_EOS
4847                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4848                                                             FLAGS(scan)))))
4849             {
4850                 sayNO;
4851             }
4852
4853             /* Here we are either not in utf8, or we matched a utf8-invariant,
4854              * so the next char is the next byte */
4855             locinput++;
4856             break;
4857
4858         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4859             to_complement = 1;
4860             /* FALLTHROUGH */
4861
4862         case POSIXU:    /* \w or [:punct:] etc. under /u */
4863           utf8_posix:
4864             if (NEXTCHR_IS_EOS) {
4865                 sayNO;
4866             }
4867
4868             /* Use _generic_isCC() for characters within Latin1.  (Note that
4869              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4870              * wouldn't be invariant) */
4871             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4872                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4873                                                            FLAGS(scan)))))
4874                 {
4875                     sayNO;
4876                 }
4877                 locinput++;
4878             }
4879             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4880                 if (! (to_complement
4881                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4882                                                                *(locinput + 1)),
4883                                              FLAGS(scan)))))
4884                 {
4885                     sayNO;
4886                 }
4887                 locinput += 2;
4888             }
4889             else {  /* Handle above Latin-1 code points */
4890               utf8_posix_above_latin1:
4891                 classnum = (_char_class_number) FLAGS(scan);
4892                 if (classnum < _FIRST_NON_SWASH_CC) {
4893
4894                     /* Here, uses a swash to find such code points.  Load if if
4895                      * not done already */
4896                     if (! PL_utf8_swash_ptrs[classnum]) {
4897                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4898                         PL_utf8_swash_ptrs[classnum]
4899                                 = _core_swash_init("utf8",
4900                                         "",
4901                                         &PL_sv_undef, 1, 0,
4902                                         PL_XPosix_ptrs[classnum], &flags);
4903                     }
4904                     if (! (to_complement
4905                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4906                                                (U8 *) locinput, TRUE))))
4907                     {
4908                         sayNO;
4909                     }
4910                 }
4911                 else {  /* Here, uses macros to find above Latin-1 code points */
4912                     switch (classnum) {
4913                         case _CC_ENUM_SPACE:    /* XXX would require separate
4914                                                    code if we revert the change
4915                                                    of \v matching this */
4916                         case _CC_ENUM_PSXSPC:
4917                             if (! (to_complement
4918                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4919                             {
4920                                 sayNO;
4921                             }
4922                             break;
4923                         case _CC_ENUM_BLANK:
4924                             if (! (to_complement
4925                                             ^ cBOOL(is_HORIZWS_high(locinput))))
4926                             {
4927                                 sayNO;
4928                             }
4929                             break;
4930                         case _CC_ENUM_XDIGIT:
4931                             if (! (to_complement
4932                                             ^ cBOOL(is_XDIGIT_high(locinput))))
4933                             {
4934                                 sayNO;
4935                             }
4936                             break;
4937                         case _CC_ENUM_VERTSPACE:
4938                             if (! (to_complement
4939                                             ^ cBOOL(is_VERTWS_high(locinput))))
4940                             {
4941                                 sayNO;
4942                             }
4943                             break;
4944                         default:    /* The rest, e.g. [:cntrl:], can't match
4945                                        above Latin1 */
4946                             if (! to_complement) {
4947                                 sayNO;
4948                             }
4949                             break;
4950                     }
4951                 }
4952                 locinput += UTF8SKIP(locinput);
4953             }
4954             break;
4955
4956         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4957                        a Unicode extended Grapheme Cluster */
4958             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4959               extended Grapheme Cluster is:
4960
4961             CR LF
4962             | Prepend* Begin Extend*
4963             | .
4964
4965             Begin is:           ( Special_Begin | ! Control )
4966             Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4967             Extend is:          ( Grapheme_Extend | Spacing_Mark )
4968             Control is:         [ GCB_Control | CR | LF ]
4969             Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4970
4971                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4972                we can rewrite
4973
4974                    Begin is ( Regular_Begin + Special Begin )
4975
4976                It turns out that 98.4% of all Unicode code points match
4977                Regular_Begin.  Doing it this way eliminates a table match in
4978                the previous implementation for almost all Unicode code points.
4979
4980                There is a subtlety with Prepend* which showed up in testing.
4981                Note that the Begin, and only the Begin is required in:
4982                 | Prepend* Begin Extend*
4983                Also, Begin contains '! Control'.  A Prepend must be a
4984                '!  Control', which means it must also be a Begin.  What it
4985                comes down to is that if we match Prepend* and then find no
4986                suitable Begin afterwards, that if we backtrack the last
4987                Prepend, that one will be a suitable Begin.
4988             */
4989
4990             if (NEXTCHR_IS_EOS)
4991                 sayNO;
4992             if  (! utf8_target) {
4993
4994                 /* Match either CR LF  or '.', as all the other possibilities
4995                  * require utf8 */
4996                 locinput++;         /* Match the . or CR */
4997                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4998                                        match the LF */
4999                     && locinput < reginfo->strend
5000                     && UCHARAT(locinput) == '\n')
5001                 {
5002                     locinput++;
5003                 }
5004             }
5005             else {
5006
5007                 /* Utf8: See if is ( CR LF ); already know that locinput <
5008                  * reginfo->strend, so locinput+1 is in bounds */
5009                 if ( nextchr == '\r' && locinput+1 < reginfo->strend
5010                      && UCHARAT(locinput + 1) == '\n')
5011                 {
5012                     locinput += 2;
5013                 }
5014                 else {
5015                     STRLEN len;
5016
5017                     /* In case have to backtrack to beginning, then match '.' */
5018                     char *starting = locinput;
5019
5020                     /* In case have to backtrack the last prepend */
5021                     char *previous_prepend = NULL;
5022
5023                     LOAD_UTF8_CHARCLASS_GCB();
5024
5025                     /* Match (prepend)*   */
5026                     while (locinput < reginfo->strend
5027                            && (len = is_GCB_Prepend_utf8(locinput)))
5028                     {
5029                         previous_prepend = locinput;
5030                         locinput += len;
5031                     }
5032
5033                     /* As noted above, if we matched a prepend character, but
5034                      * the next thing won't match, back off the last prepend we
5035                      * matched, as it is guaranteed to match the begin */
5036                     if (previous_prepend
5037                         && (locinput >=  reginfo->strend
5038                             || (! swash_fetch(PL_utf8_X_regular_begin,
5039                                              (U8*)locinput, utf8_target)
5040                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
5041                         )
5042                     {
5043                         locinput = previous_prepend;
5044                     }
5045
5046                     /* Note that here we know reginfo->strend > locinput, as we
5047                      * tested that upon input to this switch case, and if we
5048                      * moved locinput forward, we tested the result just above
5049                      * and it either passed, or we backed off so that it will
5050                      * now pass */
5051                     if (swash_fetch(PL_utf8_X_regular_begin,
5052                                     (U8*)locinput, utf8_target)) {
5053                         locinput += UTF8SKIP(locinput);
5054                     }
5055                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
5056
5057                         /* Here did not match the required 'Begin' in the
5058                          * second term.  So just match the very first
5059                          * character, the '.' of the final term of the regex */
5060                         locinput = starting + UTF8SKIP(starting);
5061                         goto exit_utf8;
5062                     } else {
5063
5064                         /* Here is a special begin.  It can be composed of
5065                          * several individual characters.  One possibility is
5066                          * RI+ */
5067                         if ((len = is_GCB_RI_utf8(locinput))) {
5068                             locinput += len;
5069                             while (locinput < reginfo->strend
5070                                    && (len = is_GCB_RI_utf8(locinput)))
5071                             {
5072                                 locinput += len;
5073                             }
5074                         } else if ((len = is_GCB_T_utf8(locinput))) {
5075                             /* Another possibility is T+ */
5076                             locinput += len;
5077                             while (locinput < reginfo->strend
5078                                 && (len = is_GCB_T_utf8(locinput)))
5079                             {
5080                                 locinput += len;
5081                             }
5082                         } else {
5083
5084                             /* Here, neither RI+ nor T+; must be some other
5085                              * Hangul.  That means it is one of the others: L,
5086                              * LV, LVT or V, and matches:
5087                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
5088
5089                             /* Match L*           */
5090                             while (locinput < reginfo->strend
5091                                    && (len = is_GCB_L_utf8(locinput)))
5092                             {
5093                                 locinput += len;
5094                             }
5095
5096                             /* Here, have exhausted L*.  If the next character
5097                              * is not an LV, LVT nor V, it means we had to have
5098                              * at least one L, so matches L+ in the original
5099                              * equation, we have a complete hangul syllable.
5100                              * Are done. */
5101
5102                             if (locinput < reginfo->strend
5103                                 && is_GCB_LV_LVT_V_utf8(locinput))
5104                             {
5105                                 /* Otherwise keep going.  Must be LV, LVT or V.
5106                                  * See if LVT, by first ruling out V, then LV */
5107                                 if (! is_GCB_V_utf8(locinput)
5108                                         /* All but every TCount one is LV */
5109                                     && (valid_utf8_to_uvchr((U8 *) locinput,
5110                                                                          NULL)
5111                                                                         - SBASE)
5112                                         % TCount != 0)
5113                                 {
5114                                     locinput += UTF8SKIP(locinput);
5115                                 } else {
5116
5117                                     /* Must be  V or LV.  Take it, then match
5118                                      * V*     */
5119                                     locinput += UTF8SKIP(locinput);
5120                                     while (locinput < reginfo->strend
5121                                            && (len = is_GCB_V_utf8(locinput)))
5122                                     {
5123                                         locinput += len;
5124                                     }
5125                                 }
5126
5127                                 /* And any of LV, LVT, or V can be followed
5128                                  * by T*            */
5129                                 while (locinput < reginfo->strend
5130                                        && (len = is_GCB_T_utf8(locinput)))
5131                                 {
5132                                     locinput += len;
5133                                 }
5134                             }
5135                         }
5136                     }
5137
5138                     /* Match any extender */
5139                     while (locinput < reginfo->strend
5140                             && swash_fetch(PL_utf8_X_extend,
5141                                             (U8*)locinput, utf8_target))
5142                     {
5143                         locinput += UTF8SKIP(locinput);
5144                     }
5145                 }
5146               exit_utf8:
5147                 if (locinput > reginfo->strend) sayNO;
5148             }
5149             break;
5150
5151         case NREFFL:  /*  /\g{name}/il  */
5152         {   /* The capture buffer cases.  The ones beginning with N for the
5153                named buffers just convert to the equivalent numbered and
5154                pretend they were called as the corresponding numbered buffer
5155                op.  */
5156             /* don't initialize these in the declaration, it makes C++
5157                unhappy */
5158             const char *s;
5159             char type;
5160             re_fold_t folder;
5161             const U8 *fold_array;
5162             UV utf8_fold_flags;
5163
5164             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5165             folder = foldEQ_locale;
5166             fold_array = PL_fold_locale;
5167             type = REFFL;
5168             utf8_fold_flags = FOLDEQ_LOCALE;
5169             goto do_nref;
5170
5171         case NREFFA:  /*  /\g{name}/iaa  */
5172             folder = foldEQ_latin1;
5173             fold_array = PL_fold_latin1;
5174             type = REFFA;
5175             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5176             goto do_nref;
5177
5178         case NREFFU:  /*  /\g{name}/iu  */
5179             folder = foldEQ_latin1;
5180             fold_array = PL_fold_latin1;
5181             type = REFFU;
5182             utf8_fold_flags = 0;
5183             goto do_nref;
5184
5185         case NREFF:  /*  /\g{name}/i  */
5186             folder = foldEQ;
5187             fold_array = PL_fold;
5188             type = REFF;
5189             utf8_fold_flags = 0;
5190             goto do_nref;
5191
5192         case NREF:  /*  /\g{name}/   */
5193             type = REF;
5194             folder = NULL;
5195             fold_array = NULL;
5196             utf8_fold_flags = 0;
5197           do_nref:
5198
5199             /* For the named back references, find the corresponding buffer
5200              * number */
5201             n = reg_check_named_buff_matched(rex,scan);
5202
5203             if ( ! n ) {
5204                 sayNO;
5205             }
5206             goto do_nref_ref_common;
5207
5208         case REFFL:  /*  /\1/il  */
5209             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5210             folder = foldEQ_locale;
5211             fold_array = PL_fold_locale;
5212             utf8_fold_flags = FOLDEQ_LOCALE;
5213             goto do_ref;
5214
5215         case REFFA:  /*  /\1/iaa  */
5216             folder = foldEQ_latin1;
5217             fold_array = PL_fold_latin1;
5218             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5219             goto do_ref;
5220
5221         case REFFU:  /*  /\1/iu  */
5222             folder = foldEQ_latin1;
5223             fold_array = PL_fold_latin1;
5224             utf8_fold_flags = 0;
5225             goto do_ref;
5226
5227         case REFF:  /*  /\1/i  */
5228             folder = foldEQ;
5229             fold_array = PL_fold;
5230             utf8_fold_flags = 0;
5231             goto do_ref;
5232
5233         case REF:  /*  /\1/    */
5234             folder = NULL;
5235             fold_array = NULL;
5236             utf8_fold_flags = 0;
5237
5238           do_ref:
5239             type = OP(scan);
5240             n = ARG(scan);  /* which paren pair */
5241
5242           do_nref_ref_common:
5243             ln = rex->offs[n].start;
5244             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5245             if (rex->lastparen < n || ln == -1)
5246                 sayNO;                  /* Do not match unless seen CLOSEn. */
5247             if (ln == rex->offs[n].end)
5248                 break;
5249
5250             s = reginfo->strbeg + ln;
5251             if (type != REF     /* REF can do byte comparison */
5252                 && (utf8_target || type == REFFU || type == REFFL))
5253             {
5254                 char * limit = reginfo->strend;
5255
5256                 /* This call case insensitively compares the entire buffer
5257                     * at s, with the current input starting at locinput, but
5258                     * not going off the end given by reginfo->strend, and
5259                     * returns in <limit> upon success, how much of the
5260                     * current input was matched */
5261                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
5262                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
5263                 {
5264                     sayNO;
5265                 }
5266                 locinput = limit;
5267                 break;
5268             }
5269
5270             /* Not utf8:  Inline the first character, for speed. */
5271             if (!NEXTCHR_IS_EOS &&
5272                 UCHARAT(s) != nextchr &&
5273                 (type == REF ||
5274                  UCHARAT(s) != fold_array[nextchr]))
5275                 sayNO;
5276             ln = rex->offs[n].end - ln;
5277             if (locinput + ln > reginfo->strend)
5278                 sayNO;
5279             if (ln > 1 && (type == REF
5280                            ? memNE(s, locinput, ln)
5281                            : ! folder(s, locinput, ln)))
5282                 sayNO;
5283             locinput += ln;
5284             break;
5285         }
5286
5287         case NOTHING: /* null op; e.g. the 'nothing' following
5288                        * the '*' in m{(a+|b)*}' */
5289             break;
5290         case TAIL: /* placeholder while compiling (A|B|C) */
5291             break;
5292
5293 #undef  ST
5294 #define ST st->u.eval
5295         {
5296             SV *ret;
5297             REGEXP *re_sv;
5298             regexp *re;
5299             regexp_internal *rei;
5300             regnode *startpoint;
5301
5302         case GOSTART: /*  (?R)  */
5303         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
5304             if (cur_eval && cur_eval->locinput==locinput) {
5305                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
5306                     Perl_croak(aTHX_ "Infinite recursion in regex");
5307                 if ( ++nochange_depth > max_nochange_depth )
5308                     Perl_croak(aTHX_
5309                         "Pattern subroutine nesting without pos change"
5310                         " exceeded limit in regex");
5311             } else {
5312                 nochange_depth = 0;
5313             }
5314             re_sv = rex_sv;
5315             re = rex;
5316             rei = rexi;
5317             if (OP(scan)==GOSUB) {
5318                 startpoint = scan + ARG2L(scan);
5319                 ST.close_paren = ARG(scan);
5320             } else {
5321                 startpoint = rei->program+1;
5322                 ST.close_paren = 0;
5323             }
5324
5325             /* Save all the positions seen so far. */
5326             ST.cp = regcppush(rex, 0, maxopenparen);
5327             REGCP_SET(ST.lastcp);
5328
5329             /* and then jump to the code we share with EVAL */
5330             goto eval_recurse_doit;
5331             /* NOTREACHED */
5332
5333         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
5334             if (cur_eval && cur_eval->locinput==locinput) {
5335                 if ( ++nochange_depth > max_nochange_depth )
5336                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
5337             } else {
5338                 nochange_depth = 0;
5339             }
5340             {
5341                 /* execute the code in the {...} */
5342
5343                 dSP;
5344                 IV before;
5345                 OP * const oop = PL_op;
5346                 COP * const ocurcop = PL_curcop;
5347                 OP *nop;
5348                 CV *newcv;
5349
5350                 /* save *all* paren positions */
5351                 regcppush(rex, 0, maxopenparen);
5352                 REGCP_SET(runops_cp);
5353
5354                 if (!caller_cv)
5355                     caller_cv = find_runcv(NULL);
5356
5357                 n = ARG(scan);
5358
5359                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
5360                     newcv = (ReANY(
5361                                                 (REGEXP*)(rexi->data->data[n])
5362                                             ))->qr_anoncv
5363                                         ;
5364                     nop = (OP*)rexi->data->data[n+1];
5365                 }
5366                 else if (rexi->data->what[n] == 'l') { /* literal code */
5367                     newcv = caller_cv;
5368                     nop = (OP*)rexi->data->data[n];
5369                     assert(CvDEPTH(newcv));
5370                 }
5371                 else {
5372                     /* literal with own CV */
5373                     assert(rexi->data->what[n] == 'L');
5374                     newcv = rex->qr_anoncv;
5375                     nop = (OP*)rexi->data->data[n];
5376                 }
5377
5378                 /* normally if we're about to execute code from the same
5379                  * CV that we used previously, we just use the existing
5380                  * CX stack entry. However, its possible that in the
5381                  * meantime we may have backtracked, popped from the save
5382                  * stack, and undone the SAVECOMPPAD(s) associated with
5383                  * PUSH_MULTICALL; in which case PL_comppad no longer
5384                  * points to newcv's pad. */
5385                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5386                 {
5387                     U8 flags = (CXp_SUB_RE |
5388                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5389                     if (last_pushed_cv) {
5390                         CHANGE_MULTICALL_FLAGS(newcv, flags);
5391                     }
5392                     else {
5393                         PUSH_MULTICALL_FLAGS(newcv, flags);
5394                     }
5395                     last_pushed_cv = newcv;
5396                 }
5397                 else {
5398                     /* these assignments are just to silence compiler
5399                      * warnings */
5400                     multicall_cop = NULL;
5401                     newsp = NULL;
5402                 }
5403                 last_pad = PL_comppad;
5404
5405                 /* the initial nextstate you would normally execute
5406                  * at the start of an eval (which would cause error
5407                  * messages to come from the eval), may be optimised
5408                  * away from the execution path in the regex code blocks;
5409                  * so manually set PL_curcop to it initially */
5410                 {
5411                     OP *o = cUNOPx(nop)->op_first;
5412                     assert(o->op_type == OP_NULL);
5413                     if (o->op_targ == OP_SCOPE) {
5414                         o = cUNOPo->op_first;
5415                     }
5416                     else {
5417                         assert(o->op_targ == OP_LEAVE);
5418                         o = cUNOPo->op_first;
5419                         assert(o->op_type == OP_ENTER);
5420                         o = OpSIBLING(o);
5421                     }
5422
5423                     if (o->op_type != OP_STUB) {
5424                         assert(    o->op_type == OP_NEXTSTATE
5425                                 || o->op_type == OP_DBSTATE
5426                                 || (o->op_type == OP_NULL
5427                                     &&  (  o->op_targ == OP_NEXTSTATE
5428                                         || o->op_targ == OP_DBSTATE
5429                                         )
5430                                     )
5431                         );
5432                         PL_curcop = (COP*)o;
5433                     }
5434                 }
5435                 nop = nop->op_next;
5436
5437                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5438                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5439
5440                 rex->offs[0].end = locinput - reginfo->strbeg;
5441                 if (reginfo->info_aux_eval->pos_magic)
5442                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5443                                   reginfo->sv, reginfo->strbeg,
5444                                   locinput - reginfo->strbeg);
5445
5446                 if (sv_yes_mark) {
5447                     SV *sv_mrk = get_sv("REGMARK", 1);
5448                     sv_setsv(sv_mrk, sv_yes_mark);
5449                 }
5450
5451                 /* we don't use MULTICALL here as we want to call the
5452                  * first op of the block of interest, rather than the
5453                  * first op of the sub */
5454                 before = (IV)(SP-PL_stack_base);
5455                 PL_op = nop;
5456                 CALLRUNOPS(aTHX);                       /* Scalar context. */
5457                 SPAGAIN;
5458                 if ((IV)(SP-PL_stack_base) == before)
5459                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
5460                 else {
5461                     ret = POPs;
5462                     PUTBACK;
5463                 }
5464
5465                 /* before restoring everything, evaluate the returned
5466                  * value, so that 'uninit' warnings don't use the wrong
5467                  * PL_op or pad. Also need to process any magic vars
5468                  * (e.g. $1) *before* parentheses are restored */
5469
5470                 PL_op = NULL;
5471
5472                 re_sv = NULL;
5473                 if (logical == 0)        /*   (?{})/   */
5474                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5475                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5476                     sw = cBOOL(SvTRUE(ret));
5477                     logical = 0;
5478                 }
5479                 else {                   /*  /(??{})  */
5480                     /*  if its overloaded, let the regex compiler handle
5481                      *  it; otherwise extract regex, or stringify  */
5482                     if (SvGMAGICAL(ret))
5483                         ret = sv_mortalcopy(ret);
5484                     if (!SvAMAGIC(ret)) {
5485                         SV *sv = ret;
5486                         if (SvROK(sv))
5487                             sv = SvRV(sv);
5488                         if (SvTYPE(sv) == SVt_REGEXP)
5489                             re_sv = (REGEXP*) sv;
5490                         else if (SvSMAGICAL(ret)) {
5491                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
5492                             if (mg)
5493                                 re_sv = (REGEXP *) mg->mg_obj;
5494                         }
5495
5496                         /* force any undef warnings here */
5497                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5498                             ret = sv_mortalcopy(ret);
5499                             (void) SvPV_force_nolen(ret);
5500                         }
5501                     }
5502
5503                 }
5504
5505                 /* *** Note that at this point we don't restore
5506                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5507                  * be used again soon. This is safe as long as nothing
5508                  * in the regexp code uses the pad ! */
5509                 PL_op = oop;
5510                 PL_curcop = ocurcop;
5511                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5512                 PL_curpm = PL_reg_curpm;
5513
5514                 if (logical != 2)
5515                     break;
5516             }
5517
5518                 /* only /(??{})/  from now on */
5519                 logical = 0;
5520                 {
5521                     /* extract RE object from returned value; compiling if
5522                      * necessary */
5523
5524                     if (re_sv) {
5525                         re_sv = reg_temp_copy(NULL, re_sv);
5526                     }
5527                     else {
5528                         U32 pm_flags = 0;
5529
5530                         if (SvUTF8(ret) && IN_BYTES) {
5531                             /* In use 'bytes': make a copy of the octet
5532                              * sequence, but without the flag on */
5533                             STRLEN len;
5534                             const char *const p = SvPV(ret, len);
5535                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5536                         }
5537                         if (rex->intflags & PREGf_USE_RE_EVAL)
5538                             pm_flags |= PMf_USE_RE_EVAL;
5539
5540                         /* if we got here, it should be an engine which
5541                          * supports compiling code blocks and stuff */
5542                         assert(rex->engine && rex->engine->op_comp);
5543                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5544                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5545                                     rex->engine, NULL, NULL,
5546                                     /* copy /msixn etc to inner pattern */
5547                                     ARG2L(scan),
5548                                     pm_flags);
5549
5550                         if (!(SvFLAGS(ret)
5551                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
5552                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
5553                             /* This isn't a first class regexp. Instead, it's
5554                                caching a regexp onto an existing, Perl visible
5555                                scalar.  */
5556                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5557                         }
5558                     }
5559                     SAVEFREESV(re_sv);
5560                     re = ReANY(re_sv);
5561                 }
5562                 RXp_MATCH_COPIED_off(re);
5563                 re->subbeg = rex->subbeg;
5564                 re->sublen = rex->sublen;
5565                 re->suboffset = rex->suboffset;
5566                 re->subcoffset = rex->subcoffset;
5567                 re->lastparen = 0;
5568                 re->lastcloseparen = 0;
5569                 rei = RXi_GET(re);
5570                 DEBUG_EXECUTE_r(
5571                     debug_start_match(re_sv, utf8_target, locinput,
5572                                     reginfo->strend, "Matching embedded");
5573                 );
5574                 startpoint = rei->program + 1;
5575                 ST.close_paren = 0; /* only used for GOSUB */
5576                 /* Save all the seen positions so far. */
5577                 ST.cp = regcppush(rex, 0, maxopenparen);
5578                 REGCP_SET(ST.lastcp);
5579                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
5580                 maxopenparen = 0;
5581                 /* run the pattern returned from (??{...}) */
5582
5583               eval_recurse_doit: /* Share code with GOSUB below this line
5584                             * At this point we expect the stack context to be
5585                             * set up correctly */
5586
5587                 /* invalidate the S-L poscache. We're now executing a
5588                  * different set of WHILEM ops (and their associated
5589                  * indexes) against the same string, so the bits in the
5590                  * cache are meaningless. Setting maxiter to zero forces
5591                  * the cache to be invalidated and zeroed before reuse.
5592                  * XXX This is too dramatic a measure. Ideally we should
5593                  * save the old cache and restore when running the outer
5594                  * pattern again */
5595                 reginfo->poscache_maxiter = 0;
5596
5597                 /* the new regexp might have a different is_utf8_pat than we do */
5598                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5599
5600                 ST.prev_rex = rex_sv;
5601                 ST.prev_curlyx = cur_curlyx;
5602                 rex_sv = re_sv;
5603                 SET_reg_curpm(rex_sv);
5604                 rex = re;
5605                 rexi = rei;
5606                 cur_curlyx = NULL;
5607                 ST.B = next;
5608                 ST.prev_eval = cur_eval;
5609                 cur_eval = st;
5610                 /* now continue from first node in postoned RE */
5611                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5612                 /* NOTREACHED */
5613                 NOT_REACHED;
5614         }
5615
5616         case EVAL_AB: /* cleanup after a successful (??{A})B */
5617             /* note: this is called twice; first after popping B, then A */
5618             rex_sv = ST.prev_rex;
5619             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5620             SET_reg_curpm(rex_sv);
5621             rex = ReANY(rex_sv);
5622             rexi = RXi_GET(rex);
5623             {
5624                 /* preserve $^R across LEAVE's. See Bug 121070. */
5625                 SV *save_sv= GvSV(PL_replgv);
5626                 SvREFCNT_inc(save_sv);
5627                 regcpblow(ST.cp); /* LEAVE in disguise */
5628                 sv_setsv(GvSV(PL_replgv), save_sv);
5629                 SvREFCNT_dec(save_sv);
5630             }
5631             cur_eval = ST.prev_eval;
5632             cur_curlyx = ST.prev_curlyx;
5633
5634             /* Invalidate cache. See "invalidate" comment above. */
5635             reginfo->poscache_maxiter = 0;
5636             if ( nochange_depth )
5637                 nochange_depth--;
5638             sayYES;
5639
5640
5641         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5642             /* note: this is called twice; first after popping B, then A */
5643             rex_sv = ST.prev_rex;
5644             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5645             SET_reg_curpm(rex_sv);
5646             rex = ReANY(rex_sv);
5647             rexi = RXi_GET(rex);
5648
5649             REGCP_UNWIND(ST.lastcp);
5650             regcppop(rex, &maxopenparen);
5651             cur_eval = ST.prev_eval;
5652             cur_curlyx = ST.prev_curlyx;
5653             /* Invalidate cache. See "invalidate" comment above. */
5654             reginfo->poscache_maxiter = 0;
5655             if ( nochange_depth )
5656                 nochange_depth--;
5657             sayNO_SILENT;
5658 #undef ST
5659
5660         case OPEN: /*  (  */
5661             n = ARG(scan);  /* which paren pair */
5662             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5663             if (n > maxopenparen)
5664                 maxopenparen = n;
5665             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5666                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5667                 PTR2UV(rex),
5668                 PTR2UV(rex->offs),
5669                 (UV)n,
5670                 (IV)rex->offs[n].start_tmp,
5671                 (UV)maxopenparen
5672             ));
5673             lastopen = n;
5674             break;
5675
5676 /* XXX really need to log other places start/end are set too */
5677 #define CLOSE_CAPTURE \
5678     rex->offs[n].start = rex->offs[n].start_tmp; \
5679     rex->offs[n].end = locinput - reginfo->strbeg; \
5680     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5681         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5682         PTR2UV(rex), \
5683         PTR2UV(rex->offs), \
5684         (UV)n, \
5685         (IV)rex->offs[n].start, \
5686         (IV)rex->offs[n].end \
5687     ))
5688
5689         case CLOSE:  /*  )  */
5690             n = ARG(scan);  /* which paren pair */
5691             CLOSE_CAPTURE;
5692             if (n > rex->lastparen)
5693                 rex->lastparen = n;
5694             rex->lastcloseparen = n;
5695             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5696                 goto fake_end;
5697             }
5698             break;
5699
5700         case ACCEPT:  /*  (*ACCEPT)  */
5701             if (ARG(scan)){
5702                 regnode *cursor;
5703                 for (cursor=scan;
5704                      cursor && OP(cursor)!=END;
5705                      cursor=regnext(cursor))
5706                 {
5707                     if ( OP(cursor)==CLOSE ){
5708                         n = ARG(cursor);
5709                         if ( n <= lastopen ) {
5710                             CLOSE_CAPTURE;
5711                             if (n > rex->lastparen)
5712                                 rex->lastparen = n;
5713                             rex->lastcloseparen = n;
5714                             if ( n == ARG(scan) || (cur_eval &&
5715                                 cur_eval->u.eval.close_paren == n))
5716                                 break;
5717                         }
5718                     }
5719                 }
5720             }
5721             goto fake_end;
5722             /* NOTREACHED */
5723
5724         case GROUPP:  /*  (?(1))  */
5725             n = ARG(scan);  /* which paren pair */
5726             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5727             break;
5728
5729         case NGROUPP:  /*  (?(<name>))  */
5730             /* reg_check_named_buff_matched returns 0 for no match */
5731             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5732             break;
5733
5734         case INSUBP:   /*  (?(R))  */
5735             n = ARG(scan);
5736             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5737             break;
5738
5739         case DEFINEP:  /*  (?(DEFINE))  */
5740             sw = 0;
5741             break;
5742
5743         case IFTHEN:   /*  (?(cond)A|B)  */
5744             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5745             if (sw)
5746                 next = NEXTOPER(NEXTOPER(scan));
5747             else {
5748                 next = scan + ARG(scan);
5749                 if (OP(next) == IFTHEN) /* Fake one. */
5750                     next = NEXTOPER(NEXTOPER(next));
5751             }
5752             break;
5753
5754         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5755             logical = scan->flags;
5756             break;
5757
5758 /*******************************************************************
5759
5760 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5761 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5762 STAR/PLUS/CURLY/CURLYN are used instead.)
5763
5764 A*B is compiled as <CURLYX><A><WHILEM><B>
5765
5766 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5767 state, which contains the current count, initialised to -1. It also sets
5768 cur_curlyx to point to this state, with any previous value saved in the
5769 state block.
5770
5771 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5772 since the pattern may possibly match zero times (i.e. it's a while {} loop
5773 rather than a do {} while loop).
5774
5775 Each entry to WHILEM represents a successful match of A. The count in the
5776 CURLYX block is incremented, another WHILEM state is pushed, and execution
5777 passes to A or B depending on greediness and the current count.
5778
5779 For example, if matching against the string a1a2a3b (where the aN are
5780 substrings that match /A/), then the match progresses as follows: (the
5781 pushed states are interspersed with the bits of strings matched so far):
5782
5783     <CURLYX cnt=-1>
5784     <CURLYX cnt=0><WHILEM>
5785     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5786     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5787     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5788     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5789
5790 (Contrast this with something like CURLYM, which maintains only a single
5791 backtrack state:
5792
5793     <CURLYM cnt=0> a1
5794     a1 <CURLYM cnt=1> a2
5795     a1 a2 <CURLYM cnt=2> a3
5796     a1 a2 a3 <CURLYM cnt=3> b
5797 )
5798
5799 Each WHILEM state block marks a point to backtrack to upon partial failure
5800 of A or B, and also contains some minor state data related to that
5801 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5802 overall state, such as the count, and pointers to the A and B ops.
5803
5804 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5805 must always point to the *current* CURLYX block, the rules are:
5806
5807 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5808 and set cur_curlyx to point the new block.
5809
5810 When popping the CURLYX block after a successful or unsuccessful match,
5811 restore the previous cur_curlyx.
5812
5813 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5814 to the outer one saved in the CURLYX block.
5815
5816 When popping the WHILEM block after a successful or unsuccessful B match,
5817 restore the previous cur_curlyx.
5818
5819 Here's an example for the pattern (AI* BI)*BO
5820 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5821
5822 cur_
5823 curlyx backtrack stack
5824 ------ ---------------
5825 NULL
5826 CO     <CO prev=NULL> <WO>
5827 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5828 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5829 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5830
5831 At this point the pattern succeeds, and we work back down the stack to
5832 clean up, restoring as we go:
5833
5834 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5835 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5836 CO     <CO prev=NULL> <WO>
5837 NULL
5838
5839 *******************************************************************/
5840
5841 #define ST st->u.curlyx
5842
5843         case CURLYX:    /* start of /A*B/  (for complex A) */
5844         {
5845             /* No need to save/restore up to this paren */
5846             I32 parenfloor = scan->flags;
5847
5848             assert(next); /* keep Coverity happy */
5849             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5850                 next += ARG(next);
5851
5852             /* XXXX Probably it is better to teach regpush to support
5853                parenfloor > maxopenparen ... */
5854             if (parenfloor > (I32)rex->lastparen)
5855                 parenfloor = rex->lastparen; /* Pessimization... */
5856
5857             ST.prev_curlyx= cur_curlyx;
5858             cur_curlyx = st;
5859             ST.cp = PL_savestack_ix;
5860
5861             /* these fields contain the state of the current curly.
5862              * they are accessed by subsequent WHILEMs */
5863             ST.parenfloor = parenfloor;
5864             ST.me = scan;
5865             ST.B = next;
5866             ST.minmod = minmod;
5867             minmod = 0;
5868             ST.count = -1;      /* this will be updated by WHILEM */
5869             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5870
5871             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5872             /* NOTREACHED */
5873             NOT_REACHED;
5874         }
5875
5876         case CURLYX_end: /* just finished matching all of A*B */
5877             cur_curlyx = ST.prev_curlyx;
5878             sayYES;
5879             /* NOTREACHED */
5880             NOT_REACHED;
5881
5882         case CURLYX_end_fail: /* just failed to match all of A*B */
5883             regcpblow(ST.cp);
5884             cur_curlyx = ST.prev_curlyx;
5885             sayNO;
5886             /* NOTREACHED */
5887             NOT_REACHED;
5888
5889
5890 #undef ST
5891 #define ST st->u.whilem
5892
5893         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5894         {
5895             /* see the discussion above about CURLYX/WHILEM */
5896             I32 n;
5897             int min, max;
5898             regnode *A;
5899
5900             assert(cur_curlyx); /* keep Coverity happy */
5901
5902             min = ARG1(cur_curlyx->u.curlyx.me);
5903             max = ARG2(cur_curlyx->u.curlyx.me);
5904             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5905             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5906             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5907             ST.cache_offset = 0;
5908             ST.cache_mask = 0;
5909
5910
5911             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5912                   "%*s  whilem: matched %ld out of %d..%d\n",
5913                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5914             );
5915
5916             /* First just match a string of min A's. */
5917
5918             if (n < min) {
5919                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5920                                     maxopenparen);
5921                 cur_curlyx->u.curlyx.lastloc = locinput;
5922                 REGCP_SET(ST.lastcp);
5923
5924                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5925                 /* NOTREACHED */
5926                 NOT_REACHED;
5927             }
5928
5929             /* If degenerate A matches "", assume A done. */
5930
5931             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5932                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5933                    "%*s  whilem: empty match detected, trying continuation...\n",
5934                    REPORT_CODE_OFF+depth*2, "")
5935                 );
5936                 goto do_whilem_B_max;
5937             }
5938
5939             /* super-linear cache processing.
5940              *
5941              * The idea here is that for certain types of CURLYX/WHILEM -
5942              * principally those whose upper bound is infinity (and
5943              * excluding regexes that have things like \1 and other very
5944              * non-regular expresssiony things), then if a pattern like
5945              * /....A*.../ fails and we backtrack to the WHILEM, then we
5946              * make a note that this particular WHILEM op was at string
5947              * position 47 (say) when the rest of pattern failed. Then, if
5948              * we ever find ourselves back at that WHILEM, and at string
5949              * position 47 again, we can just fail immediately rather than
5950              * running the rest of the pattern again.
5951              *
5952              * This is very handy when patterns start to go
5953              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5954              * with a combinatorial explosion of backtracking.
5955              *
5956              * The cache is implemented as a bit array, with one bit per
5957              * string byte position per WHILEM op (up to 16) - so its
5958              * between 0.25 and 2x the string size.
5959              *
5960              * To avoid allocating a poscache buffer every time, we do an
5961              * initially countdown; only after we have  executed a WHILEM
5962              * op (string-length x #WHILEMs) times do we allocate the
5963              * cache.
5964              *
5965              * The top 4 bits of scan->flags byte say how many different
5966              * relevant CURLLYX/WHILEM op pairs there are, while the
5967              * bottom 4-bits is the identifying index number of this
5968              * WHILEM.
5969              */
5970
5971             if (scan->flags) {
5972
5973                 if (!reginfo->poscache_maxiter) {
5974                     /* start the countdown: Postpone detection until we
5975                      * know the match is not *that* much linear. */
5976                     reginfo->poscache_maxiter
5977                         =    (reginfo->strend - reginfo->strbeg + 1)
5978                            * (scan->flags>>4);
5979                     /* possible overflow for long strings and many CURLYX's */
5980                     if (reginfo->poscache_maxiter < 0)
5981                         reginfo->poscache_maxiter = I32_MAX;
5982                     reginfo->poscache_iter = reginfo->poscache_maxiter;
5983                 }
5984
5985                 if (reginfo->poscache_iter-- == 0) {
5986                     /* initialise cache */
5987                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5988                     regmatch_info_aux *const aux = reginfo->info_aux;
5989                     if (aux->poscache) {
5990                         if ((SSize_t)reginfo->poscache_size < size) {
5991                             Renew(aux->poscache, size, char);
5992                             reginfo->poscache_size = size;
5993                         }
5994                         Zero(aux->poscache, size, char);
5995                     }
5996                     else {
5997                         reginfo->poscache_size = size;
5998                         Newxz(aux->poscache, size, char);
5999                     }
6000                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6001       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
6002                               PL_colors[4], PL_colors[5])
6003                     );
6004                 }
6005
6006                 if (reginfo->poscache_iter < 0) {
6007                     /* have we already failed at this position? */
6008                     SSize_t offset, mask;
6009
6010                     reginfo->poscache_iter = -1; /* stop eventual underflow */
6011                     offset  = (scan->flags & 0xf) - 1
6012                                 +   (locinput - reginfo->strbeg)
6013                                   * (scan->flags>>4);
6014                     mask    = 1 << (offset % 8);
6015                     offset /= 8;
6016                     if (reginfo->info_aux->poscache[offset] & mask) {
6017                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6018                             "%*s  whilem: (cache) already tried at this position...\n",
6019                             REPORT_CODE_OFF+depth*2, "")
6020                         );
6021                         sayNO; /* cache records failure */
6022                     }
6023                     ST.cache_offset = offset;
6024                     ST.cache_mask   = mask;
6025                 }
6026             }
6027
6028             /* Prefer B over A for minimal matching. */
6029
6030             if (cur_curlyx->u.curlyx.minmod) {
6031                 ST.save_curlyx = cur_curlyx;
6032                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6033                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
6034                             maxopenparen);
6035                 REGCP_SET(ST.lastcp);
6036                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
6037                                     locinput);
6038                 /* NOTREACHED */
6039                 NOT_REACHED;
6040             }
6041
6042             /* Prefer A over B for maximal matching. */
6043
6044             if (n < max) { /* More greed allowed? */
6045                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6046                             maxopenparen);
6047                 cur_curlyx->u.curlyx.lastloc = locinput;
6048                 REGCP_SET(ST.lastcp);
6049                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
6050                 /* NOTREACHED */
6051                 NOT_REACHED;
6052             }
6053             goto do_whilem_B_max;
6054         }
6055         /* NOTREACHED */
6056         NOT_REACHED;
6057
6058         case WHILEM_B_min: /* just matched B in a minimal match */
6059         case WHILEM_B_max: /* just matched B in a maximal match */
6060             cur_curlyx = ST.save_curlyx;
6061             sayYES;
6062             /* NOTREACHED */
6063             NOT_REACHED;
6064
6065         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
6066             cur_curlyx = ST.save_curlyx;
6067             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6068             cur_curlyx->u.curlyx.count--;
6069             CACHEsayNO;
6070             /* NOTREACHED */
6071             NOT_REACHED;
6072
6073         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
6074             /* FALLTHROUGH */
6075         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
6076             REGCP_UNWIND(ST.lastcp);
6077             regcppop(rex, &maxopenparen);
6078             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6079             cur_curlyx->u.curlyx.count--;
6080             CACHEsayNO;
6081             /* NOTREACHED */
6082             NOT_REACHED;
6083
6084         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
6085             REGCP_UNWIND(ST.lastcp);
6086             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
6087             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6088                 "%*s  whilem: failed, trying continuation...\n",
6089                 REPORT_CODE_OFF+depth*2, "")
6090             );
6091           do_whilem_B_max:
6092             if (cur_curlyx->u.curlyx.count >= REG_INFTY
6093                 && ckWARN(WARN_REGEXP)
6094                 && !reginfo->warned)
6095             {
6096                 reginfo->warned = TRUE;
6097                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6098                      "Complex regular subexpression recursion limit (%d) "
6099                      "exceeded",
6100                      REG_INFTY - 1);
6101             }
6102
6103             /* now try B */
6104             ST.save_curlyx = cur_curlyx;
6105             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6106             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6107                                 locinput);
6108             /* NOTREACHED */
6109             NOT_REACHED;
6110
6111         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6112             cur_curlyx = ST.save_curlyx;
6113             REGCP_UNWIND(ST.lastcp);
6114             regcppop(rex, &maxopenparen);
6115
6116             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6117                 /* Maximum greed exceeded */
6118                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6119                     && ckWARN(WARN_REGEXP)
6120                     && !reginfo->warned)
6121                 {
6122                     reginfo->warned     = TRUE;
6123                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6124                         "Complex regular subexpression recursion "
6125                         "limit (%d) exceeded",
6126                         REG_INFTY - 1);
6127                 }
6128                 cur_curlyx->u.curlyx.count--;
6129                 CACHEsayNO;
6130             }
6131
6132             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6133                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6134             );
6135             /* Try grabbing another A and see if it helps. */
6136             cur_curlyx->u.curlyx.lastloc = locinput;
6137             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6138                             maxopenparen);
6139             REGCP_SET(ST.lastcp);
6140             PUSH_STATE_GOTO(WHILEM_A_min,
6141                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6142                 locinput);
6143             /* NOTREACHED */
6144             NOT_REACHED;
6145
6146 #undef  ST
6147 #define ST st->u.branch
6148
6149         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
6150             next = scan + ARG(scan);
6151             if (next == scan)
6152                 next = NULL;
6153             scan = NEXTOPER(scan);
6154             /* FALLTHROUGH */
6155
6156         case BRANCH:        /*  /(...|A|...)/ */
6157             scan = NEXTOPER(scan); /* scan now points to inner node */
6158             ST.lastparen = rex->lastparen;
6159             ST.lastcloseparen = rex->lastcloseparen;
6160             ST.next_branch = next;
6161             REGCP_SET(ST.cp);
6162
6163             /* Now go into the branch */
6164             if (has_cutgroup) {
6165                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6166             } else {
6167                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6168             }
6169             /* NOTREACHED */
6170             NOT_REACHED;
6171
6172         case CUTGROUP:  /*  /(*THEN)/  */
6173             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6174                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6175             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6176             /* NOTREACHED */
6177             NOT_REACHED;
6178
6179         case CUTGROUP_next_fail:
6180             do_cutgroup = 1;
6181             no_final = 1;
6182             if (st->u.mark.mark_name)
6183                 sv_commit = st->u.mark.mark_name;
6184             sayNO;
6185             /* NOTREACHED */
6186             NOT_REACHED;
6187
6188         case BRANCH_next:
6189             sayYES;
6190             /* NOTREACHED */
6191             NOT_REACHED;
6192
6193         case BRANCH_next_fail: /* that branch failed; try the next, if any */
6194             if (do_cutgroup) {
6195                 do_cutgroup = 0;
6196                 no_final = 0;
6197             }
6198             REGCP_UNWIND(ST.cp);
6199             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6200             scan = ST.next_branch;
6201             /* no more branches? */
6202             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
6203                 DEBUG_EXECUTE_r({
6204                     PerlIO_printf( Perl_debug_log,
6205                         "%*s  %sBRANCH failed...%s\n",
6206                         REPORT_CODE_OFF+depth*2, "",
6207                         PL_colors[4],
6208                         PL_colors[5] );
6209                 });
6210                 sayNO_SILENT;
6211             }
6212             continue; /* execute next BRANCH[J] op */
6213             /* NOTREACHED */
6214
6215         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
6216             minmod = 1;
6217             break;
6218
6219 #undef  ST
6220 #define ST st->u.curlym
6221
6222         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
6223
6224             /* This is an optimisation of CURLYX that enables us to push
6225              * only a single backtracking state, no matter how many matches
6226              * there are in {m,n}. It relies on the pattern being constant
6227              * length, with no parens to influence future backrefs
6228              */
6229
6230             ST.me = scan;
6231             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6232
6233             ST.lastparen      = rex->lastparen;
6234             ST.lastcloseparen = rex->lastcloseparen;
6235
6236             /* if paren positive, emulate an OPEN/CLOSE around A */
6237             if (ST.me->flags) {
6238                 U32 paren = ST.me->flags;
6239                 if (paren > maxopenparen)
6240                     maxopenparen = paren;
6241                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6242             }
6243             ST.A = scan;
6244             ST.B = next;
6245             ST.alen = 0;
6246             ST.count = 0;
6247             ST.minmod = minmod;
6248             minmod = 0;
6249             ST.c1 = CHRTEST_UNINIT;
6250             REGCP_SET(ST.cp);
6251
6252             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6253                 goto curlym_do_B;
6254
6255           curlym_do_A: /* execute the A in /A{m,n}B/  */
6256             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
6257             /* NOTREACHED */
6258             NOT_REACHED;
6259
6260         case CURLYM_A: /* we've just matched an A */
6261             ST.count++;
6262             /* after first match, determine A's length: u.curlym.alen */
6263             if (ST.count == 1) {
6264                 if (reginfo->is_utf8_target) {
6265                     char *s = st->locinput;
6266                     while (s < locinput) {
6267                         ST.alen++;
6268                         s += UTF8SKIP(s);
6269                     }
6270                 }
6271                 else {
6272                     ST.alen = locinput - st->locinput;
6273                 }
6274                 if (ST.alen == 0)
6275                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6276             }
6277             DEBUG_EXECUTE_r(
6278                 PerlIO_printf(Perl_debug_log,
6279                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
6280                           (int)(REPORT_CODE_OFF+(depth*2)), "",
6281                           (IV) ST.count, (IV)ST.alen)
6282             );
6283
6284             if (cur_eval && cur_eval->u.eval.close_paren &&
6285                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6286                 goto fake_end;
6287
6288             {
6289                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6290                 if ( max == REG_INFTY || ST.count < max )
6291                     goto curlym_do_A; /* try to match another A */
6292             }
6293             goto curlym_do_B; /* try to match B */
6294
6295         case CURLYM_A_fail: /* just failed to match an A */
6296             REGCP_UNWIND(ST.cp);
6297
6298             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6299                 || (cur_eval && cur_eval->u.eval.close_paren &&
6300                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
6301                 sayNO;
6302
6303           curlym_do_B: /* execute the B in /A{m,n}B/  */
6304             if (ST.c1 == CHRTEST_UNINIT) {
6305                 /* calculate c1 and c2 for possible match of 1st char
6306                  * following curly */
6307                 ST.c1 = ST.c2 = CHRTEST_VOID;
6308                 assert(ST.B);
6309                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6310                     regnode *text_node = ST.B;
6311                     if (! HAS_TEXT(text_node))
6312                         FIND_NEXT_IMPT(text_node);
6313                     /* this used to be
6314
6315                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6316
6317                         But the former is redundant in light of the latter.
6318
6319                         if this changes back then the macro for
6320                         IS_TEXT and friends need to change.
6321                      */
6322                     if (PL_regkind[OP(text_node)] == EXACT) {
6323                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6324                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6325                            reginfo))
6326                         {
6327                             sayNO;
6328                         }
6329                     }
6330                 }
6331             }
6332
6333             DEBUG_EXECUTE_r(
6334                 PerlIO_printf(Perl_debug_log,
6335                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
6336                     (int)(REPORT_CODE_OFF+(depth*2)),
6337                     "", (IV)ST.count)
6338                 );
6339             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
6340                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6341                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6342                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6343                     {
6344                         /* simulate B failing */
6345                         DEBUG_OPTIMISE_r(
6346                             PerlIO_printf(Perl_debug_log,
6347                                 "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
6348                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
6349                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
6350                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6351                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6352                         );
6353                         state_num = CURLYM_B_fail;
6354                         goto reenter_switch;
6355                     }
6356                 }
6357                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
6358                     /* simulate B failing */
6359                     DEBUG_OPTIMISE_r(
6360                         PerlIO_printf(Perl_debug_log,
6361                             "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
6362                             (int)(REPORT_CODE_OFF+(depth*2)),"",
6363                             (int) nextchr, ST.c1, ST.c2)
6364                     );
6365                     state_num = CURLYM_B_fail;
6366                     goto reenter_switch;
6367                 }
6368             }
6369
6370             if (ST.me->flags) {
6371                 /* emulate CLOSE: mark current A as captured */
6372                 I32 paren = ST.me->flags;
6373                 if (ST.count) {
6374                     rex->offs[paren].start
6375                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6376                     rex->offs[paren].end = locinput - reginfo->strbeg;
6377                     if ((U32)paren > rex->lastparen)
6378                         rex->lastparen = paren;
6379                     rex->lastcloseparen = paren;
6380                 }
6381                 else
6382                     rex->offs[paren].end = -1;
6383                 if (cur_eval && cur_eval->u.eval.close_paren &&
6384                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6385                 {
6386                     if (ST.count)
6387                         goto fake_end;
6388                     else
6389                         sayNO;
6390                 }
6391             }
6392
6393             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
6394             /* NOTREACHED */
6395             NOT_REACHED;
6396
6397         case CURLYM_B_fail: /* just failed to match a B */
6398             REGCP_UNWIND(ST.cp);
6399             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6400             if (ST.minmod) {
6401                 I32 max = ARG2(ST.me);
6402                 if (max != REG_INFTY && ST.count == max)
6403                     sayNO;
6404                 goto curlym_do_A; /* try to match a further A */
6405             }
6406             /* backtrack one A */
6407             if (ST.count == ARG1(ST.me) /* min */)
6408                 sayNO;
6409             ST.count--;
6410             SET_locinput(HOPc(locinput, -ST.alen));
6411             goto curlym_do_B; /* try to match B */
6412
6413 #undef ST
6414 #define ST st->u.curly
6415
6416 #define CURLY_SETPAREN(paren, success) \
6417     if (paren) { \
6418         if (success) { \
6419             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6420             rex->offs[paren].end = locinput - reginfo->strbeg; \
6421             if (paren > rex->lastparen) \
6422                 rex->lastparen = paren; \
6423             rex->lastcloseparen = paren; \
6424         } \
6425         else { \
6426             rex->offs[paren].end = -1; \
6427             rex->lastparen      = ST.lastparen; \
6428             rex->lastcloseparen = ST.lastcloseparen; \
6429         } \
6430     }
6431
6432         case STAR:              /*  /A*B/ where A is width 1 char */
6433             ST.paren = 0;
6434             ST.min = 0;
6435             ST.max = REG_INFTY;
6436             scan = NEXTOPER(scan);
6437             goto repeat;
6438
6439         case PLUS:              /*  /A+B/ where A is width 1 char */
6440             ST.paren = 0;
6441             ST.min = 1;
6442             ST.max = REG_INFTY;
6443             scan = NEXTOPER(scan);
6444             goto repeat;
6445
6446         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
6447             ST.paren = scan->flags;     /* Which paren to set */
6448             ST.lastparen      = rex->lastparen;
6449             ST.lastcloseparen = rex->lastcloseparen;
6450             if (ST.paren > maxopenparen)
6451                 maxopenparen = ST.paren;
6452             ST.min = ARG1(scan);  /* min to match */
6453             ST.max = ARG2(scan);  /* max to match */
6454             if (cur_eval && cur_eval->u.eval.close_paren &&
6455                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6456                 ST.min=1;
6457                 ST.max=1;
6458             }
6459             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6460             goto repeat;
6461
6462         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
6463             ST.paren = 0;
6464             ST.min = ARG1(scan);  /* min to match */
6465             ST.max = ARG2(scan);  /* max to match */
6466             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6467           repeat:
6468             /*
6469             * Lookahead to avoid useless match attempts
6470             * when we know what character comes next.
6471             *
6472             * Used to only do .*x and .*?x, but now it allows
6473             * for )'s, ('s and (?{ ... })'s to be in the way
6474             * of the quantifier and the EXACT-like node.  -- japhy
6475             */
6476
6477             assert(ST.min <= ST.max);
6478             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6479                 ST.c1 = ST.c2 = CHRTEST_VOID;
6480             }
6481             else {
6482                 regnode *text_node = next;
6483
6484                 if (! HAS_TEXT(text_node))
6485                     FIND_NEXT_IMPT(text_node);
6486
6487                 if (! HAS_TEXT(text_node))
6488                     ST.c1 = ST.c2 = CHRTEST_VOID;
6489                 else {
6490                     if ( PL_regkind[OP(text_node)] != EXACT ) {
6491                         ST.c1 = ST.c2 = CHRTEST_VOID;
6492                     }
6493                     else {
6494
6495                     /*  Currently we only get here when
6496
6497                         PL_rekind[OP(text_node)] == EXACT
6498
6499                         if this changes back then the macro for IS_TEXT and
6500                         friends need to change. */
6501                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6502                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6503                            reginfo))
6504                         {
6505                             sayNO;
6506                         }
6507                     }
6508                 }
6509             }
6510
6511             ST.A = scan;
6512             ST.B = next;
6513             if (minmod) {
6514                 char *li = locinput;
6515                 minmod = 0;
6516                 if (ST.min &&
6517                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6518                             < ST.min)
6519                     sayNO;
6520                 SET_locinput(li);
6521                 ST.count = ST.min;
6522                 REGCP_SET(ST.cp);
6523                 if (ST.c1 == CHRTEST_VOID)
6524                     goto curly_try_B_min;
6525
6526                 ST.oldloc = locinput;
6527
6528                 /* set ST.maxpos to the furthest point along the
6529                  * string that could possibly match */
6530                 if  (ST.max == REG_INFTY) {
6531                     ST.maxpos = reginfo->strend - 1;
6532                     if (utf8_target)
6533                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6534                             ST.maxpos--;
6535                 }
6536                 else if (utf8_target) {
6537                     int m = ST.max - ST.min;
6538                     for (ST.maxpos = locinput;
6539                          m >0 && ST.maxpos < reginfo->strend; m--)
6540                         ST.maxpos += UTF8SKIP(ST.maxpos);
6541                 }
6542                 else {
6543                     ST.maxpos = locinput + ST.max - ST.min;
6544                     if (ST.maxpos >= reginfo->strend)
6545                         ST.maxpos = reginfo->strend - 1;
6546                 }
6547                 goto curly_try_B_min_known;
6548
6549             }
6550             else {
6551                 /* avoid taking address of locinput, so it can remain
6552                  * a register var */
6553                 char *li = locinput;
6554                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6555                 if (ST.count < ST.min)
6556                     sayNO;
6557                 SET_locinput(li);
6558                 if ((ST.count > ST.min)
6559                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6560                 {
6561                     /* A{m,n} must come at the end of the string, there's
6562                      * no point in backing off ... */
6563                     ST.min = ST.count;
6564                     /* ...except that $ and \Z can match before *and* after
6565                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6566                        We may back off by one in this case. */
6567                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6568                         ST.min--;
6569                 }
6570                 REGCP_SET(ST.cp);
6571                 goto curly_try_B_max;
6572             }
6573             /* NOTREACHED */
6574             NOT_REACHED;
6575
6576         case CURLY_B_min_known_fail:
6577             /* failed to find B in a non-greedy match where c1,c2 valid */
6578
6579             REGCP_UNWIND(ST.cp);
6580             if (ST.paren) {
6581                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6582             }
6583             /* Couldn't or didn't -- move forward. */
6584             ST.oldloc = locinput;
6585             if (utf8_target)
6586                 locinput += UTF8SKIP(locinput);
6587             else
6588                 locinput++;
6589             ST.count++;
6590           curly_try_B_min_known:
6591              /* find the next place where 'B' could work, then call B */
6592             {
6593                 int n;
6594                 if (utf8_target) {
6595                     n = (ST.oldloc == locinput) ? 0 : 1;
6596                     if (ST.c1 == ST.c2) {
6597                         /* set n to utf8_distance(oldloc, locinput) */
6598                         while (locinput <= ST.maxpos
6599                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6600                         {
6601                             locinput += UTF8SKIP(locinput);
6602                             n++;
6603                         }
6604                     }
6605                     else {
6606                         /* set n to utf8_distance(oldloc, locinput) */
6607                         while (locinput <= ST.maxpos
6608                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6609                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6610                         {
6611                             locinput += UTF8SKIP(locinput);
6612                             n++;
6613                         }
6614                     }
6615                 }
6616                 else {  /* Not utf8_target */
6617                     if (ST.c1 == ST.c2) {
6618                         while (locinput <= ST.maxpos &&
6619                                UCHARAT(locinput) != ST.c1)
6620                             locinput++;
6621                     }
6622                     else {
6623                         while (locinput <= ST.maxpos
6624                                && UCHARAT(locinput) != ST.c1
6625                                && UCHARAT(locinput) != ST.c2)
6626                             locinput++;
6627                     }
6628                     n = locinput - ST.oldloc;
6629                 }
6630                 if (locinput > ST.maxpos)
6631                     sayNO;
6632                 if (n) {
6633                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6634                      * at b; check that everything between oldloc and
6635                      * locinput matches */
6636                     char *li = ST.oldloc;
6637                     ST.count += n;
6638                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6639                         sayNO;
6640                     assert(n == REG_INFTY || locinput == li);
6641                 }
6642                 CURLY_SETPAREN(ST.paren, ST.count);
6643                 if (cur_eval && cur_eval->u.eval.close_paren &&
6644                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6645                     goto fake_end;
6646                 }
6647                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6648             }
6649             /* NOTREACHED */
6650             NOT_REACHED;
6651
6652         case CURLY_B_min_fail:
6653             /* failed to find B in a non-greedy match where c1,c2 invalid */
6654
6655             REGCP_UNWIND(ST.cp);
6656             if (ST.paren) {
6657                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6658             }
6659             /* failed -- move forward one */
6660             {
6661                 char *li = locinput;
6662                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6663                     sayNO;
6664                 }
6665                 locinput = li;
6666             }
6667             {
6668                 ST.count++;
6669                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6670                         ST.count > 0)) /* count overflow ? */
6671                 {
6672                   curly_try_B_min:
6673                     CURLY_SETPAREN(ST.paren, ST.count);
6674                     if (cur_eval && cur_eval->u.eval.close_paren &&
6675                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6676                         goto fake_end;
6677                     }
6678                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6679                 }
6680             }
6681             sayNO;
6682             /* NOTREACHED */
6683             NOT_REACHED;
6684
6685           curly_try_B_max:
6686             /* a successful greedy match: now try to match B */
6687             if (cur_eval && cur_eval->u.eval.close_paren &&
6688                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6689                 goto fake_end;
6690             }
6691             {
6692                 bool could_match = locinput < reginfo->strend;
6693
6694                 /* If it could work, try it. */
6695                 if (ST.c1 != CHRTEST_VOID && could_match) {
6696                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6697                     {
6698                         could_match = memEQ(locinput,
6699                                             ST.c1_utf8,
6700                                             UTF8SKIP(locinput))
6701                                     || memEQ(locinput,
6702                                              ST.c2_utf8,
6703                                              UTF8SKIP(locinput));
6704                     }
6705                     else {
6706                         could_match = UCHARAT(locinput) == ST.c1
6707                                       || UCHARAT(locinput) == ST.c2;
6708                     }
6709                 }
6710                 if (ST.c1 == CHRTEST_VOID || could_match) {
6711                     CURLY_SETPAREN(ST.paren, ST.count);
6712                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6713                     /* NOTREACHED */
6714                     NOT_REACHED;
6715                 }
6716             }
6717             /* FALLTHROUGH */
6718
6719         case CURLY_B_max_fail:
6720             /* failed to find B in a greedy match */
6721
6722             REGCP_UNWIND(ST.cp);
6723             if (ST.paren) {
6724                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6725             }
6726             /*  back up. */
6727             if (--ST.count < ST.min)
6728                 sayNO;
6729             locinput = HOPc(locinput, -1);
6730             goto curly_try_B_max;
6731
6732 #undef ST
6733
6734         case END: /*  last op of main pattern  */
6735           fake_end:
6736             if (cur_eval) {
6737                 /* we've just finished A in /(??{A})B/; now continue with B */
6738
6739                 st->u.eval.prev_rex = rex_sv;           /* inner */
6740
6741                 /* Save *all* the positions. */
6742                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6743                 rex_sv = cur_eval->u.eval.prev_rex;
6744                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6745                 SET_reg_curpm(rex_sv);
6746                 rex = ReANY(rex_sv);
6747                 rexi = RXi_GET(rex);
6748                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6749
6750                 REGCP_SET(st->u.eval.lastcp);
6751
6752                 /* Restore parens of the outer rex without popping the
6753                  * savestack */
6754                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6755                                         &maxopenparen);
6756
6757                 st->u.eval.prev_eval = cur_eval;
6758                 cur_eval = cur_eval->u.eval.prev_eval;
6759                 DEBUG_EXECUTE_r(
6760                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6761                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6762                 if ( nochange_depth )
6763                     nochange_depth--;
6764
6765                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6766                                     locinput); /* match B */
6767             }
6768
6769             if (locinput < reginfo->till) {
6770                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6771                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6772                                       PL_colors[4],
6773                                       (long)(locinput - startpos),
6774                                       (long)(reginfo->till - startpos),
6775                                       PL_colors[5]));
6776
6777                 sayNO_SILENT;           /* Cannot match: too short. */
6778             }
6779             sayYES;                     /* Success! */
6780
6781         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6782             DEBUG_EXECUTE_r(
6783             PerlIO_printf(Perl_debug_log,
6784                 "%*s  %ssubpattern success...%s\n",
6785                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6786             sayYES;                     /* Success! */
6787
6788 #undef  ST
6789 #define ST st->u.ifmatch
6790
6791         {
6792             char *newstart;
6793
6794         case SUSPEND:   /* (?>A) */
6795             ST.wanted = 1;
6796             newstart = locinput;
6797             goto do_ifmatch;
6798
6799         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6800             ST.wanted = 0;
6801             goto ifmatch_trivial_fail_test;
6802
6803         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6804             ST.wanted = 1;
6805           ifmatch_trivial_fail_test:
6806             if (scan->flags) {
6807                 char * const s = HOPBACKc(locinput, scan->flags);
6808                 if (!s) {
6809                     /* trivial fail */
6810                     if (logical) {
6811                         logical = 0;
6812                         sw = 1 - cBOOL(ST.wanted);
6813                     }
6814                     else if (ST.wanted)
6815                         sayNO;
6816                     next = scan + ARG(scan);
6817                     if (next == scan)
6818                         next = NULL;
6819                     break;
6820                 }
6821                 newstart = s;
6822             }
6823             else
6824                 newstart = locinput;
6825
6826           do_ifmatch:
6827             ST.me = scan;
6828             ST.logical = logical;
6829             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6830
6831             /* execute body of (?...A) */
6832             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6833             /* NOTREACHED */
6834             NOT_REACHED;
6835         }
6836
6837         case IFMATCH_A_fail: /* body of (?...A) failed */
6838             ST.wanted = !ST.wanted;
6839             /* FALLTHROUGH */
6840
6841         case IFMATCH_A: /* body of (?...A) succeeded */
6842             if (ST.logical) {
6843                 sw = cBOOL(ST.wanted);
6844             }
6845             else if (!ST.wanted)
6846                 sayNO;
6847
6848             if (OP(ST.me) != SUSPEND) {
6849                 /* restore old position except for (?>...) */
6850                 locinput = st->locinput;
6851             }
6852             scan = ST.me + ARG(ST.me);
6853             if (scan == ST.me)
6854                 scan = NULL;
6855             continue; /* execute B */
6856
6857 #undef ST
6858
6859         case LONGJMP: /*  alternative with many branches compiles to
6860                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6861             next = scan + ARG(scan);
6862             if (next == scan)
6863                 next = NULL;
6864             break;
6865
6866         case COMMIT:  /*  (*COMMIT)  */
6867             reginfo->cutpoint = reginfo->strend;
6868             /* FALLTHROUGH */
6869
6870         case PRUNE:   /*  (*PRUNE)   */
6871             if (!scan->flags)
6872                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6873             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6874             /* NOTREACHED */
6875             NOT_REACHED;
6876
6877         case COMMIT_next_fail:
6878             no_final = 1;
6879             /* FALLTHROUGH */
6880
6881         case OPFAIL:   /* (*FAIL)  */
6882             sayNO;
6883             /* NOTREACHED */
6884             NOT_REACHED;
6885
6886 #define ST st->u.mark
6887         case MARKPOINT: /*  (*MARK:foo)  */
6888             ST.prev_mark = mark_state;
6889             ST.mark_name = sv_commit = sv_yes_mark
6890                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6891             mark_state = st;
6892             ST.mark_loc = locinput;
6893             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6894             /* NOTREACHED */
6895             NOT_REACHED;
6896
6897         case MARKPOINT_next:
6898             mark_state = ST.prev_mark;
6899             sayYES;
6900             /* NOTREACHED */
6901             NOT_REACHED;
6902
6903         case MARKPOINT_next_fail:
6904             if (popmark && sv_eq(ST.mark_name,popmark))
6905             {
6906                 if (ST.mark_loc > startpoint)
6907                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6908                 popmark = NULL; /* we found our mark */
6909                 sv_commit = ST.mark_name;
6910
6911                 DEBUG_EXECUTE_r({
6912                         PerlIO_printf(Perl_debug_log,
6913                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6914                             REPORT_CODE_OFF+depth*2, "",
6915                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6916                 });
6917             }
6918             mark_state = ST.prev_mark;
6919             sv_yes_mark = mark_state ?
6920                 mark_state->u.mark.mark_name : NULL;
6921             sayNO;
6922             /* NOTREACHED */
6923             NOT_REACHED;
6924
6925         case SKIP:  /*  (*SKIP)  */
6926             if (scan->flags) {
6927                 /* (*SKIP) : if we fail we cut here*/
6928                 ST.mark_name = NULL;
6929                 ST.mark_loc = locinput;
6930                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6931             } else {
6932                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6933                    otherwise do nothing.  Meaning we need to scan
6934                  */
6935                 regmatch_state *cur = mark_state;
6936                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6937
6938                 while (cur) {
6939                     if ( sv_eq( cur->u.mark.mark_name,
6940                                 find ) )
6941                     {
6942                         ST.mark_name = find;
6943                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6944                     }
6945                     cur = cur->u.mark.prev_mark;
6946                 }
6947             }
6948             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6949             break;
6950
6951         case SKIP_next_fail:
6952             if (ST.mark_name) {
6953                 /* (*CUT:NAME) - Set up to search for the name as we
6954                    collapse the stack*/
6955                 popmark = ST.mark_name;
6956             } else {
6957                 /* (*CUT) - No name, we cut here.*/
6958                 if (ST.mark_loc > startpoint)
6959                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6960                 /* but we set sv_commit to latest mark_name if there
6961                    is one so they can test to see how things lead to this
6962                    cut */
6963                 if (mark_state)
6964                     sv_commit=mark_state->u.mark.mark_name;
6965             }
6966             no_final = 1;
6967             sayNO;
6968             /* NOTREACHED */
6969             NOT_REACHED;
6970 #undef ST
6971
6972         case LNBREAK: /* \R */
6973             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6974                 locinput += n;
6975             } else
6976                 sayNO;
6977             break;
6978
6979         default:
6980             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6981                           PTR2UV(scan), OP(scan));
6982             Perl_croak(aTHX_ "regexp memory corruption");
6983
6984         /* this is a point to jump to in order to increment
6985          * locinput by one character */
6986           increment_locinput:
6987             assert(!NEXTCHR_IS_EOS);
6988             if (utf8_target) {
6989                 locinput += PL_utf8skip[nextchr];
6990                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6991                 if (locinput > reginfo->strend)
6992                     sayNO;
6993             }
6994             else
6995                 locinput++;
6996             break;
6997
6998         } /* end switch */
6999
7000         /* switch break jumps here */
7001         scan = next; /* prepare to execute the next op and ... */
7002         continue;    /* ... jump back to the top, reusing st */
7003         /* NOTREACHED */
7004
7005       push_yes_state:
7006         /* push a state that backtracks on success */
7007         st->u.yes.prev_yes_state = yes_state;
7008         yes_state = st;
7009         /* FALLTHROUGH */
7010       push_state:
7011         /* push a new regex state, then continue at scan  */
7012         {
7013             regmatch_state *newst;
7014
7015             DEBUG_STACK_r({
7016                 regmatch_state *cur = st;
7017                 regmatch_state *curyes = yes_state;
7018                 int curd = depth;
7019                 regmatch_slab *slab = PL_regmatch_slab;
7020                 for (;curd > -1;cur--,curd--) {
7021                     if (cur < SLAB_FIRST(slab)) {
7022                         slab = slab->prev;
7023                         cur = SLAB_LAST(slab);
7024                     }
7025                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
7026                         REPORT_CODE_OFF + 2 + depth * 2,"",
7027                         curd, PL_reg_name[cur->resume_state],
7028                         (curyes == cur) ? "yes" : ""
7029                     );
7030                     if (curyes == cur)
7031                         curyes = cur->u.yes.prev_yes_state;
7032                 }
7033             } else
7034                 DEBUG_STATE_pp("push")
7035             );
7036             depth++;
7037             st->locinput = locinput;
7038             newst = st+1;
7039             if (newst >  SLAB_LAST(PL_regmatch_slab))
7040                 newst = S_push_slab(aTHX);
7041             PL_regmatch_state = newst;
7042
7043             locinput = pushinput;
7044             st = newst;
7045             continue;
7046             /* NOTREACHED */
7047         }
7048     }
7049
7050     /*
7051     * We get here only if there's trouble -- normally "case END" is
7052     * the terminating point.
7053     */
7054     Perl_croak(aTHX_ "corrupted regexp pointers");
7055     /* NOTREACHED */
7056     sayNO;
7057     NOT_REACHED;
7058
7059   yes:
7060     if (yes_state) {
7061         /* we have successfully completed a subexpression, but we must now
7062          * pop to the state marked by yes_state and continue from there */
7063         assert(st != yes_state);
7064 #ifdef DEBUGGING
7065         while (st != yes_state) {
7066             st--;
7067             if (st < SLAB_FIRST(PL_regmatch_slab)) {
7068                 PL_regmatch_slab = PL_regmatch_slab->prev;
7069                 st = SLAB_LAST(PL_regmatch_slab);
7070             }
7071             DEBUG_STATE_r({
7072                 if (no_final) {
7073                     DEBUG_STATE_pp("pop (no final)");
7074                 } else {
7075                     DEBUG_STATE_pp("pop (yes)");
7076                 }
7077             });
7078             depth--;
7079         }
7080 #else
7081         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
7082             || yes_state > SLAB_LAST(PL_regmatch_slab))
7083         {
7084             /* not in this slab, pop slab */
7085             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
7086             PL_regmatch_slab = PL_regmatch_slab->prev;
7087             st = SLAB_LAST(PL_regmatch_slab);
7088         }
7089         depth -= (st - yes_state);
7090 #endif
7091         st = yes_state;
7092         yes_state = st->u.yes.prev_yes_state;
7093         PL_regmatch_state = st;
7094
7095         if (no_final)
7096             locinput= st->locinput;
7097         state_num = st->resume_state + no_final;
7098         goto reenter_switch;
7099     }
7100
7101     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7102                           PL_colors[4], PL_colors[5]));
7103
7104     if (reginfo->info_aux_eval) {
7105         /* each successfully executed (?{...}) block does the equivalent of
7106          *   local $^R = do {...}
7107          * When popping the save stack, all these locals would be undone;
7108          * bypass this by setting the outermost saved $^R to the latest
7109          * value */
7110         /* I dont know if this is needed or works properly now.
7111          * see code related to PL_replgv elsewhere in this file.
7112          * Yves
7113          */
7114         if (oreplsv != GvSV(PL_replgv))
7115             sv_setsv(oreplsv, GvSV(PL_replgv));
7116     }
7117     result = 1;
7118     goto final_exit;
7119
7120   no:
7121     DEBUG_EXECUTE_r(
7122         PerlIO_printf(Perl_debug_log,
7123             "%*s  %sfailed...%s\n",
7124             REPORT_CODE_OFF+depth*2, "",
7125             PL_colors[4], PL_colors[5])
7126         );
7127
7128   no_silent:
7129     if (no_final) {
7130         if (yes_state) {
7131             goto yes;
7132         } else {
7133             goto final_exit;
7134         }
7135     }
7136     if (depth) {
7137         /* there's a previous state to backtrack to */
7138         st--;
7139         if (st < SLAB_FIRST(PL_regmatch_slab)) {
7140             PL_regmatch_slab = PL_regmatch_slab->prev;
7141             st = SLAB_LAST(PL_regmatch_slab);
7142         }
7143         PL_regmatch_state = st;
7144         locinput= st->locinput;
7145
7146         DEBUG_STATE_pp("pop");
7147         depth--;
7148         if (yes_state == st)
7149             yes_state = st->u.yes.prev_yes_state;
7150
7151         state_num = st->resume_state + 1; /* failure = success + 1 */
7152         goto reenter_switch;
7153     }
7154     result = 0;
7155
7156   final_exit:
7157     if (rex->intflags & PREGf_VERBARG_SEEN) {
7158         SV *sv_err = get_sv("REGERROR", 1);
7159         SV *sv_mrk = get_sv("REGMARK", 1);
7160         if (result) {
7161             sv_commit = &PL_sv_no;
7162             if (!sv_yes_mark)
7163                 sv_yes_mark = &PL_sv_yes;
7164         } else {
7165             if (!sv_commit)
7166                 sv_commit = &PL_sv_yes;
7167             sv_yes_mark = &PL_sv_no;
7168         }
7169         assert(sv_err);
7170         assert(sv_mrk);
7171         sv_setsv(sv_err, sv_commit);
7172         sv_setsv(sv_mrk, sv_yes_mark);
7173     }
7174
7175
7176     if (last_pushed_cv) {
7177         dSP;
7178         POP_MULTICALL;
7179         PERL_UNUSED_VAR(SP);
7180     }
7181
7182     assert(!result ||  locinput - reginfo->strbeg >= 0);
7183     return result ?  locinput - reginfo->strbeg : -1;
7184 }
7185
7186 /*
7187  - regrepeat - repeatedly match something simple, report how many
7188  *
7189  * What 'simple' means is a node which can be the operand of a quantifier like
7190  * '+', or {1,3}
7191  *
7192  * startposp - pointer a pointer to the start position.  This is updated
7193  *             to point to the byte following the highest successful
7194  *             match.
7195  * p         - the regnode to be repeatedly matched against.
7196  * reginfo   - struct holding match state, such as strend
7197  * max       - maximum number of things to match.
7198  * depth     - (for debugging) backtracking depth.
7199  */
7200 STATIC I32
7201 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
7202             regmatch_info *const reginfo, I32 max, int depth)
7203 {
7204     char *scan;     /* Pointer to current position in target string */
7205     I32 c;
7206     char *loceol = reginfo->strend;   /* local version */
7207     I32 hardcount = 0;  /* How many matches so far */
7208     bool utf8_target = reginfo->is_utf8_target;
7209     int to_complement = 0;  /* Invert the result? */
7210     UV utf8_flags;
7211     _char_class_number classnum;
7212 #ifndef DEBUGGING
7213     PERL_UNUSED_ARG(depth);
7214 #endif
7215
7216     PERL_ARGS_ASSERT_REGREPEAT;
7217
7218     scan = *startposp;
7219     if (max == REG_INFTY)
7220         max = I32_MAX;
7221     else if (! utf8_target && loceol - scan > max)
7222         loceol = scan + max;
7223
7224     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7225      * to the maximum of how far we should go in it (leaving it set to the real
7226      * end, if the maximum permissible would take us beyond that).  This allows
7227      * us to make the loop exit condition that we haven't gone past <loceol> to
7228      * also mean that we haven't exceeded the max permissible count, saving a
7229      * test each time through the loop.  But it assumes that the OP matches a
7230      * single byte, which is true for most of the OPs below when applied to a
7231      * non-UTF-8 target.  Those relatively few OPs that don't have this
7232      * characteristic will have to compensate.
7233      *
7234      * There is no adjustment for UTF-8 targets, as the number of bytes per
7235      * character varies.  OPs will have to test both that the count is less
7236      * than the max permissible (using <hardcount> to keep track), and that we
7237      * are still within the bounds of the string (using <loceol>.  A few OPs
7238      * match a single byte no matter what the encoding.  They can omit the max
7239      * test if, for the UTF-8 case, they do the adjustment that was skipped
7240      * above.
7241      *
7242      * Thus, the code above sets things up for the common case; and exceptional
7243      * cases need extra work; the common case is to make sure <scan> doesn't
7244      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7245      * count doesn't exceed the maximum permissible */
7246
7247     switch (OP(p)) {
7248     case REG_ANY:
7249         if (utf8_target) {
7250             while (scan < loceol && hardcount < max && *scan != '\n') {
7251                 scan += UTF8SKIP(scan);
7252                 hardcount++;
7253             }
7254         } else {
7255             while (scan < loceol && *scan != '\n')
7256                 scan++;
7257         }
7258         break;
7259     case SANY:
7260         if (utf8_target) {
7261             while (scan < loceol && hardcount < max) {
7262                 scan += UTF8SKIP(scan);
7263                 hardcount++;
7264             }
7265         }
7266         else
7267             scan = loceol;
7268         break;
7269     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
7270         if (utf8_target && loceol - scan > max) {
7271
7272             /* <loceol> hadn't been adjusted in the UTF-8 case */
7273             scan +=  max;
7274         }
7275         else {
7276             scan = loceol;
7277         }
7278         break;
7279     case EXACTL:
7280         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7281         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
7282             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
7283         }
7284         /* FALLTHROUGH */
7285     case EXACT:
7286         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7287
7288         c = (U8)*STRING(p);
7289
7290         /* Can use a simple loop if the pattern char to match on is invariant
7291          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
7292          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7293          * true iff it doesn't matter if the argument is in UTF-8 or not */
7294         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
7295             if (utf8_target && loceol - scan > max) {
7296                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7297                  * since here, to match at all, 1 char == 1 byte */
7298                 loceol = scan + max;
7299             }
7300             while (scan < loceol && UCHARAT(scan) == c) {
7301                 scan++;
7302             }
7303         }
7304         else if (reginfo->is_utf8_pat) {
7305             if (utf8_target) {
7306                 STRLEN scan_char_len;
7307
7308                 /* When both target and pattern are UTF-8, we have to do
7309                  * string EQ */
7310                 while (hardcount < max
7311                        && scan < loceol
7312                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
7313                        && memEQ(scan, STRING(p), scan_char_len))
7314                 {
7315                     scan += scan_char_len;
7316                     hardcount++;
7317                 }
7318             }
7319             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
7320
7321                 /* Target isn't utf8; convert the character in the UTF-8
7322                  * pattern to non-UTF8, and do a simple loop */
7323                 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
7324                 while (scan < loceol && UCHARAT(scan) == c) {
7325                     scan++;
7326                 }
7327             } /* else pattern char is above Latin1, can't possibly match the
7328                  non-UTF-8 target */
7329         }
7330         else {
7331
7332             /* Here, the string must be utf8; pattern isn't, and <c> is
7333              * different in utf8 than not, so can't compare them directly.
7334              * Outside the loop, find the two utf8 bytes that represent c, and
7335              * then look for those in sequence in the utf8 string */
7336             U8 high = UTF8_TWO_BYTE_HI(c);
7337             U8 low = UTF8_TWO_BYTE_LO(c);
7338
7339             while (hardcount < max
7340                     && scan + 1 < loceol
7341                     && UCHARAT(scan) == high
7342                     && UCHARAT(scan + 1) == low)
7343             {
7344                 scan += 2;
7345                 hardcount++;
7346             }
7347         }
7348         break;
7349
7350     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
7351         assert(! reginfo->is_utf8_pat);
7352         /* FALLTHROUGH */
7353     case EXACTFA:
7354         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7355         goto do_exactf;
7356
7357     case EXACTFL:
7358         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7359         utf8_flags = FOLDEQ_LOCALE;
7360         goto do_exactf;
7361
7362     case EXACTF:   /* This node only generated for non-utf8 patterns */
7363         assert(! reginfo->is_utf8_pat);
7364         utf8_flags = 0;
7365         goto do_exactf;
7366
7367     case EXACTFLU8:
7368         if (! utf8_target) {
7369             break;
7370         }
7371         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
7372                                     | FOLDEQ_S2_FOLDS_SANE;
7373         goto do_exactf;
7374
7375     case EXACTFU_SS:
7376     case EXACTFU:
7377         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
7378
7379       do_exactf: {
7380         int c1, c2;
7381         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
7382
7383         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7384
7385         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
7386                                         reginfo))
7387         {
7388             if (c1 == CHRTEST_VOID) {
7389                 /* Use full Unicode fold matching */
7390                 char *tmpeol = reginfo->strend;
7391                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
7392                 while (hardcount < max
7393                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7394                                              STRING(p), NULL, pat_len,
7395                                              reginfo->is_utf8_pat, utf8_flags))
7396                 {
7397                     scan = tmpeol;
7398                     tmpeol = reginfo->strend;
7399                     hardcount++;
7400                 }
7401             }
7402             else if (utf8_target) {
7403                 if (c1 == c2) {
7404                     while (scan < loceol
7405                            && hardcount < max
7406                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7407                     {
7408                         scan += UTF8SKIP(scan);
7409                         hardcount++;
7410                     }
7411                 }
7412                 else {
7413                     while (scan < loceol
7414                            && hardcount < max
7415                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7416                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7417                     {
7418                         scan += UTF8SKIP(scan);
7419                         hardcount++;
7420                     }
7421                 }
7422             }
7423             else if (c1 == c2) {
7424                 while (scan < loceol && UCHARAT(scan) == c1) {
7425                     scan++;
7426                 }
7427             }
7428             else {
7429                 while (scan < loceol &&
7430                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7431                 {
7432                     scan++;
7433                 }
7434             }
7435         }
7436         break;
7437     }
7438     case ANYOFL:
7439         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7440         /* FALLTHROUGH */
7441     case ANYOF:
7442         if (utf8_target) {
7443             while (hardcount < max
7444                    && scan < loceol
7445                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
7446             {
7447                 scan += UTF8SKIP(scan);
7448                 hardcount++;
7449             }
7450         } else {
7451             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7452                 scan++;
7453         }
7454         break;
7455
7456     /* The argument (FLAGS) to all the POSIX node types is the class number */
7457
7458     case NPOSIXL:
7459         to_complement = 1;
7460         /* FALLTHROUGH */
7461
7462     case POSIXL:
7463         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7464         if (! utf8_target) {
7465             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7466                                                                    *scan)))
7467             {
7468                 scan++;
7469             }
7470         } else {
7471             while (hardcount < max && scan < loceol
7472                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7473                                                                   (U8 *) scan)))
7474             {
7475                 scan += UTF8SKIP(scan);
7476                 hardcount++;
7477             }
7478         }
7479         break;
7480
7481     case POSIXD:
7482         if (utf8_target) {
7483             goto utf8_posix;
7484         }
7485         /* FALLTHROUGH */
7486
7487     case POSIXA:
7488         if (utf8_target && loceol - scan > max) {
7489
7490             /* We didn't adjust <loceol> at the beginning of this routine
7491              * because is UTF-8, but it is actually ok to do so, since here, to
7492              * match, 1 char == 1 byte. */
7493             loceol = scan + max;
7494         }
7495         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7496             scan++;
7497         }
7498         break;
7499
7500     case NPOSIXD:
7501         if (utf8_target) {
7502             to_complement = 1;
7503             goto utf8_posix;
7504         }
7505         /* FALLTHROUGH */
7506
7507     case NPOSIXA:
7508         if (! utf8_target) {
7509             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7510                 scan++;
7511             }
7512         }
7513         else {
7514
7515             /* The complement of something that matches only ASCII matches all
7516              * non-ASCII, plus everything in ASCII that isn't in the class. */
7517             while (hardcount < max && scan < loceol
7518                    && (! isASCII_utf8(scan)
7519                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7520             {
7521                 scan += UTF8SKIP(scan);
7522                 hardcount++;
7523             }
7524         }
7525         break;
7526
7527     case NPOSIXU:
7528         to_complement = 1;
7529         /* FALLTHROUGH */
7530
7531     case POSIXU:
7532         if (! utf8_target) {
7533             while (scan < loceol && to_complement
7534                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7535             {
7536                 scan++;
7537             }
7538         }
7539         else {
7540           utf8_posix:
7541             classnum = (_char_class_number) FLAGS(p);
7542             if (classnum < _FIRST_NON_SWASH_CC) {
7543
7544                 /* Here, a swash is needed for above-Latin1 code points.
7545                  * Process as many Latin1 code points using the built-in rules.
7546                  * Go to another loop to finish processing upon encountering
7547                  * the first Latin1 code point.  We could do that in this loop
7548                  * as well, but the other way saves having to test if the swash
7549                  * has been loaded every time through the loop: extra space to
7550                  * save a test. */
7551                 while (hardcount < max && scan < loceol) {
7552                     if (UTF8_IS_INVARIANT(*scan)) {
7553                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7554                                                                    classnum))))
7555                         {
7556                             break;
7557                         }
7558                         scan++;
7559                     }
7560                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7561                         if (! (to_complement
7562                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7563                                                                      *(scan + 1)),
7564                                                     classnum))))
7565                         {
7566                             break;
7567                         }
7568                         scan += 2;
7569                     }
7570                     else {
7571                         goto found_above_latin1;
7572                     }
7573
7574                     hardcount++;
7575                 }
7576             }
7577             else {
7578                 /* For these character classes, the knowledge of how to handle
7579                  * every code point is compiled in to Perl via a macro.  This
7580                  * code is written for making the loops as tight as possible.
7581                  * It could be refactored to save space instead */
7582                 switch (classnum) {
7583                     case _CC_ENUM_SPACE:    /* XXX would require separate code
7584                                                if we revert the change of \v
7585                                                matching this */
7586                         /* FALLTHROUGH */
7587                     case _CC_ENUM_PSXSPC:
7588                         while (hardcount < max
7589                                && scan < loceol
7590                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7591                         {
7592                             scan += UTF8SKIP(scan);
7593                             hardcount++;
7594                         }
7595                         break;
7596                     case _CC_ENUM_BLANK:
7597                         while (hardcount < max
7598                                && scan < loceol
7599                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7600                         {
7601                             scan += UTF8SKIP(scan);
7602                             hardcount++;
7603                         }
7604                         break;
7605                     case _CC_ENUM_XDIGIT:
7606                         while (hardcount < max
7607                                && scan < loceol
7608                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7609                         {
7610                             scan += UTF8SKIP(scan);
7611                             hardcount++;
7612                         }
7613                         break;
7614                     case _CC_ENUM_VERTSPACE:
7615                         while (hardcount < max
7616                                && scan < loceol
7617                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7618                         {
7619                             scan += UTF8SKIP(scan);
7620                             hardcount++;
7621                         }
7622                         break;
7623                     case _CC_ENUM_CNTRL:
7624                         while (hardcount < max
7625                                && scan < loceol
7626                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7627                         {
7628                             scan += UTF8SKIP(scan);
7629                             hardcount++;
7630                         }
7631                         break;
7632                     default:
7633                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7634                 }
7635             }
7636         }
7637         break;
7638
7639       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7640
7641         /* Load the swash if not already present */
7642         if (! PL_utf8_swash_ptrs[classnum]) {
7643             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7644             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7645                                         "utf8",
7646                                         "",
7647                                         &PL_sv_undef, 1, 0,
7648                                         PL_XPosix_ptrs[classnum], &flags);
7649         }
7650
7651         while (hardcount < max && scan < loceol
7652                && to_complement ^ cBOOL(_generic_utf8(
7653                                        classnum,
7654                                        scan,
7655                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
7656                                                    (U8 *) scan,
7657                                                    TRUE))))
7658         {
7659             scan += UTF8SKIP(scan);
7660             hardcount++;
7661         }
7662         break;
7663
7664     case LNBREAK:
7665         if (utf8_target) {
7666             while (hardcount < max && scan < loceol &&
7667                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7668                 scan += c;
7669                 hardcount++;
7670             }
7671         } else {
7672             /* LNBREAK can match one or two latin chars, which is ok, but we
7673              * have to use hardcount in this situation, and throw away the
7674              * adjustment to <loceol> done before the switch statement */
7675             loceol = reginfo->strend;
7676             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7677                 scan+=c;
7678                 hardcount++;
7679             }
7680         }
7681         break;
7682
7683     case BOUNDL:
7684     case NBOUNDL:
7685         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7686         /* FALLTHROUGH */
7687     case BOUND:
7688     case BOUNDA:
7689     case BOUNDU:
7690     case EOS:
7691     case GPOS:
7692     case KEEPS:
7693     case NBOUND:
7694     case NBOUNDA:
7695     case NBOUNDU:
7696     case OPFAIL:
7697     case SBOL:
7698     case SEOL:
7699         /* These are all 0 width, so match right here or not at all. */
7700         break;
7701
7702     default:
7703         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7704         /* NOTREACHED */
7705         NOT_REACHED;
7706
7707     }
7708
7709     if (hardcount)
7710         c = hardcount;
7711     else
7712         c = scan - *startposp;
7713     *startposp = scan;
7714
7715     DEBUG_r({
7716         GET_RE_DEBUG_FLAGS_DECL;
7717         DEBUG_EXECUTE_r({
7718             SV * const prop = sv_newmortal();
7719             regprop(prog, prop, p, reginfo, NULL);
7720             PerlIO_printf(Perl_debug_log,
7721                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7722                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7723         });
7724     });
7725
7726     return(c);
7727 }
7728
7729
7730 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7731 /*
7732 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7733 create a copy so that changes the caller makes won't change the shared one.
7734 If <altsvp> is non-null, will return NULL in it, for back-compat.
7735  */
7736 SV *
7737 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7738 {
7739     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7740
7741     if (altsvp) {
7742         *altsvp = NULL;
7743     }
7744
7745     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
7746 }
7747
7748 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
7749
7750 /*
7751  - reginclass - determine if a character falls into a character class
7752
7753   n is the ANYOF-type regnode
7754   p is the target string
7755   p_end points to one byte beyond the end of the target string
7756   utf8_target tells whether p is in UTF-8.
7757
7758   Returns true if matched; false otherwise.
7759
7760   Note that this can be a synthetic start class, a combination of various
7761   nodes, so things you think might be mutually exclusive, such as locale,
7762   aren't.  It can match both locale and non-locale
7763
7764  */
7765
7766 STATIC bool
7767 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
7768 {
7769     dVAR;
7770     const char flags = ANYOF_FLAGS(n);
7771     bool match = FALSE;
7772     UV c = *p;
7773
7774     PERL_ARGS_ASSERT_REGINCLASS;
7775
7776     /* If c is not already the code point, get it.  Note that
7777      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7778     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7779         STRLEN c_len = 0;
7780         c = utf8n_to_uvchr(p, p_end - p, &c_len,
7781                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7782                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7783                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7784                  * UTF8_ALLOW_FFFF */
7785         if (c_len == (STRLEN)-1)
7786             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7787         if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
7788             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
7789         }
7790     }
7791
7792     /* If this character is potentially in the bitmap, check it */
7793     if (c < NUM_ANYOF_CODE_POINTS) {
7794         if (ANYOF_BITMAP_TEST(n, c))
7795             match = TRUE;
7796         else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
7797                   && ! utf8_target
7798                   && ! isASCII(c))
7799         {
7800             match = TRUE;
7801         }
7802         else if (flags & ANYOF_LOCALE_FLAGS) {
7803             if ((flags & ANYOF_LOC_FOLD)
7804                 && c < 256
7805                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7806             {
7807                 match = TRUE;
7808             }
7809             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
7810                      && c < 256
7811             ) {
7812
7813                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7814                  * if the class includes the Posix character class given by
7815                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7816                  * complemented Posix class given by int(bit/2).  So we loop
7817                  * through the bits, each time changing whether we complement
7818                  * the result or not.  Suppose for the sake of illustration
7819                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7820                  * is set, it means there is a match for this ANYOF node if the
7821                  * character is in the class given by the expression (0 / 2 = 0
7822                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7823                  * and since 'to_complement' is 0, the result will stay TRUE,
7824                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7825                  * bit 1 is 1.  That means there is a match if the character
7826                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7827                  * but will on bit 1.  On the second iteration 'to_complement'
7828                  * will be 1, so the exclusive or will reverse things, so we
7829                  * are testing for \W.  On the third iteration, 'to_complement'
7830                  * will be 0, and we would be testing for \s; the fourth
7831                  * iteration would test for \S, etc.
7832                  *
7833                  * Note that this code assumes that all the classes are closed
7834                  * under folding.  For example, if a character matches \w, then
7835                  * its fold does too; and vice versa.  This should be true for
7836                  * any well-behaved locale for all the currently defined Posix
7837                  * classes, except for :lower: and :upper:, which are handled
7838                  * by the pseudo-class :cased: which matches if either of the
7839                  * other two does.  To get rid of this assumption, an outer
7840                  * loop could be used below to iterate over both the source
7841                  * character, and its fold (if different) */
7842
7843                 int count = 0;
7844                 int to_complement = 0;
7845
7846                 while (count < ANYOF_MAX) {
7847                     if (ANYOF_POSIXL_TEST(n, count)
7848                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7849                     {
7850                         match = TRUE;
7851                         break;
7852                     }
7853                     count++;
7854                     to_complement ^= 1;
7855                 }
7856             }
7857         }
7858     }
7859
7860
7861     /* If the bitmap didn't (or couldn't) match, and something outside the
7862      * bitmap could match, try that. */
7863     if (!match) {
7864         if (c >= NUM_ANYOF_CODE_POINTS
7865             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7866         {
7867             match = TRUE;       /* Everything above the bitmap matches */
7868         }
7869         else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
7870                   || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
7871                   || ((flags & ANYOF_LOC_FOLD)
7872                        && IN_UTF8_CTYPE_LOCALE
7873                        && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
7874         {
7875             SV* only_utf8_locale = NULL;
7876             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
7877                                                        &only_utf8_locale, NULL);
7878             if (sw) {
7879                 U8 utf8_buffer[2];
7880                 U8 * utf8_p;
7881                 if (utf8_target) {
7882                     utf8_p = (U8 *) p;
7883                 } else { /* Convert to utf8 */
7884                     utf8_p = utf8_buffer;
7885                     append_utf8_from_native_byte(*p, &utf8_p);
7886                     utf8_p = utf8_buffer;
7887                 }
7888
7889                 if (swash_fetch(sw, utf8_p, TRUE)) {
7890                     match = TRUE;
7891                 }
7892             }
7893             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7894                 match = _invlist_contains_cp(only_utf8_locale, c);
7895             }
7896         }
7897
7898         if (UNICODE_IS_SUPER(c)
7899             && (flags & ANYOF_WARN_SUPER)
7900             && ckWARN_d(WARN_NON_UNICODE))
7901         {
7902             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7903                 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
7904         }
7905     }
7906
7907 #if ANYOF_INVERT != 1
7908     /* Depending on compiler optimization cBOOL takes time, so if don't have to
7909      * use it, don't */
7910 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7911 #endif
7912
7913     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7914     return (flags & ANYOF_INVERT) ^ match;
7915 }
7916
7917 STATIC U8 *
7918 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7919 {
7920     /* return the position 'off' UTF-8 characters away from 's', forward if
7921      * 'off' >= 0, backwards if negative.  But don't go outside of position
7922      * 'lim', which better be < s  if off < 0 */
7923
7924     PERL_ARGS_ASSERT_REGHOP3;
7925
7926     if (off >= 0) {
7927         while (off-- && s < lim) {
7928             /* XXX could check well-formedness here */
7929             s += UTF8SKIP(s);
7930         }
7931     }
7932     else {
7933         while (off++ && s > lim) {
7934             s--;
7935             if (UTF8_IS_CONTINUED(*s)) {
7936                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7937                     s--;
7938             }
7939             /* XXX could check well-formedness here */
7940         }
7941     }
7942     return s;
7943 }
7944
7945 STATIC U8 *
7946 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7947 {
7948     PERL_ARGS_ASSERT_REGHOP4;
7949
7950     if (off >= 0) {
7951         while (off-- && s < rlim) {
7952             /* XXX could check well-formedness here */
7953             s += UTF8SKIP(s);
7954         }
7955     }
7956     else {
7957         while (off++ && s > llim) {
7958             s--;
7959             if (UTF8_IS_CONTINUED(*s)) {
7960                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7961                     s--;
7962             }
7963             /* XXX could check well-formedness here */
7964         }
7965     }
7966     return s;
7967 }
7968
7969 /* like reghop3, but returns NULL on overrun, rather than returning last
7970  * char pos */
7971
7972 STATIC U8 *
7973 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7974 {
7975     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7976
7977     if (off >= 0) {
7978         while (off-- && s < lim) {
7979             /* XXX could check well-formedness here */
7980             s += UTF8SKIP(s);
7981         }
7982         if (off >= 0)
7983             return NULL;
7984     }
7985     else {
7986         while (off++ && s > lim) {
7987             s--;
7988             if (UTF8_IS_CONTINUED(*s)) {
7989                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7990                     s--;
7991             }
7992             /* XXX could check well-formedness here */
7993         }
7994         if (off <= 0)
7995             return NULL;
7996     }
7997     return s;
7998 }
7999
8000
8001 /* when executing a regex that may have (?{}), extra stuff needs setting
8002    up that will be visible to the called code, even before the current
8003    match has finished. In particular:
8004
8005    * $_ is localised to the SV currently being matched;
8006    * pos($_) is created if necessary, ready to be updated on each call-out
8007      to code;
8008    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
8009      isn't set until the current pattern is successfully finished), so that
8010      $1 etc of the match-so-far can be seen;
8011    * save the old values of subbeg etc of the current regex, and  set then
8012      to the current string (again, this is normally only done at the end
8013      of execution)
8014 */
8015
8016 static void
8017 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
8018 {
8019     MAGIC *mg;
8020     regexp *const rex = ReANY(reginfo->prog);
8021     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8022
8023     eval_state->rex = rex;
8024
8025     if (reginfo->sv) {
8026         /* Make $_ available to executed code. */
8027         if (reginfo->sv != DEFSV) {
8028             SAVE_DEFSV;
8029             DEFSV_set(reginfo->sv);
8030         }
8031
8032         if (!(mg = mg_find_mglob(reginfo->sv))) {
8033             /* prepare for quick setting of pos */
8034             mg = sv_magicext_mglob(reginfo->sv);
8035             mg->mg_len = -1;
8036         }
8037         eval_state->pos_magic = mg;
8038         eval_state->pos       = mg->mg_len;
8039         eval_state->pos_flags = mg->mg_flags;
8040     }
8041     else
8042         eval_state->pos_magic = NULL;
8043
8044     if (!PL_reg_curpm) {
8045         /* PL_reg_curpm is a fake PMOP that we can attach the current
8046          * regex to and point PL_curpm at, so that $1 et al are visible
8047          * within a /(?{})/. It's just allocated once per interpreter the
8048          * first time its needed */
8049         Newxz(PL_reg_curpm, 1, PMOP);
8050 #ifdef USE_ITHREADS
8051         {
8052             SV* const repointer = &PL_sv_undef;
8053             /* this regexp is also owned by the new PL_reg_curpm, which
8054                will try to free it.  */
8055             av_push(PL_regex_padav, repointer);
8056             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
8057             PL_regex_pad = AvARRAY(PL_regex_padav);
8058         }
8059 #endif
8060     }
8061     SET_reg_curpm(reginfo->prog);
8062     eval_state->curpm = PL_curpm;
8063     PL_curpm = PL_reg_curpm;
8064     if (RXp_MATCH_COPIED(rex)) {
8065         /*  Here is a serious problem: we cannot rewrite subbeg,
8066             since it may be needed if this match fails.  Thus
8067             $` inside (?{}) could fail... */
8068         eval_state->subbeg     = rex->subbeg;
8069         eval_state->sublen     = rex->sublen;
8070         eval_state->suboffset  = rex->suboffset;
8071         eval_state->subcoffset = rex->subcoffset;
8072 #ifdef PERL_ANY_COW
8073         eval_state->saved_copy = rex->saved_copy;
8074 #endif
8075         RXp_MATCH_COPIED_off(rex);
8076     }
8077     else
8078         eval_state->subbeg = NULL;
8079     rex->subbeg = (char *)reginfo->strbeg;
8080     rex->suboffset = 0;
8081     rex->subcoffset = 0;
8082     rex->sublen = reginfo->strend - reginfo->strbeg;
8083 }
8084
8085
8086 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8087
8088 static void
8089 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8090 {
8091     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8092     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
8093     regmatch_slab *s;
8094
8095     Safefree(aux->poscache);
8096
8097     if (eval_state) {
8098
8099         /* undo the effects of S_setup_eval_state() */
8100
8101         if (eval_state->subbeg) {
8102             regexp * const rex = eval_state->rex;
8103             rex->subbeg     = eval_state->subbeg;
8104             rex->sublen     = eval_state->sublen;
8105             rex->suboffset  = eval_state->suboffset;
8106             rex->subcoffset = eval_state->subcoffset;
8107 #ifdef PERL_ANY_COW
8108             rex->saved_copy = eval_state->saved_copy;
8109 #endif
8110             RXp_MATCH_COPIED_on(rex);
8111         }
8112         if (eval_state->pos_magic)
8113         {
8114             eval_state->pos_magic->mg_len = eval_state->pos;
8115             eval_state->pos_magic->mg_flags =
8116                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8117                | (eval_state->pos_flags & MGf_BYTES);
8118         }
8119
8120         PL_curpm = eval_state->curpm;
8121     }
8122
8123     PL_regmatch_state = aux->old_regmatch_state;
8124     PL_regmatch_slab  = aux->old_regmatch_slab;
8125
8126     /* free all slabs above current one - this must be the last action
8127      * of this function, as aux and eval_state are allocated within
8128      * slabs and may be freed here */
8129
8130     s = PL_regmatch_slab->next;
8131     if (s) {
8132         PL_regmatch_slab->next = NULL;
8133         while (s) {
8134             regmatch_slab * const osl = s;
8135             s = s->next;
8136             Safefree(osl);
8137         }
8138     }
8139 }
8140
8141
8142 STATIC void
8143 S_to_utf8_substr(pTHX_ regexp *prog)
8144 {
8145     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8146      * on the converted value */
8147
8148     int i = 1;
8149
8150     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8151
8152     do {
8153         if (prog->substrs->data[i].substr
8154             && !prog->substrs->data[i].utf8_substr) {
8155             SV* const sv = newSVsv(prog->substrs->data[i].substr);
8156             prog->substrs->data[i].utf8_substr = sv;
8157             sv_utf8_upgrade(sv);
8158             if (SvVALID(prog->substrs->data[i].substr)) {
8159                 if (SvTAIL(prog->substrs->data[i].substr)) {
8160                     /* Trim the trailing \n that fbm_compile added last
8161                        time.  */
8162                     SvCUR_set(sv, SvCUR(sv) - 1);
8163                     /* Whilst this makes the SV technically "invalid" (as its
8164                        buffer is no longer followed by "\0") when fbm_compile()
8165                        adds the "\n" back, a "\0" is restored.  */
8166                     fbm_compile(sv, FBMcf_TAIL);
8167                 } else
8168                     fbm_compile(sv, 0);
8169             }
8170             if (prog->substrs->data[i].substr == prog->check_substr)
8171                 prog->check_utf8 = sv;
8172         }
8173     } while (i--);
8174 }
8175
8176 STATIC bool
8177 S_to_byte_substr(pTHX_ regexp *prog)
8178 {
8179     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8180      * on the converted value; returns FALSE if can't be converted. */
8181
8182     int i = 1;
8183
8184     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8185
8186     do {
8187         if (prog->substrs->data[i].utf8_substr
8188             && !prog->substrs->data[i].substr) {
8189             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8190             if (! sv_utf8_downgrade(sv, TRUE)) {
8191                 return FALSE;
8192             }
8193             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8194                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8195                     /* Trim the trailing \n that fbm_compile added last
8196                         time.  */
8197                     SvCUR_set(sv, SvCUR(sv) - 1);
8198                     fbm_compile(sv, FBMcf_TAIL);
8199                 } else
8200                     fbm_compile(sv, 0);
8201             }
8202             prog->substrs->data[i].substr = sv;
8203             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8204                 prog->check_substr = sv;
8205         }
8206     } while (i--);
8207
8208     return TRUE;
8209 }
8210
8211 /*
8212  * Local variables:
8213  * c-indentation-style: bsd
8214  * c-basic-offset: 4
8215  * indent-tabs-mode: nil
8216  * End:
8217  *
8218  * ex: set ts=8 sts=4 sw=4 et:
8219  */