regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 static const char b_utf8_locale_required[] =
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
  88                                                 "  Assuming a UTF-8 locale";
  89
  90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND                       \
  91     STMT_START {                                                            \
  92         if (! IN_UTF8_CTYPE_LOCALE) {                                       \
  93           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
  94                                                 b_utf8_locale_required);    \
  95         }                                                                   \
  96     } STMT_END
  97
  98 static const char sets_utf8_locale_required[] =
  99       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
 100
 101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n)                     \
 102     STMT_START {                                                            \
 103         if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) {  \
 104           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
 105                                              sets_utf8_locale_required);    \
 106         }                                                                   \
 107     } STMT_END
 108
 109 #ifdef DEBUGGING
 110 /* At least one required character in the target string is expressible only in
 111  * UTF-8. */
 112 static const char non_utf8_target_but_utf8_required[]
 113                 = "Can't match, because target string needs to be in UTF-8\n";
 114 #endif
 115
 116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 117     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 118     goto target;                                                         \
 119 } STMT_END
 120
 121 #ifndef STATIC
 122 #define STATIC  static
 123 #endif
 124
 125 /*
 126  * Forwards.
 127  */
 128
 129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 130
 131 #define HOPc(pos,off) \
 132         (char *)(reginfo->is_utf8_target \
 133             ? reghop3((U8*)pos, off, \
 134                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 135             : (U8*)(pos + off))
 136
 137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 138 #define HOPBACK3(pos, off, lim) \
 139         (reginfo->is_utf8_target                          \
 140             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 141             : (pos - off >= lim)                                 \
 142                 ? (U8*)pos - off                                 \
 143                 : NULL)
 144
 145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 146
 147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 149
 150 /* lim must be +ve. Returns NULL on overshoot */
 151 #define HOPMAYBE3(pos,off,lim) \
 152         (reginfo->is_utf8_target                        \
 153             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 154             : ((U8*)pos + off <= lim)                   \
 155                 ? (U8*)pos + off                        \
 156                 : NULL)
 157
 158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 159  * off must be >=0; args should be vars rather than expressions */
 160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 161     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 162     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 164
 165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 166     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 167     : (U8*)(pos + off))
 168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 169
 170 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 172
 173 /* for use after a quantifier and before an EXACT-like node -- japhy */
 174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 175  *
 176  * NOTE that *nothing* that affects backtracking should be in here, specifically
 177  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 178  * node that is in between two EXACT like nodes when ascertaining what the required
 179  * "follow" character is. This should probably be moved to regex compile time
 180  * although it may be done at run time beause of the REF possibility - more
 181  * investigation required. -- demerphq
 182 */
 183 #define JUMPABLE(rn) (                                                             \
 184     OP(rn) == OPEN ||                                                              \
 185     (OP(rn) == CLOSE &&                                                            \
 186      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 187     OP(rn) == EVAL ||                                                              \
 188     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 189     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 190     OP(rn) == KEEPS ||                                                             \
 191     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 192 )
 193 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 194
 195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 196
 197 /*
 198   Search for mandatory following text node; for lookahead, the text must
 199   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 200 */
 201 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 202     while (JUMPABLE(rn)) { \
 203         const OPCODE type = OP(rn); \
 204         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 205             rn = NEXTOPER(NEXTOPER(rn)); \
 206         else if (type == PLUS) \
 207             rn = NEXTOPER(rn); \
 208         else if (type == IFMATCH) \
 209             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 210         else rn += NEXT_OFF(rn); \
 211     } \
 212 } STMT_END
 213
 214 #define SLAB_FIRST(s) (&(s)->states[0])
 215 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 216
 217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 219 static regmatch_state * S_push_slab(pTHX);
 220
 221 #define REGCP_PAREN_ELEMS 3
 222 #define REGCP_OTHER_ELEMS 3
 223 #define REGCP_FRAME_ELEMS 1
 224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 225  * are needed for the regexp context stack bookkeeping. */
 226
 227 STATIC CHECKPOINT
 228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 229 {
 230     const int retval = PL_savestack_ix;
 231     const int paren_elems_to_push =
 232                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 233     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 234     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 235     I32 p;
 236     DECLARE_AND_GET_RE_DEBUG_FLAGS;
 237
 238     PERL_ARGS_ASSERT_REGCPPUSH;
 239
 240     if (paren_elems_to_push < 0)
 241         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 242                    (int)paren_elems_to_push, (int)maxopenparen,
 243                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 244
 245     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 246         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 247                    " out of range (%lu-%ld)",
 248                    total_elems,
 249                    (unsigned long)maxopenparen,
 250                    (long)parenfloor);
 251
 252     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 253
 254     DEBUG_BUFFERS_r(
 255         if ((int)maxopenparen > (int)parenfloor)
 256             Perl_re_exec_indentf( aTHX_
 257                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 258                 depth,
 259                 PTR2UV(rex),
 260                 PTR2UV(rex->offs)
 261             );
 262     );
 263     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 265         SSPUSHIV(rex->offs[p].end);
 266         SSPUSHIV(rex->offs[p].start);
 267         SSPUSHINT(rex->offs[p].start_tmp);
 268         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 269             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 270             depth,
 271             (UV)p,
 272             (IV)rex->offs[p].start,
 273             (IV)rex->offs[p].start_tmp,
 274             (IV)rex->offs[p].end
 275         ));
 276     }
 277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 278     SSPUSHINT(maxopenparen);
 279     SSPUSHINT(rex->lastparen);
 280     SSPUSHINT(rex->lastcloseparen);
 281     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 282
 283     return retval;
 284 }
 285
 286 /* These are needed since we do not localize EVAL nodes: */
 287 #define REGCP_SET(cp)                                           \
 288     DEBUG_STATE_r(                                              \
 289         Perl_re_exec_indentf( aTHX_                             \
 290             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 291             depth, (IV)PL_savestack_ix                          \
 292         )                                                       \
 293     );                                                          \
 294     cp = PL_savestack_ix
 295
 296 #define REGCP_UNWIND(cp)                                        \
 297     DEBUG_STATE_r(                                              \
 298         if (cp != PL_savestack_ix)                              \
 299             Perl_re_exec_indentf( aTHX_                         \
 300                 "Clearing an EVAL scope, savestack=%"           \
 301                 IVdf "..%" IVdf "\n",                           \
 302                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 303             )                                                   \
 304     );                                                          \
 305     regcpblow(cp)
 306
 307 /* set the start and end positions of capture ix */
 308 #define CLOSE_CAPTURE(ix, s, e)                                            \
 309     rex->offs[ix].start = s;                                               \
 310     rex->offs[ix].end = e;                                                 \
 311     if (ix > rex->lastparen)                                               \
 312         rex->lastparen = ix;                                               \
 313     rex->lastcloseparen = ix;                                              \
 314     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 315         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 316         depth,                                                             \
 317         PTR2UV(rex),                                                       \
 318         PTR2UV(rex->offs),                                                 \
 319         (UV)ix,                                                            \
 320         (IV)rex->offs[ix].start,                                           \
 321         (IV)rex->offs[ix].end,                                             \
 322         (UV)rex->lastparen                                                 \
 323     ))
 324
 325 #define UNWIND_PAREN(lp, lcp)               \
 326     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 327         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 328         depth,                              \
 329         PTR2UV(rex),                        \
 330         PTR2UV(rex->offs),                  \
 331         (UV)(lp),                           \
 332         (UV)(rex->lastparen),               \
 333         (UV)(lcp)                           \
 334     ));                                     \
 335     for (n = rex->lastparen; n > lp; n--)   \
 336         rex->offs[n].end = -1;              \
 337     rex->lastparen = n;                     \
 338     rex->lastcloseparen = lcp;
 339
 340
 341 STATIC void
 342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 343 {
 344     UV i;
 345     U32 paren;
 346     DECLARE_AND_GET_RE_DEBUG_FLAGS;
 347
 348     PERL_ARGS_ASSERT_REGCPPOP;
 349
 350     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 351     i = SSPOPUV;
 352     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 353     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 354     rex->lastcloseparen = SSPOPINT;
 355     rex->lastparen = SSPOPINT;
 356     *maxopenparen_p = SSPOPINT;
 357
 358     i -= REGCP_OTHER_ELEMS;
 359     /* Now restore the parentheses context. */
 360     DEBUG_BUFFERS_r(
 361         if (i || rex->lastparen + 1 <= rex->nparens)
 362             Perl_re_exec_indentf( aTHX_
 363                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 364                 depth,
 365                 PTR2UV(rex),
 366                 PTR2UV(rex->offs)
 367             );
 368     );
 369     paren = *maxopenparen_p;
 370     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 371         SSize_t tmps;
 372         rex->offs[paren].start_tmp = SSPOPINT;
 373         rex->offs[paren].start = SSPOPIV;
 374         tmps = SSPOPIV;
 375         if (paren <= rex->lastparen)
 376             rex->offs[paren].end = tmps;
 377         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 378             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 379             depth,
 380             (UV)paren,
 381             (IV)rex->offs[paren].start,
 382             (IV)rex->offs[paren].start_tmp,
 383             (IV)rex->offs[paren].end,
 384             (paren > rex->lastparen ? "(skipped)" : ""));
 385         );
 386         paren--;
 387     }
 388 #if 1
 389     /* It would seem that the similar code in regtry()
 390      * already takes care of this, and in fact it is in
 391      * a better location to since this code can #if 0-ed out
 392      * but the code in regtry() is needed or otherwise tests
 393      * requiring null fields (pat.t#187 and split.t#{13,14}
 394      * (as of patchlevel 7877)  will fail.  Then again,
 395      * this code seems to be necessary or otherwise
 396      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 397      * --jhi updated by dapm */
 398     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 399         if (i > *maxopenparen_p)
 400             rex->offs[i].start = -1;
 401         rex->offs[i].end = -1;
 402         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 403             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 404             depth,
 405             (UV)i,
 406             (i > *maxopenparen_p) ? "-1" : "  "
 407         ));
 408     }
 409 #endif
 410 }
 411
 412 /* restore the parens and associated vars at savestack position ix,
 413  * but without popping the stack */
 414
 415 STATIC void
 416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 417 {
 418     I32 tmpix = PL_savestack_ix;
 419     PERL_ARGS_ASSERT_REGCP_RESTORE;
 420
 421     PL_savestack_ix = ix;
 422     regcppop(rex, maxopenparen_p);
 423     PL_savestack_ix = tmpix;
 424 }
 425
 426 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 427
 428 STATIC bool
 429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 430 {
 431     /* Returns a boolean as to whether or not 'character' is a member of the
 432      * Posix character class given by 'classnum' that should be equivalent to a
 433      * value in the typedef '_char_class_number'.
 434      *
 435      * Ideally this could be replaced by a just an array of function pointers
 436      * to the C library functions that implement the macros this calls.
 437      * However, to compile, the precise function signatures are required, and
 438      * these may vary from platform to platform.  To avoid having to figure
 439      * out what those all are on each platform, I (khw) am using this method,
 440      * which adds an extra layer of function call overhead (unless the C
 441      * optimizer strips it away).  But we don't particularly care about
 442      * performance with locales anyway. */
 443
 444     switch ((_char_class_number) classnum) {
 445         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 446         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 447         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 448         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 449         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 450                                         || isUPPER_LC(character);
 451         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 452         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 453         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 454         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 455         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 456         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 457         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 458         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 459         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 460         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 461         default:    /* VERTSPACE should never occur in locales */
 462             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 463     }
 464
 465     NOT_REACHED; /* NOTREACHED */
 466     return FALSE;
 467 }
 468
 469 PERL_STATIC_INLINE I32
 470 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
 471 {
 472     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
 473      * folded.  Works on all folds representable without UTF-8, except for
 474      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
 475      * check that the strings each have at least 'len' characters.
 476      *
 477      * There is almost an identical API function where s2 need not be folded:
 478      * Perl_foldEQ_latin1() */
 479
 480     const U8 *a = (const U8 *)s1;
 481     const U8 *b = (const U8 *)s2;
 482
 483     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
 484
 485     assert(len >= 0);
 486
 487     while (len--) {
 488         assert(! isUPPER_L1(*b));
 489         if (toLOWER_L1(*a) != *b) {
 490             return 0;
 491         }
 492         a++, b++;
 493     }
 494     return 1;
 495 }
 496
 497 STATIC bool
 498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 499 {
 500     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 501      * 'character' is a member of the Posix character class given by 'classnum'
 502      * that should be equivalent to a value in the typedef
 503      * '_char_class_number'.
 504      *
 505      * This just calls isFOO_lc on the code point for the character if it is in
 506      * the range 0-255.  Outside that range, all characters use Unicode
 507      * rules, ignoring any locale.  So use the Unicode function if this class
 508      * requires an inversion list, and use the Unicode macro otherwise. */
 509
 510
 511     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 512
 513     if (UTF8_IS_INVARIANT(*character)) {
 514         return isFOO_lc(classnum, *character);
 515     }
 516     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 517         return isFOO_lc(classnum,
 518                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 519     }
 520
 521     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 522
 523     switch ((_char_class_number) classnum) {
 524         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 525         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 526         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 527         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 528         default:
 529             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 530                                         utf8_to_uvchr_buf(character, e, NULL));
 531     }
 532
 533     return FALSE; /* Things like CNTRL are always below 256 */
 534 }
 535
 536 STATIC U8 *
 537 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 538 {
 539     /* Returns the position of the first byte in the sequence between 's' and
 540      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 541      * */
 542
 543     PERL_ARGS_ASSERT_FIND_SPAN_END;
 544
 545     assert(send >= s);
 546
 547     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 548                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 549                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 550     {
 551         PERL_UINTMAX_T span_word;
 552
 553         /* Process per-byte until reach word boundary.  XXX This loop could be
 554          * eliminated if we knew that this platform had fast unaligned reads */
 555         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 556             if (*s != span_byte) {
 557                 return s;
 558             }
 559             s++;
 560         }
 561
 562         /* Create a word filled with the bytes we are spanning */
 563         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 564
 565         /* Process per-word as long as we have at least a full word left */
 566         do {
 567
 568             /* Keep going if the whole word is composed of 'span_byte's */
 569             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 570                 s += PERL_WORDSIZE;
 571                 continue;
 572             }
 573
 574             /* Here, at least one byte in the word isn't 'span_byte'. */
 575
 576 #ifdef EBCDIC
 577
 578             break;
 579
 580 #else
 581
 582             /* This xor leaves 1 bits only in those non-matching bytes */
 583             span_word ^= * (PERL_UINTMAX_T *) s;
 584
 585             /* Make sure the upper bit of each non-matching byte is set.  This
 586              * makes each such byte look like an ASCII platform variant byte */
 587             span_word |= span_word << 1;
 588             span_word |= span_word << 2;
 589             span_word |= span_word << 4;
 590
 591             /* That reduces the problem to what this function solves */
 592             return s + variant_byte_number(span_word);
 593
 594 #endif
 595
 596         } while (s + PERL_WORDSIZE <= send);
 597     }
 598
 599     /* Process the straggler bytes beyond the final word boundary */
 600     while (s < send) {
 601         if (*s != span_byte) {
 602             return s;
 603         }
 604         s++;
 605     }
 606
 607     return s;
 608 }
 609
 610 STATIC U8 *
 611 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 612 {
 613     /* Returns the position of the first byte in the sequence between 's'
 614      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 615      * returns 'send' if none found.  It uses word-level operations instead of
 616      * byte to speed up the process */
 617
 618     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 619
 620     assert(send >= s);
 621     assert((byte & mask) == byte);
 622
 623 #ifndef EBCDIC
 624
 625     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 626                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 627                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 628     {
 629         PERL_UINTMAX_T word, mask_word;
 630
 631         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 632             if (((*s) & mask) == byte) {
 633                 return s;
 634             }
 635             s++;
 636         }
 637
 638         word      = PERL_COUNT_MULTIPLIER * byte;
 639         mask_word = PERL_COUNT_MULTIPLIER * mask;
 640
 641         do {
 642             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 643
 644             /* If 'masked' contains bytes with the bit pattern of 'byte' within
 645              * it, xoring with 'word' will leave each of the 8 bits in such
 646              * bytes be 0, and no byte containing any other bit pattern will be
 647              * 0. */
 648             masked ^= word;
 649
 650             /* This causes the most significant bit to be set to 1 for any
 651              * bytes in the word that aren't completely 0 */
 652             masked |= masked << 1;
 653             masked |= masked << 2;
 654             masked |= masked << 4;
 655
 656             /* The msbits are the same as what marks a byte as variant, so we
 657              * can use this mask.  If all msbits are 1, the word doesn't
 658              * contain 'byte' */
 659             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 660                 s += PERL_WORDSIZE;
 661                 continue;
 662             }
 663
 664             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 665              * and any that are, are 0.  Complement and re-AND to swap that */
 666             masked = ~ masked;
 667             masked &= PERL_VARIANTS_WORD_MASK;
 668
 669             /* This reduces the problem to that solved by this function */
 670             s += variant_byte_number(masked);
 671             return s;
 672
 673         } while (s + PERL_WORDSIZE <= send);
 674     }
 675
 676 #endif
 677
 678     while (s < send) {
 679         if (((*s) & mask) == byte) {
 680             return s;
 681         }
 682         s++;
 683     }
 684
 685     return s;
 686 }
 687
 688 STATIC U8 *
 689 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 690 {
 691     /* Returns the position of the first byte in the sequence between 's' and
 692      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 693      * 'span_byte' should have been ANDed with 'mask' in the call of this
 694      * function.  Returns 'send' if none found.  Works like find_span_end(),
 695      * except for the AND */
 696
 697     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 698
 699     assert(send >= s);
 700     assert((span_byte & mask) == span_byte);
 701
 702     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 703                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 704                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 705     {
 706         PERL_UINTMAX_T span_word, mask_word;
 707
 708         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 709             if (((*s) & mask) != span_byte) {
 710                 return s;
 711             }
 712             s++;
 713         }
 714
 715         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 716         mask_word = PERL_COUNT_MULTIPLIER * mask;
 717
 718         do {
 719             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 720
 721             if (masked == span_word) {
 722                 s += PERL_WORDSIZE;
 723                 continue;
 724             }
 725
 726 #ifdef EBCDIC
 727
 728             break;
 729
 730 #else
 731
 732             masked ^= span_word;
 733             masked |= masked << 1;
 734             masked |= masked << 2;
 735             masked |= masked << 4;
 736             return s + variant_byte_number(masked);
 737
 738 #endif
 739
 740         } while (s + PERL_WORDSIZE <= send);
 741     }
 742
 743     while (s < send) {
 744         if (((*s) & mask) != span_byte) {
 745             return s;
 746         }
 747         s++;
 748     }
 749
 750     return s;
 751 }
 752
 753 /*
 754  * pregexec and friends
 755  */
 756
 757 #ifndef PERL_IN_XSUB_RE
 758 /*
 759  - pregexec - match a regexp against a string
 760  */
 761 I32
 762 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 763          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 764 /* stringarg: the point in the string at which to begin matching */
 765 /* strend:    pointer to null at end of string */
 766 /* strbeg:    real beginning of string */
 767 /* minend:    end of match must be >= minend bytes after stringarg. */
 768 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 769  *            itself is accessed via the pointers above */
 770 /* nosave:    For optimizations. */
 771 {
 772     PERL_ARGS_ASSERT_PREGEXEC;
 773
 774     return
 775         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 776                       nosave ? 0 : REXEC_COPY_STR);
 777 }
 778 #endif
 779
 780
 781
 782 /* re_intuit_start():
 783  *
 784  * Based on some optimiser hints, try to find the earliest position in the
 785  * string where the regex could match.
 786  *
 787  *   rx:     the regex to match against
 788  *   sv:     the SV being matched: only used for utf8 flag; the string
 789  *           itself is accessed via the pointers below. Note that on
 790  *           something like an overloaded SV, SvPOK(sv) may be false
 791  *           and the string pointers may point to something unrelated to
 792  *           the SV itself.
 793  *   strbeg: real beginning of string
 794  *   strpos: the point in the string at which to begin matching
 795  *   strend: pointer to the byte following the last char of the string
 796  *   flags   currently unused; set to 0
 797  *   data:   currently unused; set to NULL
 798  *
 799  * The basic idea of re_intuit_start() is to use some known information
 800  * about the pattern, namely:
 801  *
 802  *   a) the longest known anchored substring (i.e. one that's at a
 803  *      constant offset from the beginning of the pattern; but not
 804  *      necessarily at a fixed offset from the beginning of the
 805  *      string);
 806  *   b) the longest floating substring (i.e. one that's not at a constant
 807  *      offset from the beginning of the pattern);
 808  *   c) Whether the pattern is anchored to the string; either
 809  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 810  *      or anchored to pos(): /\G/;
 811  *   d) A start class: a real or synthetic character class which
 812  *      represents which characters are legal at the start of the pattern;
 813  *
 814  * to either quickly reject the match, or to find the earliest position
 815  * within the string at which the pattern might match, thus avoiding
 816  * running the full NFA engine at those earlier locations, only to
 817  * eventually fail and retry further along.
 818  *
 819  * Returns NULL if the pattern can't match, or returns the address within
 820  * the string which is the earliest place the match could occur.
 821  *
 822  * The longest of the anchored and floating substrings is called 'check'
 823  * and is checked first. The other is called 'other' and is checked
 824  * second. The 'other' substring may not be present.  For example,
 825  *
 826  *    /(abc|xyz)ABC\d{0,3}DEFG/
 827  *
 828  * will have
 829  *
 830  *   check substr (float)    = "DEFG", offset 6..9 chars
 831  *   other substr (anchored) = "ABC",  offset 3..3 chars
 832  *   stclass = [ax]
 833  *
 834  * Be aware that during the course of this function, sometimes 'anchored'
 835  * refers to a substring being anchored relative to the start of the
 836  * pattern, and sometimes to the pattern itself being anchored relative to
 837  * the string. For example:
 838  *
 839  *   /\dabc/:   "abc" is anchored to the pattern;
 840  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 841  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 842  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 843  *                    but the pattern is anchored to the string.
 844  */
 845
 846 char *
 847 Perl_re_intuit_start(pTHX_
 848                     REGEXP * const rx,
 849                     SV *sv,
 850                     const char * const strbeg,
 851                     char *strpos,
 852                     char *strend,
 853                     const U32 flags,
 854                     re_scream_pos_data *data)
 855 {
 856     struct regexp *const prog = ReANY(rx);
 857     SSize_t start_shift = prog->check_offset_min;
 858     /* Should be nonnegative! */
 859     SSize_t end_shift   = 0;
 860     /* current lowest pos in string where the regex can start matching */
 861     char *rx_origin = strpos;
 862     SV *check;
 863     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 864     U8   other_ix = 1 - prog->substrs->check_ix;
 865     bool ml_anch = 0;
 866     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 867     char *check_at = NULL;              /* check substr found at this pos */
 868     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 869     RXi_GET_DECL(prog,progi);
 870     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 871     regmatch_info *const reginfo = &reginfo_buf;
 872     DECLARE_AND_GET_RE_DEBUG_FLAGS;
 873
 874     PERL_ARGS_ASSERT_RE_INTUIT_START;
 875     PERL_UNUSED_ARG(flags);
 876     PERL_UNUSED_ARG(data);
 877
 878     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 879                 "Intuit: trying to determine minimum start position...\n"));
 880
 881     /* for now, assume that all substr offsets are positive. If at some point
 882      * in the future someone wants to do clever things with lookbehind and
 883      * -ve offsets, they'll need to fix up any code in this function
 884      * which uses these offsets. See the thread beginning
 885      * <20140113145929.GF27210@iabyn.com>
 886      */
 887     assert(prog->substrs->data[0].min_offset >= 0);
 888     assert(prog->substrs->data[0].max_offset >= 0);
 889     assert(prog->substrs->data[1].min_offset >= 0);
 890     assert(prog->substrs->data[1].max_offset >= 0);
 891     assert(prog->substrs->data[2].min_offset >= 0);
 892     assert(prog->substrs->data[2].max_offset >= 0);
 893
 894     /* for now, assume that if both present, that the floating substring
 895      * doesn't start before the anchored substring.
 896      * If you break this assumption (e.g. doing better optimisations
 897      * with lookahead/behind), then you'll need to audit the code in this
 898      * function carefully first
 899      */
 900     assert(
 901             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 902               && (prog->float_utf8    || prog->float_substr))
 903            || (prog->float_min_offset >= prog->anchored_offset));
 904
 905     /* byte rather than char calculation for efficiency. It fails
 906      * to quickly reject some cases that can't match, but will reject
 907      * them later after doing full char arithmetic */
 908     if (prog->minlen > strend - strpos) {
 909         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 910                               "  String too short...\n"));
 911         goto fail;
 912     }
 913
 914     RXp_MATCH_UTF8_set(prog, utf8_target);
 915     reginfo->is_utf8_target = cBOOL(utf8_target);
 916     reginfo->info_aux = NULL;
 917     reginfo->strbeg = strbeg;
 918     reginfo->strend = strend;
 919     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 920     reginfo->intuit = 1;
 921     /* not actually used within intuit, but zero for safety anyway */
 922     reginfo->poscache_maxiter = 0;
 923
 924     if (utf8_target) {
 925         if ((!prog->anchored_utf8 && prog->anchored_substr)
 926                 || (!prog->float_utf8 && prog->float_substr))
 927             to_utf8_substr(prog);
 928         check = prog->check_utf8;
 929     } else {
 930         if (!prog->check_substr && prog->check_utf8) {
 931             if (! to_byte_substr(prog)) {
 932                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 933             }
 934         }
 935         check = prog->check_substr;
 936     }
 937
 938     /* dump the various substring data */
 939     DEBUG_OPTIMISE_MORE_r({
 940         int i;
 941         for (i=0; i<=2; i++) {
 942             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 943                                   : prog->substrs->data[i].substr);
 944             if (!sv)
 945                 continue;
 946
 947             Perl_re_printf( aTHX_
 948                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 949                 " useful=%" IVdf " utf8=%d [%s]\n",
 950                 i,
 951                 (IV)prog->substrs->data[i].min_offset,
 952                 (IV)prog->substrs->data[i].max_offset,
 953                 (IV)prog->substrs->data[i].end_shift,
 954                 BmUSEFUL(sv),
 955                 utf8_target ? 1 : 0,
 956                 SvPEEK(sv));
 957         }
 958     });
 959
 960     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 961
 962         /* ml_anch: check after \n?
 963          *
 964          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 965          * with /.*.../, these flags will have been added by the
 966          * compiler:
 967          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 968          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 969          */
 970         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 971                    && !(prog->intflags & PREGf_IMPLICIT);
 972
 973         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 974             /* we are only allowed to match at BOS or \G */
 975
 976             /* trivially reject if there's a BOS anchor and we're not at BOS.
 977              *
 978              * Note that we don't try to do a similar quick reject for
 979              * \G, since generally the caller will have calculated strpos
 980              * based on pos() and gofs, so the string is already correctly
 981              * anchored by definition; and handling the exceptions would
 982              * be too fiddly (e.g. REXEC_IGNOREPOS).
 983              */
 984             if (   strpos != strbeg
 985                 && (prog->intflags & PREGf_ANCH_SBOL))
 986             {
 987                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 988                                 "  Not at start...\n"));
 989                 goto fail;
 990             }
 991
 992             /* in the presence of an anchor, the anchored (relative to the
 993              * start of the regex) substr must also be anchored relative
 994              * to strpos. So quickly reject if substr isn't found there.
 995              * This works for \G too, because the caller will already have
 996              * subtracted gofs from pos, and gofs is the offset from the
 997              * \G to the start of the regex. For example, in /.abc\Gdef/,
 998              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 999              * caller will have set strpos=pos()-4; we look for the substr
1000              * at position pos()-4+1, which lines up with the "a" */
1001
1002             if (prog->check_offset_min == prog->check_offset_max) {
1003                 /* Substring at constant offset from beg-of-str... */
1004                 SSize_t slen = SvCUR(check);
1005                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1006
1007                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1008                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1009                     (IV)prog->check_offset_min));
1010
1011                 if (SvTAIL(check)) {
1012                     /* In this case, the regex is anchored at the end too.
1013                      * Unless it's a multiline match, the lengths must match
1014                      * exactly, give or take a \n.  NB: slen >= 1 since
1015                      * the last char of check is \n */
1016                     if (!multiline
1017                         && (   strend - s > slen
1018                             || strend - s < slen - 1
1019                             || (strend - s == slen && strend[-1] != '\n')))
1020                     {
1021                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1022                                             "  String too long...\n"));
1023                         goto fail_finish;
1024                     }
1025                     /* Now should match s[0..slen-2] */
1026                     slen--;
1027                 }
1028                 if (slen && (strend - s < slen
1029                     || *SvPVX_const(check) != *s
1030                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1031                 {
1032                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033                                     "  String not equal...\n"));
1034                     goto fail_finish;
1035                 }
1036
1037                 check_at = s;
1038                 goto success_at_start;
1039             }
1040         }
1041     }
1042
1043     end_shift = prog->check_end_shift;
1044
1045 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1046     if (end_shift < 0)
1047         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1048                    (IV)end_shift, RX_PRECOMP(rx));
1049 #endif
1050
1051   restart:
1052
1053     /* This is the (re)entry point of the main loop in this function.
1054      * The goal of this loop is to:
1055      * 1) find the "check" substring in the region rx_origin..strend
1056      *    (adjusted by start_shift / end_shift). If not found, reject
1057      *    immediately.
1058      * 2) If it exists, look for the "other" substr too if defined; for
1059      *    example, if the check substr maps to the anchored substr, then
1060      *    check the floating substr, and vice-versa. If not found, go
1061      *    back to (1) with rx_origin suitably incremented.
1062      * 3) If we find an rx_origin position that doesn't contradict
1063      *    either of the substrings, then check the possible additional
1064      *    constraints on rx_origin of /^.../m or a known start class.
1065      *    If these fail, then depending on which constraints fail, jump
1066      *    back to here, or to various other re-entry points further along
1067      *    that skip some of the first steps.
1068      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1069      *    substring. If the start position was determined to be at the
1070      *    beginning of the string  - so, not rejected, but not optimised,
1071      *    since we have to run regmatch from position 0 - decrement the
1072      *    BmUSEFUL() count. Otherwise increment it.
1073      */
1074
1075
1076     /* first, look for the 'check' substring */
1077
1078     {
1079         U8* start_point;
1080         U8* end_point;
1081
1082         DEBUG_OPTIMISE_MORE_r({
1083             Perl_re_printf( aTHX_
1084                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1085                 " Start shift: %" IVdf " End shift %" IVdf
1086                 " Real end Shift: %" IVdf "\n",
1087                 (IV)(rx_origin - strbeg),
1088                 (IV)prog->check_offset_min,
1089                 (IV)start_shift,
1090                 (IV)end_shift,
1091                 (IV)prog->check_end_shift);
1092         });
1093
1094         end_point = HOPBACK3(strend, end_shift, rx_origin);
1095         if (!end_point)
1096             goto fail_finish;
1097         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1098         if (!start_point)
1099             goto fail_finish;
1100
1101
1102         /* If the regex is absolutely anchored to either the start of the
1103          * string (SBOL) or to pos() (ANCH_GPOS), then
1104          * check_offset_max represents an upper bound on the string where
1105          * the substr could start. For the ANCH_GPOS case, we assume that
1106          * the caller of intuit will have already set strpos to
1107          * pos()-gofs, so in this case strpos + offset_max will still be
1108          * an upper bound on the substr.
1109          */
1110         if (!ml_anch
1111             && prog->intflags & PREGf_ANCH
1112             && prog->check_offset_max != SSize_t_MAX)
1113         {
1114             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1115             const char * const anchor =
1116                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1117             SSize_t targ_len = (char*)end_point - anchor;
1118
1119             if (check_len > targ_len) {
1120                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121                               "Target string too short to match required substring...\n"));
1122                 goto fail_finish;
1123             }
1124
1125             /* do a bytes rather than chars comparison. It's conservative;
1126              * so it skips doing the HOP if the result can't possibly end
1127              * up earlier than the old value of end_point.
1128              */
1129             assert(anchor + check_len <= (char *)end_point);
1130             if (prog->check_offset_max + check_len < targ_len) {
1131                 end_point = HOP3lim((U8*)anchor,
1132                                 prog->check_offset_max,
1133                                 end_point - check_len
1134                             )
1135                             + check_len;
1136                 if (end_point < start_point)
1137                     goto fail_finish;
1138             }
1139         }
1140
1141         check_at = fbm_instr( start_point, end_point,
1142                       check, multiline ? FBMrf_MULTILINE : 0);
1143
1144         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1145             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1146             (IV)((char*)start_point - strbeg),
1147             (IV)((char*)end_point   - strbeg),
1148             (IV)(check_at ? check_at - strbeg : -1)
1149         ));
1150
1151         /* Update the count-of-usability, remove useless subpatterns,
1152             unshift s.  */
1153
1154         DEBUG_EXECUTE_r({
1155             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1156                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1157             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1158                               (check_at ? "Found" : "Did not find"),
1159                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1160                     ? "anchored" : "floating"),
1161                 quoted,
1162                 RE_SV_TAIL(check),
1163                 (check_at ? " at offset " : "...\n") );
1164         });
1165
1166         if (!check_at)
1167             goto fail_finish;
1168         /* set rx_origin to the minimum position where the regex could start
1169          * matching, given the constraint of the just-matched check substring.
1170          * But don't set it lower than previously.
1171          */
1172
1173         if (check_at - rx_origin > prog->check_offset_max)
1174             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1175         /* Finish the diagnostic message */
1176         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1177             "%ld (rx_origin now %" IVdf ")...\n",
1178             (long)(check_at - strbeg),
1179             (IV)(rx_origin - strbeg)
1180         ));
1181     }
1182
1183
1184     /* now look for the 'other' substring if defined */
1185
1186     if (prog->substrs->data[other_ix].utf8_substr
1187         || prog->substrs->data[other_ix].substr)
1188     {
1189         /* Take into account the "other" substring. */
1190         char *last, *last1;
1191         char *s;
1192         SV* must;
1193         struct reg_substr_datum *other;
1194
1195       do_other_substr:
1196         other = &prog->substrs->data[other_ix];
1197         if (!utf8_target && !other->substr) {
1198             if (!to_byte_substr(prog)) {
1199                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1200             }
1201         }
1202
1203         /* if "other" is anchored:
1204          * we've previously found a floating substr starting at check_at.
1205          * This means that the regex origin must lie somewhere
1206          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1207          * and max:                 HOP3(check_at, -check_offset_min)
1208          * (except that min will be >= strpos)
1209          * So the fixed  substr must lie somewhere between
1210          *  HOP3(min, anchored_offset)
1211          *  HOP3(max, anchored_offset) + SvCUR(substr)
1212          */
1213
1214         /* if "other" is floating
1215          * Calculate last1, the absolute latest point where the
1216          * floating substr could start in the string, ignoring any
1217          * constraints from the earlier fixed match. It is calculated
1218          * as follows:
1219          *
1220          * strend - prog->minlen (in chars) is the absolute latest
1221          * position within the string where the origin of the regex
1222          * could appear. The latest start point for the floating
1223          * substr is float_min_offset(*) on from the start of the
1224          * regex.  last1 simply combines thee two offsets.
1225          *
1226          * (*) You might think the latest start point should be
1227          * float_max_offset from the regex origin, and technically
1228          * you'd be correct. However, consider
1229          *    /a\d{2,4}bcd\w/
1230          * Here, float min, max are 3,5 and minlen is 7.
1231          * This can match either
1232          *    /a\d\dbcd\w/
1233          *    /a\d\d\dbcd\w/
1234          *    /a\d\d\d\dbcd\w/
1235          * In the first case, the regex matches minlen chars; in the
1236          * second, minlen+1, in the third, minlen+2.
1237          * In the first case, the floating offset is 3 (which equals
1238          * float_min), in the second, 4, and in the third, 5 (which
1239          * equals float_max). In all cases, the floating string bcd
1240          * can never start more than 4 chars from the end of the
1241          * string, which equals minlen - float_min. As the substring
1242          * starts to match more than float_min from the start of the
1243          * regex, it makes the regex match more than minlen chars,
1244          * and the two cancel each other out. So we can always use
1245          * float_min - minlen, rather than float_max - minlen for the
1246          * latest position in the string.
1247          *
1248          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1249          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1250          */
1251
1252         assert(prog->minlen >= other->min_offset);
1253         last1 = HOP3c(strend,
1254                         other->min_offset - prog->minlen, strbeg);
1255
1256         if (other_ix) {/* i.e. if (other-is-float) */
1257             /* last is the latest point where the floating substr could
1258              * start, *given* any constraints from the earlier fixed
1259              * match. This constraint is that the floating string starts
1260              * <= float_max_offset chars from the regex origin (rx_origin).
1261              * If this value is less than last1, use it instead.
1262              */
1263             assert(rx_origin <= last1);
1264             last =
1265                 /* this condition handles the offset==infinity case, and
1266                  * is a short-cut otherwise. Although it's comparing a
1267                  * byte offset to a char length, it does so in a safe way,
1268                  * since 1 char always occupies 1 or more bytes,
1269                  * so if a string range is  (last1 - rx_origin) bytes,
1270                  * it will be less than or equal to  (last1 - rx_origin)
1271                  * chars; meaning it errs towards doing the accurate HOP3
1272                  * rather than just using last1 as a short-cut */
1273                 (last1 - rx_origin) < other->max_offset
1274                     ? last1
1275                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1276         }
1277         else {
1278             assert(strpos + start_shift <= check_at);
1279             last = HOP4c(check_at, other->min_offset - start_shift,
1280                         strbeg, strend);
1281         }
1282
1283         s = HOP3c(rx_origin, other->min_offset, strend);
1284         if (s < other_last)     /* These positions already checked */
1285             s = other_last;
1286
1287         must = utf8_target ? other->utf8_substr : other->substr;
1288         assert(SvPOK(must));
1289         {
1290             char *from = s;
1291             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1292
1293             if (to > strend)
1294                 to = strend;
1295             if (from > to) {
1296                 s = NULL;
1297                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1299                     (IV)(from - strbeg),
1300                     (IV)(to   - strbeg)
1301                 ));
1302             }
1303             else {
1304                 s = fbm_instr(
1305                     (unsigned char*)from,
1306                     (unsigned char*)to,
1307                     must,
1308                     multiline ? FBMrf_MULTILINE : 0
1309                 );
1310                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1311                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1312                     (IV)(from - strbeg),
1313                     (IV)(to   - strbeg),
1314                     (IV)(s ? s - strbeg : -1)
1315                 ));
1316             }
1317         }
1318
1319         DEBUG_EXECUTE_r({
1320             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1321                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1322             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1323                 s ? "Found" : "Contradicts",
1324                 other_ix ? "floating" : "anchored",
1325                 quoted, RE_SV_TAIL(must));
1326         });
1327
1328
1329         if (!s) {
1330             /* last1 is latest possible substr location. If we didn't
1331              * find it before there, we never will */
1332             if (last >= last1) {
1333                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1334                                         "; giving up...\n"));
1335                 goto fail_finish;
1336             }
1337
1338             /* try to find the check substr again at a later
1339              * position. Maybe next time we'll find the "other" substr
1340              * in range too */
1341             other_last = HOP3c(last, 1, strend) /* highest failure */;
1342             rx_origin =
1343                 other_ix /* i.e. if other-is-float */
1344                     ? HOP3c(rx_origin, 1, strend)
1345                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1346             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1347                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1348                 (other_ix ? "floating" : "anchored"),
1349                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1350                 (IV)(rx_origin - strbeg)
1351             ));
1352             goto restart;
1353         }
1354         else {
1355             if (other_ix) { /* if (other-is-float) */
1356                 /* other_last is set to s, not s+1, since its possible for
1357                  * a floating substr to fail first time, then succeed
1358                  * second time at the same floating position; e.g.:
1359                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1360                  * The first time round, anchored and float match at
1361                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1362                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1363                  */
1364                 other_last = s;
1365             }
1366             else {
1367                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1368                 other_last = HOP3c(s, 1, strend);
1369             }
1370             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1371                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1372                   (long)(s - strbeg),
1373                 (IV)(rx_origin - strbeg)
1374               ));
1375
1376         }
1377     }
1378     else {
1379         DEBUG_OPTIMISE_MORE_r(
1380             Perl_re_printf( aTHX_
1381                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1382                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1383                 " strend:%" IVdf "\n",
1384                 (IV)prog->check_offset_min,
1385                 (IV)prog->check_offset_max,
1386                 (IV)(check_at-strbeg),
1387                 (IV)(rx_origin-strbeg),
1388                 (IV)(rx_origin-check_at),
1389                 (IV)(strend-strbeg)
1390             )
1391         );
1392     }
1393
1394   postprocess_substr_matches:
1395
1396     /* handle the extra constraint of /^.../m if present */
1397
1398     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1399         char *s;
1400
1401         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1402                         "  looking for /^/m anchor"));
1403
1404         /* we have failed the constraint of a \n before rx_origin.
1405          * Find the next \n, if any, even if it's beyond the current
1406          * anchored and/or floating substrings. Whether we should be
1407          * scanning ahead for the next \n or the next substr is debatable.
1408          * On the one hand you'd expect rare substrings to appear less
1409          * often than \n's. On the other hand, searching for \n means
1410          * we're effectively flipping between check_substr and "\n" on each
1411          * iteration as the current "rarest" candidate string, which
1412          * means for example that we'll quickly reject the whole string if
1413          * hasn't got a \n, rather than trying every substr position
1414          * first
1415          */
1416
1417         s = HOP3c(strend, - prog->minlen, strpos);
1418         if (s <= rx_origin ||
1419             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1420         {
1421             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422                             "  Did not find /%s^%s/m...\n",
1423                             PL_colors[0], PL_colors[1]));
1424             goto fail_finish;
1425         }
1426
1427         /* earliest possible origin is 1 char after the \n.
1428          * (since *rx_origin == '\n', it's safe to ++ here rather than
1429          * HOP(rx_origin, 1)) */
1430         rx_origin++;
1431
1432         if (prog->substrs->check_ix == 0  /* check is anchored */
1433             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1434         {
1435             /* Position contradicts check-string; either because
1436              * check was anchored (and thus has no wiggle room),
1437              * or check was float and rx_origin is above the float range */
1438             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1440                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1441             goto restart;
1442         }
1443
1444         /* if we get here, the check substr must have been float,
1445          * is in range, and we may or may not have had an anchored
1446          * "other" substr which still contradicts */
1447         assert(prog->substrs->check_ix); /* check is float */
1448
1449         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1450             /* whoops, the anchored "other" substr exists, so we still
1451              * contradict. On the other hand, the float "check" substr
1452              * didn't contradict, so just retry the anchored "other"
1453              * substr */
1454             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1455                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1456                 PL_colors[0], PL_colors[1],
1457                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1458                 (IV)(rx_origin - strbeg)
1459             ));
1460             goto do_other_substr;
1461         }
1462
1463         /* success: we don't contradict the found floating substring
1464          * (and there's no anchored substr). */
1465         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1466             "  Found /%s^%s/m with rx_origin %ld...\n",
1467             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1468     }
1469     else {
1470         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1471             "  (multiline anchor test skipped)\n"));
1472     }
1473
1474   success_at_start:
1475
1476
1477     /* if we have a starting character class, then test that extra constraint.
1478      * (trie stclasses are too expensive to use here, we are better off to
1479      * leave it to regmatch itself) */
1480
1481     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1482         const U8* const str = (U8*)STRING(progi->regstclass);
1483
1484         /* XXX this value could be pre-computed */
1485         const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1486                     ?  (reginfo->is_utf8_pat
1487                         ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1488                         : (SSize_t)STR_LEN(progi->regstclass))
1489                     : 1);
1490         char * endpos;
1491         char *s;
1492         /* latest pos that a matching float substr constrains rx start to */
1493         char *rx_max_float = NULL;
1494
1495         /* if the current rx_origin is anchored, either by satisfying an
1496          * anchored substring constraint, or a /^.../m constraint, then we
1497          * can reject the current origin if the start class isn't found
1498          * at the current position. If we have a float-only match, then
1499          * rx_origin is constrained to a range; so look for the start class
1500          * in that range. if neither, then look for the start class in the
1501          * whole rest of the string */
1502
1503         /* XXX DAPM it's not clear what the minlen test is for, and why
1504          * it's not used in the floating case. Nothing in the test suite
1505          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1506          * Here are some old comments, which may or may not be correct:
1507          *
1508          *   minlen == 0 is possible if regstclass is \b or \B,
1509          *   and the fixed substr is ''$.
1510          *   Since minlen is already taken into account, rx_origin+1 is
1511          *   before strend; accidentally, minlen >= 1 guaranties no false
1512          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1513          *   0) below assumes that regstclass does not come from lookahead...
1514          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1515          *   This leaves EXACTF-ish only, which are dealt with in
1516          *   find_byclass().
1517          */
1518
1519         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1520             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1521         else if (prog->float_substr || prog->float_utf8) {
1522             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1523             endpos = HOP3clim(rx_max_float, cl_l, strend);
1524         }
1525         else
1526             endpos= strend;
1527
1528         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1529             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1530             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1531               (IV)start_shift, (IV)(check_at - strbeg),
1532               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1533
1534         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1535                             reginfo);
1536         if (!s) {
1537             if (endpos == strend) {
1538                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1539                                 "  Could not match STCLASS...\n") );
1540                 goto fail;
1541             }
1542             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1543                                "  This position contradicts STCLASS...\n") );
1544             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1545                         && !(prog->intflags & PREGf_IMPLICIT))
1546                 goto fail;
1547
1548             /* Contradict one of substrings */
1549             if (prog->anchored_substr || prog->anchored_utf8) {
1550                 if (prog->substrs->check_ix == 1) { /* check is float */
1551                     /* Have both, check_string is floating */
1552                     assert(rx_origin + start_shift <= check_at);
1553                     if (rx_origin + start_shift != check_at) {
1554                         /* not at latest position float substr could match:
1555                          * Recheck anchored substring, but not floating.
1556                          * The condition above is in bytes rather than
1557                          * chars for efficiency. It's conservative, in
1558                          * that it errs on the side of doing 'goto
1559                          * do_other_substr'. In this case, at worst,
1560                          * an extra anchored search may get done, but in
1561                          * practice the extra fbm_instr() is likely to
1562                          * get skipped anyway. */
1563                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1564                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1565                             (long)(other_last - strbeg),
1566                             (IV)(rx_origin - strbeg)
1567                         ));
1568                         goto do_other_substr;
1569                     }
1570                 }
1571             }
1572             else {
1573                 /* float-only */
1574
1575                 if (ml_anch) {
1576                     /* In the presence of ml_anch, we might be able to
1577                      * find another \n without breaking the current float
1578                      * constraint. */
1579
1580                     /* strictly speaking this should be HOP3c(..., 1, ...),
1581                      * but since we goto a block of code that's going to
1582                      * search for the next \n if any, its safe here */
1583                     rx_origin++;
1584                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1585                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1586                               PL_colors[0], PL_colors[1],
1587                               (long)(rx_origin - strbeg)) );
1588                     goto postprocess_substr_matches;
1589                 }
1590
1591                 /* strictly speaking this can never be true; but might
1592                  * be if we ever allow intuit without substrings */
1593                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1594                     goto fail;
1595
1596                 rx_origin = rx_max_float;
1597             }
1598
1599             /* at this point, any matching substrings have been
1600              * contradicted. Start again... */
1601
1602             rx_origin = HOP3c(rx_origin, 1, strend);
1603
1604             /* uses bytes rather than char calculations for efficiency.
1605              * It's conservative: it errs on the side of doing 'goto restart',
1606              * where there is code that does a proper char-based test */
1607             if (rx_origin + start_shift + end_shift > strend) {
1608                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1609                                        "  Could not match STCLASS...\n") );
1610                 goto fail;
1611             }
1612             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1613                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1614                 (prog->substrs->check_ix ? "floating" : "anchored"),
1615                 (long)(rx_origin + start_shift - strbeg),
1616                 (IV)(rx_origin - strbeg)
1617             ));
1618             goto restart;
1619         }
1620
1621         /* Success !!! */
1622
1623         if (rx_origin != s) {
1624             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1625                         "  By STCLASS: moving %ld --> %ld\n",
1626                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1627                    );
1628         }
1629         else {
1630             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1631                                   "  Does not contradict STCLASS...\n");
1632                    );
1633         }
1634     }
1635
1636     /* Decide whether using the substrings helped */
1637
1638     if (rx_origin != strpos) {
1639         /* Fixed substring is found far enough so that the match
1640            cannot start at strpos. */
1641
1642         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1643         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1644     }
1645     else {
1646         /* The found rx_origin position does not prohibit matching at
1647          * strpos, so calling intuit didn't gain us anything. Decrement
1648          * the BmUSEFUL() count on the check substring, and if we reach
1649          * zero, free it.  */
1650         if (!(prog->intflags & PREGf_NAUGHTY)
1651             && (utf8_target ? (
1652                 prog->check_utf8                /* Could be deleted already */
1653                 && --BmUSEFUL(prog->check_utf8) < 0
1654                 && (prog->check_utf8 == prog->float_utf8)
1655             ) : (
1656                 prog->check_substr              /* Could be deleted already */
1657                 && --BmUSEFUL(prog->check_substr) < 0
1658                 && (prog->check_substr == prog->float_substr)
1659             )))
1660         {
1661             /* If flags & SOMETHING - do not do it many times on the same match */
1662             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1663             /* XXX Does the destruction order has to change with utf8_target? */
1664             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1665             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1666             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1667             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1668             check = NULL;                       /* abort */
1669             /* XXXX This is a remnant of the old implementation.  It
1670                     looks wasteful, since now INTUIT can use many
1671                     other heuristics. */
1672             prog->extflags &= ~RXf_USE_INTUIT;
1673         }
1674     }
1675
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1677             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1678              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1679
1680     return rx_origin;
1681
1682   fail_finish:                          /* Substring not found */
1683     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1684         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1685   fail:
1686     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1687                           PL_colors[4], PL_colors[5]));
1688     return NULL;
1689 }
1690
1691
1692 #define DECL_TRIE_TYPE(scan) \
1693     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1694                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1695                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1696                     trie_type = ((scan->flags == EXACT)                             \
1697                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1698                                  : (scan->flags == EXACTL)                          \
1699                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1700                                     : (scan->flags == EXACTFAA)                     \
1701                                       ? (utf8_target                                \
1702                                          ? trie_utf8_exactfa_fold                   \
1703                                          : trie_latin_utf8_exactfa_fold)            \
1704                                       : (scan->flags == EXACTFLU8                   \
1705                                          ? (utf8_target                             \
1706                                            ? trie_flu8                              \
1707                                            : trie_flu8_latin)                       \
1708                                          : (utf8_target                             \
1709                                            ? trie_utf8_fold                         \
1710                                            : trie_latin_utf8_fold)))
1711
1712 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1713  * 'foldbuf+sizeof(foldbuf)' */
1714 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1715 STMT_START {                                                                        \
1716     STRLEN skiplen;                                                                 \
1717     U8 flags = FOLD_FLAGS_FULL;                                                     \
1718     switch (trie_type) {                                                            \
1719     case trie_flu8:                                                                 \
1720         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1721         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1722             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1723         }                                                                           \
1724         goto do_trie_utf8_fold;                                                     \
1725     case trie_utf8_exactfa_fold:                                                    \
1726         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1727         /* FALLTHROUGH */                                                           \
1728     case trie_utf8_fold:                                                            \
1729       do_trie_utf8_fold:                                                            \
1730         if ( foldlen>0 ) {                                                          \
1731             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1732             foldlen -= len;                                                         \
1733             uscan += len;                                                           \
1734             len=0;                                                                  \
1735         } else {                                                                    \
1736             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1737                                                                             flags); \
1738             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1739             skiplen = UVCHR_SKIP( uvc );                                            \
1740             foldlen -= skiplen;                                                     \
1741             uscan = foldbuf + skiplen;                                              \
1742         }                                                                           \
1743         break;                                                                      \
1744     case trie_flu8_latin:                                                           \
1745         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1746         goto do_trie_latin_utf8_fold;                                               \
1747     case trie_latin_utf8_exactfa_fold:                                              \
1748         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1749         /* FALLTHROUGH */                                                           \
1750     case trie_latin_utf8_fold:                                                      \
1751       do_trie_latin_utf8_fold:                                                      \
1752         if ( foldlen>0 ) {                                                          \
1753             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1754             foldlen -= len;                                                         \
1755             uscan += len;                                                           \
1756             len=0;                                                                  \
1757         } else {                                                                    \
1758             len = 1;                                                                \
1759             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1760             skiplen = UVCHR_SKIP( uvc );                                            \
1761             foldlen -= skiplen;                                                     \
1762             uscan = foldbuf + skiplen;                                              \
1763         }                                                                           \
1764         break;                                                                      \
1765     case trie_utf8l:                                                                \
1766         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1767         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1768             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1769         }                                                                           \
1770         /* FALLTHROUGH */                                                           \
1771     case trie_utf8:                                                                 \
1772         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1773         break;                                                                      \
1774     case trie_plain:                                                                \
1775         uvc = (UV)*uc;                                                              \
1776         len = 1;                                                                    \
1777     }                                                                               \
1778     if (uvc < 256) {                                                                \
1779         charid = trie->charmap[ uvc ];                                              \
1780     }                                                                               \
1781     else {                                                                          \
1782         charid = 0;                                                                 \
1783         if (widecharmap) {                                                          \
1784             SV** const svpp = hv_fetch(widecharmap,                                 \
1785                         (char*)&uvc, sizeof(UV), 0);                                \
1786             if (svpp)                                                               \
1787                 charid = (U16)SvIV(*svpp);                                          \
1788         }                                                                           \
1789     }                                                                               \
1790 } STMT_END
1791
1792 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1793     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1794                 startpos, doutf8, depth)
1795
1796 #define REXEC_FBC_UTF8_SCAN(CODE)                           \
1797     STMT_START {                                            \
1798         while (s < strend) {                                \
1799             CODE                                            \
1800             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1801         }                                                   \
1802     } STMT_END
1803
1804 #define REXEC_FBC_NON_UTF8_SCAN(CODE)                       \
1805     STMT_START {                                            \
1806         while (s < strend) {                                \
1807             CODE                                            \
1808             s++;                                            \
1809         }                                                   \
1810     } STMT_END
1811
1812 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                     \
1813     STMT_START {                                            \
1814         while (s < strend) {                                \
1815             REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)            \
1816         }                                                   \
1817     } STMT_END
1818
1819 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND)                 \
1820     STMT_START {                                            \
1821         while (s < strend) {                                \
1822             REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)        \
1823         }                                                   \
1824     } STMT_END
1825
1826 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)                   \
1827     if (COND) {                                                \
1828         FBC_CHECK_AND_TRY                                      \
1829         s += UTF8_SAFE_SKIP(s, reginfo->strend);               \
1830         previous_occurrence_end = s;                           \
1831     }                                                          \
1832     else {                                                     \
1833         s += UTF8SKIP(s);                                      \
1834     }
1835
1836 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)               \
1837     if (COND) {                                                \
1838         FBC_CHECK_AND_TRY                                      \
1839         s++;                                                   \
1840         previous_occurrence_end = s;                           \
1841     }                                                          \
1842     else {                                                     \
1843         s++;                                                   \
1844     }
1845
1846 /* We keep track of where the next character should start after an occurrence
1847  * of the one we're looking for.  Knowing that, we can see right away if the
1848  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1849  * don't accept the 2nd and succeeding adjacent occurrences */
1850 #define FBC_CHECK_AND_TRY                                           \
1851         if (   (   doevery                                          \
1852                 || s != previous_occurrence_end)                    \
1853             && (   reginfo->intuit                                  \
1854                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1855         {                                                           \
1856             goto got_it;                                            \
1857         }
1858
1859
1860 /* These differ from the above macros in that they call a function which
1861  * returns the next occurrence of the thing being looked for in 's'; and
1862  * 'strend' if there is no such occurrence. */
1863 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f)                    \
1864     while (s < strend) {                                    \
1865         s = (char *) (f);                                   \
1866         if (s >= strend) {                                  \
1867             break;                                          \
1868         }                                                   \
1869                                                             \
1870         FBC_CHECK_AND_TRY                                   \
1871         s += UTF8SKIP(s);                                   \
1872         previous_occurrence_end = s;                        \
1873     }
1874
1875 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f)                \
1876     while (s < strend) {                                    \
1877         s = (char *) (f);                                   \
1878         if (s >= strend) {                                  \
1879             break;                                          \
1880         }                                                   \
1881                                                             \
1882         FBC_CHECK_AND_TRY                                   \
1883         s++;                                                \
1884         previous_occurrence_end = s;                        \
1885     }
1886
1887 /* This differs from the above macros in that it is passed a single byte that
1888  * is known to begin the next occurrence of the thing being looked for in 's'.
1889  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1890  * at that position. */
1891 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
1892     while (s < strend) {                                    \
1893         s = (char *) memchr(s, byte, strend -s);            \
1894         if (s == NULL) {                                    \
1895             s = (char *) strend;                            \
1896             break;                                          \
1897         }                                                   \
1898                                                             \
1899         if (COND) {                                         \
1900             FBC_CHECK_AND_TRY                               \
1901             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1902             previous_occurrence_end = s;                    \
1903         }                                                   \
1904         else {                                              \
1905             s += UTF8SKIP(s);                               \
1906         }                                                   \
1907     }
1908
1909 /* The four macros below are slightly different versions of the same logic.
1910  *
1911  * The first is for /a and /aa when the target string is UTF-8.  This can only
1912  * match ascii, but it must advance based on UTF-8.   The other three handle
1913  * the non-UTF-8 and the more generic UTF-8 cases.   In all four, we are
1914  * looking for the boundary (or non-boundary) between a word and non-word
1915  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1916  * must be different.  Find the "wordness" of the character just prior to this
1917  * one, and compare it with the wordness of this one.  If they differ, we have
1918  * a boundary.  At the beginning of the string, pretend that the previous
1919  * character was a new-line.
1920  *
1921  * All these macros uncleanly have side-effects with each other and outside
1922  * variables.  So far it's been too much trouble to clean-up
1923  *
1924  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1925  *               a word character or not.
1926  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1927  *               word/non-word
1928  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1929  *
1930  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1931  * are looking for a boundary or for a non-boundary.  If we are looking for a
1932  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1933  * see if this tentative match actually works, and if so, to quit the loop
1934  * here.  And vice-versa if we are looking for a non-boundary.
1935  *
1936  * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1937  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1938  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1939  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1940  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1941  * complement.  But in that branch we complement tmp, meaning that at the
1942  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1943  * which means at the top of the loop in the next iteration, it is
1944  * TEST_NON_UTF8(s-1) */
1945 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1946     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1947     tmp = TEST_NON_UTF8(tmp);                                                  \
1948     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1949         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1950             tmp = !tmp;                                                        \
1951             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1952         }                                                                      \
1953         else {                                                                 \
1954             IF_FAIL;                                                           \
1955         }                                                                      \
1956     );                                                                         \
1957
1958 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1959  * TEST_UTF8 is a macro that for the same input code points returns identically
1960  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1961  * end pointer as well) */
1962 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1963     if (s == reginfo->strbeg) {                                                \
1964         tmp = '\n';                                                            \
1965     }                                                                          \
1966     else { /* Back-up to the start of the previous character */                \
1967         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1968         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1969                                                        0, UTF8_ALLOW_DEFAULT); \
1970     }                                                                          \
1971     tmp = TEST_UV(tmp);                                                        \
1972     REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */                      \
1973         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1974             tmp = !tmp;                                                        \
1975             IF_SUCCESS;                                                        \
1976         }                                                                      \
1977         else {                                                                 \
1978             IF_FAIL;                                                           \
1979         }                                                                      \
1980     );
1981
1982 /* Like the above two macros, for a UTF-8 target string.  UTF8_CODE is the
1983  * complete code for handling UTF-8.  Common to the BOUND and NBOUND cases,
1984  * set-up by the FBC_BOUND, etc macros below */
1985 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)   \
1986     UTF8_CODE;                                                                 \
1987     /* Here, things have been set up by the previous code so that tmp is the   \
1988      * return of TEST_NON_UTF8(s-1).  We also have to check if this matches    \
1989      * against the EOS, which we treat as a \n */                              \
1990     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1991         IF_SUCCESS;                                                            \
1992     }                                                                          \
1993     else {                                                                     \
1994         IF_FAIL;                                                               \
1995     }
1996
1997 /* Same as the macro above, but the target isn't UTF-8 */
1998 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)       \
1999     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                   \
2000     tmp = TEST_NON_UTF8(tmp);                                               \
2001     REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */               \
2002         if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) {                           \
2003             IF_SUCCESS;                                                     \
2004             tmp = !tmp;                                                     \
2005         }                                                                   \
2006         else {                                                              \
2007             IF_FAIL;                                                        \
2008         }                                                                   \
2009     );                                                                      \
2010     /* Here, things have been set up by the previous code so that tmp is    \
2011      * the return of TEST_NON_UTF8(s-1).   We also have to check if this    \
2012      * matches against the EOS, which we treat as a \n */                   \
2013     if (tmp == ! TEST_NON_UTF8('\n')) {                                     \
2014         IF_SUCCESS;                                                         \
2015     }                                                                       \
2016     else {                                                                  \
2017         IF_FAIL;                                                            \
2018     }
2019
2020 /* This is the macro to use when we want to see if something that looks like it
2021  * could match, actually does, and if so exits the loop.  It needs to be used
2022  * only for bounds checking macros, as it allows for matching beyond the end of
2023  * string (which should be zero length without having to look at the string
2024  * contents) */
2025 #define REXEC_FBC_TRYIT                                                     \
2026     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
2027         goto got_it
2028
2029 /* The only difference between the BOUND and NBOUND cases is that
2030  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2031  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2032  * with the other one being empty (PLACEHOLDER is defined as empty).
2033  *
2034  * The TEST_FOO parameters are for operating on different forms of input, but
2035  * all should be ones that return identically for the same underlying code
2036  * points */
2037
2038 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                   \
2039     FBC_BOUND_COMMON_UTF8(                                                  \
2040           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),       \
2041           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2042
2043 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8)                                   \
2044     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2045
2046 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8)                                     \
2047     FBC_BOUND_COMMON_UTF8(                                                  \
2048                     FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2049                     TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2050
2051 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8)                                 \
2052     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2053
2054 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                  \
2055     FBC_BOUND_COMMON_UTF8(                                                  \
2056               FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),   \
2057               TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2058
2059 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8)                                  \
2060     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2061
2062 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8)                                    \
2063     FBC_BOUND_COMMON_UTF8(                                                  \
2064             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),        \
2065             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2066
2067 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8)                                \
2068     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2069
2070 #ifdef DEBUGGING
2071 static IV
2072 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2073   IV cp_out = _invlist_search(invlist, cp_in);
2074   assert(cp_out >= 0);
2075   return cp_out;
2076 }
2077 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2078         invmap[S_get_break_val_cp_checked(invlist, cp)]
2079 #else
2080 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2081         invmap[_invlist_search(invlist, cp)]
2082 #endif
2083
2084 /* Takes a pointer to an inversion list, a pointer to its corresponding
2085  * inversion map, and a code point, and returns the code point's value
2086  * according to the two arrays.  It assumes that all code points have a value.
2087  * This is used as the base macro for macros for particular properties */
2088 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2089         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2090
2091 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2092  * of a code point, returning the value for the first code point in the string.
2093  * And it takes the particular macro name that finds the desired value given a
2094  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2095 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2096              (__ASSERT_(pos < strend)                                          \
2097                  /* Note assumes is valid UTF-8 */                             \
2098              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2099
2100 /* Returns the GCB value for the input code point */
2101 #define getGCB_VAL_CP(cp)                                                      \
2102           _generic_GET_BREAK_VAL_CP(                                           \
2103                                     PL_GCB_invlist,                            \
2104                                     _Perl_GCB_invmap,                          \
2105                                     (cp))
2106
2107 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2108  * bounded by pos and strend */
2109 #define getGCB_VAL_UTF8(pos, strend)                                           \
2110     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2111
2112 /* Returns the LB value for the input code point */
2113 #define getLB_VAL_CP(cp)                                                       \
2114           _generic_GET_BREAK_VAL_CP(                                           \
2115                                     PL_LB_invlist,                             \
2116                                     _Perl_LB_invmap,                           \
2117                                     (cp))
2118
2119 /* Returns the LB value for the first code point in the UTF-8 encoded string
2120  * bounded by pos and strend */
2121 #define getLB_VAL_UTF8(pos, strend)                                            \
2122     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2123
2124
2125 /* Returns the SB value for the input code point */
2126 #define getSB_VAL_CP(cp)                                                       \
2127           _generic_GET_BREAK_VAL_CP(                                           \
2128                                     PL_SB_invlist,                             \
2129                                     _Perl_SB_invmap,                     \
2130                                     (cp))
2131
2132 /* Returns the SB value for the first code point in the UTF-8 encoded string
2133  * bounded by pos and strend */
2134 #define getSB_VAL_UTF8(pos, strend)                                            \
2135     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2136
2137 /* Returns the WB value for the input code point */
2138 #define getWB_VAL_CP(cp)                                                       \
2139           _generic_GET_BREAK_VAL_CP(                                           \
2140                                     PL_WB_invlist,                             \
2141                                     _Perl_WB_invmap,                         \
2142                                     (cp))
2143
2144 /* Returns the WB value for the first code point in the UTF-8 encoded string
2145  * bounded by pos and strend */
2146 #define getWB_VAL_UTF8(pos, strend)                                            \
2147     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2148
2149 /* We know what class REx starts with.  Try to find this position... */
2150 /* if reginfo->intuit, its a dryrun */
2151 /* annoyingly all the vars in this routine have different names from their counterparts
2152    in regmatch. /grrr */
2153 STATIC char *
2154 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2155     const char *strend, regmatch_info *reginfo)
2156 {
2157
2158     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2159     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2160
2161     char *pat_string;   /* The pattern's exactish string */
2162     char *pat_end;          /* ptr to end char of pat_string */
2163     re_fold_t folder;   /* Function for computing non-utf8 folds */
2164     const U8 *fold_array;   /* array for folding ords < 256 */
2165     STRLEN ln;
2166     STRLEN lnc;
2167     U8 c1;
2168     U8 c2;
2169     char *e = NULL;
2170
2171     /* In some cases we accept only the first occurence of 'x' in a sequence of
2172      * them.  This variable points to just beyond the end of the previous
2173      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2174      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2175      * hop back.) */
2176     char * previous_occurrence_end = 0;
2177
2178     I32 tmp;            /* Scratch variable */
2179     const bool utf8_target = reginfo->is_utf8_target;
2180     UV utf8_fold_flags = 0;
2181     const bool is_utf8_pat = reginfo->is_utf8_pat;
2182     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2183                                    with a result inverts that result, as 0^1 =
2184                                    1 and 1^1 = 0 */
2185     _char_class_number classnum;
2186
2187     RXi_GET_DECL(prog,progi);
2188
2189     PERL_ARGS_ASSERT_FIND_BYCLASS;
2190
2191     /* We know what class it must start with. The case statements below have
2192      * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2193      * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2194      * ('p8' and 'pb'. */
2195     switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2196
2197       case ANYOFPOSIXL_t8_pb:
2198       case ANYOFPOSIXL_t8_p8:
2199       case ANYOFL_t8_pb:
2200       case ANYOFL_t8_p8:
2201         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2202         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2203
2204         /* FALLTHROUGH */
2205
2206       case ANYOFD_t8_pb:
2207       case ANYOFD_t8_p8:
2208       case ANYOF_t8_pb:
2209       case ANYOF_t8_p8:
2210         REXEC_FBC_UTF8_CLASS_SCAN(
2211                 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2212         break;
2213
2214       case ANYOFPOSIXL_tb_pb:
2215       case ANYOFPOSIXL_tb_p8:
2216       case ANYOFL_tb_pb:
2217       case ANYOFL_tb_p8:
2218         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2219         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2220
2221         /* FALLTHROUGH */
2222
2223       case ANYOFD_tb_pb:
2224       case ANYOFD_tb_p8:
2225       case ANYOF_tb_pb:
2226       case ANYOF_tb_p8:
2227         if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2228             /* We know that s is in the bitmap range since the target isn't
2229              * UTF-8, so what happens for out-of-range values is not relevant,
2230              * so exclude that from the flags */
2231             REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2232                                                      0));
2233         }
2234         else {
2235             REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2236         }
2237         break;
2238
2239       case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2240       case ANYOFM_tb_p8:
2241         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2242              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2243         break;
2244
2245       case ANYOFM_t8_pb:
2246       case ANYOFM_t8_p8:
2247         /* UTF-8ness doesn't matter because only matches UTF-8 invariants.  But
2248          * we do anyway for performance reasons, as otherwise we would have to
2249          * examine all the continuation characters */
2250         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2251              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2252         break;
2253
2254       case NANYOFM_tb_pb:
2255       case NANYOFM_tb_p8:
2256         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2257            find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2258         break;
2259
2260       case NANYOFM_t8_pb:
2261       case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2262                                   variants. */
2263         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2264                         (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2265                                                     (U8) ARG(c), FLAGS(c)));
2266         break;
2267
2268       /* These nodes all require at least one code point to be in UTF-8 to
2269        * match */
2270       case ANYOFH_tb_pb:
2271       case ANYOFH_tb_p8:
2272       case ANYOFHb_tb_pb:
2273       case ANYOFHb_tb_p8:
2274       case ANYOFHr_tb_pb:
2275       case ANYOFHr_tb_p8:
2276       case ANYOFHs_tb_pb:
2277       case ANYOFHs_tb_p8:
2278       case EXACTFLU8_tb_pb:
2279       case EXACTFLU8_tb_p8:
2280       case EXACTFU_REQ8_tb_pb:
2281       case EXACTFU_REQ8_tb_p8:
2282         break;
2283
2284       case ANYOFH_t8_pb:
2285       case ANYOFH_t8_p8:
2286         REXEC_FBC_UTF8_CLASS_SCAN(
2287               (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2288                && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2289         break;
2290
2291       case ANYOFHb_t8_pb:
2292       case ANYOFHb_t8_p8:
2293         {
2294             /* We know what the first byte of any matched string should be. */
2295             U8 first_byte = FLAGS(c);
2296
2297             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2298                     reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2299         }
2300         break;
2301
2302       case ANYOFHr_t8_pb:
2303       case ANYOFHr_t8_p8:
2304         REXEC_FBC_UTF8_CLASS_SCAN(
2305                     (   inRANGE(NATIVE_UTF8_TO_I8(*s),
2306                                 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2307                                 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2308                     && reginclass(prog, c, (U8*)s, (U8*) strend,
2309                                                            1 /* is utf8 */)));
2310         break;
2311
2312       case ANYOFHs_t8_pb:
2313       case ANYOFHs_t8_p8:
2314         REXEC_FBC_UTF8_CLASS_SCAN(
2315                 (   strend -s >= FLAGS(c)
2316                 && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
2317                 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2318         break;
2319
2320       case ANYOFR_tb_pb:
2321       case ANYOFR_tb_p8:
2322         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2323                                             ANYOFRbase(c), ANYOFRdelta(c)));
2324         break;
2325
2326       case ANYOFR_t8_pb:
2327       case ANYOFR_t8_p8:
2328         REXEC_FBC_UTF8_CLASS_SCAN(
2329                             (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2330                              && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2331                                                               (U8 *) strend,
2332                                                               NULL),
2333                                             ANYOFRbase(c), ANYOFRdelta(c))));
2334         break;
2335
2336       case ANYOFRb_tb_pb:
2337       case ANYOFRb_tb_p8:
2338         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2339                                             ANYOFRbase(c), ANYOFRdelta(c)));
2340         break;
2341
2342       case ANYOFRb_t8_pb:
2343       case ANYOFRb_t8_p8:
2344         {   /* We know what the first byte of any matched string should be */
2345             U8 first_byte = FLAGS(c);
2346
2347             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2348                                 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2349                                                               (U8 *) strend,
2350                                                               NULL),
2351                                             ANYOFRbase(c), ANYOFRdelta(c)));
2352         }
2353         break;
2354
2355       case EXACTFAA_tb_pb:
2356
2357         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2358          * which these functions don't handle anyway */
2359         fold_array = PL_fold_latin1;
2360         folder = foldEQ_latin1_s2_folded;
2361         goto do_exactf_non_utf8;
2362
2363       case EXACTF_tb_pb:
2364         fold_array = PL_fold;
2365         folder = foldEQ;
2366         goto do_exactf_non_utf8;
2367
2368       case EXACTFL_tb_pb:
2369         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2370
2371         if (IN_UTF8_CTYPE_LOCALE) {
2372             utf8_fold_flags = FOLDEQ_LOCALE;
2373             goto do_exactf_utf8;
2374         }
2375
2376         fold_array = PL_fold_locale;
2377         folder = foldEQ_locale;
2378         goto do_exactf_non_utf8;
2379
2380       case EXACTFU_tb_pb:
2381         /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2382          * don't have to worry here about this single special case in the
2383          * Latin1 range */
2384         fold_array = PL_fold_latin1;
2385         folder = foldEQ_latin1_s2_folded;
2386
2387         /* FALLTHROUGH */
2388
2389        do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2390                               are no glitches with fold-length differences
2391                               between the target string and pattern */
2392
2393         /* The idea in the non-utf8 EXACTF* cases is to first find the first
2394          * character of the EXACTF* node and then, if necessary,
2395          * case-insensitively compare the full text of the node.  c1 is the
2396          * first character.  c2 is its fold.  This logic will not work for
2397          * Unicode semantics and the german sharp ss, which hence should not be
2398          * compiled into a node that gets here. */
2399         pat_string = STRINGs(c);
2400         ln  = STR_LENs(c);      /* length to match in octets/bytes */
2401
2402         /* We know that we have to match at least 'ln' bytes (which is the same
2403          * as characters, since not utf8).  If we have to match 3 characters,
2404          * and there are only 2 availabe, we know without trying that it will
2405          * fail; so don't start a match past the required minimum number from
2406          * the far end */
2407         e = HOP3c(strend, -((SSize_t)ln), s);
2408         if (e < s)
2409             break;
2410
2411         c1 = *pat_string;
2412         c2 = fold_array[c1];
2413         if (c1 == c2) { /* If char and fold are the same */
2414             while (s <= e) {
2415                 s = (char *) memchr(s, c1, e + 1 - s);
2416                 if (s == NULL) {
2417                     break;
2418                 }
2419
2420                 /* Check that the rest of the node matches */
2421                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2422                     && (reginfo->intuit || regtry(reginfo, &s)) )
2423                 {
2424                     goto got_it;
2425                 }
2426                 s++;
2427             }
2428         }
2429         else {
2430             U8 bits_differing = c1 ^ c2;
2431
2432             /* If the folds differ in one bit position only, we can mask to
2433              * match either of them, and can use this faster find method.  Both
2434              * ASCII and EBCDIC tend to have their case folds differ in only
2435              * one position, so this is very likely */
2436             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2437                 bits_differing = ~ bits_differing;
2438                 while (s <= e) {
2439                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2440                                         (c1 & bits_differing), bits_differing);
2441                     if (s > e) {
2442                         break;
2443                     }
2444
2445                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2446                         && (reginfo->intuit || regtry(reginfo, &s)) )
2447                     {
2448                         goto got_it;
2449                     }
2450                     s++;
2451                 }
2452             }
2453             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2454                        should actually happen only in EXACTFL nodes */
2455                 while (s <= e) {
2456                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2457                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2458                         && (reginfo->intuit || regtry(reginfo, &s)) )
2459                     {
2460                         goto got_it;
2461                     }
2462                     s++;
2463                 }
2464             }
2465         }
2466         break;
2467
2468       case EXACTFAA_tb_p8:
2469       case EXACTFAA_t8_p8:
2470         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2471                          |FOLDEQ_S2_ALREADY_FOLDED
2472                          |FOLDEQ_S2_FOLDS_SANE;
2473         goto do_exactf_utf8;
2474
2475       case EXACTFAA_NO_TRIE_tb_pb:
2476       case EXACTFAA_NO_TRIE_t8_pb:
2477       case EXACTFAA_t8_pb:
2478
2479         /* Here, and elsewhere in this file, the reason we can't consider a
2480          * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2481          * is because any MICRO SIGN in the pattern won't be folded.  Since the
2482          * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2483          * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2484         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2485         goto do_exactf_utf8;
2486
2487       case EXACTFL_tb_p8:
2488       case EXACTFL_t8_pb:
2489       case EXACTFL_t8_p8:
2490         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2491         utf8_fold_flags = FOLDEQ_LOCALE;
2492         goto do_exactf_utf8;
2493
2494       case EXACTFLU8_t8_pb:
2495       case EXACTFLU8_t8_p8:
2496         utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2497                                          | FOLDEQ_S2_FOLDS_SANE;
2498         goto do_exactf_utf8;
2499
2500       case EXACTFU_REQ8_t8_p8:
2501         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2502         goto do_exactf_utf8;
2503
2504       case EXACTFU_tb_p8:
2505       case EXACTFU_t8_pb:
2506       case EXACTFU_t8_p8:
2507         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2508         goto do_exactf_utf8;
2509
2510       /* The following are problematic even though pattern isn't UTF-8.  Use
2511        * full functionality normally not done except for UTF-8. */
2512       case EXACTF_t8_pb:
2513       case EXACTFUP_tb_pb:
2514       case EXACTFUP_t8_pb:
2515
2516        do_exactf_utf8:
2517         {
2518             unsigned expansion;
2519
2520             /* If one of the operands is in utf8, we can't use the simpler
2521              * folding above, due to the fact that many different characters
2522              * can have the same fold, or portion of a fold, or different-
2523              * length fold */
2524             pat_string = STRINGs(c);
2525             ln  = STR_LENs(c);  /* length to match in octets/bytes */
2526             pat_end = pat_string + ln;
2527             lnc = is_utf8_pat       /* length to match in characters */
2528                   ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2529                   : ln;
2530
2531             /* We have 'lnc' characters to match in the pattern, but because of
2532              * multi-character folding, each character in the target can match
2533              * up to 3 characters (Unicode guarantees it will never exceed
2534              * this) if it is utf8-encoded; and up to 2 if not (based on the
2535              * fact that the Latin 1 folds are already determined, and the only
2536              * multi-char fold in that range is the sharp-s folding to 'ss'.
2537              * Thus, a pattern character can match as little as 1/3 of a string
2538              * character.  Adjust lnc accordingly, rounding up, so that if we
2539              * need to match at least 4+1/3 chars, that really is 5. */
2540             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2541             lnc = (lnc + expansion - 1) / expansion;
2542
2543             /* As in the non-UTF8 case, if we have to match 3 characters, and
2544              * only 2 are left, it's guaranteed to fail, so don't start a match
2545              * that would require us to go beyond the end of the string */
2546             e = HOP3c(strend, -((SSize_t)lnc), s);
2547
2548             /* XXX Note that we could recalculate e to stop the loop earlier,
2549              * as the worst case expansion above will rarely be met, and as we
2550              * go along we would usually find that e moves further to the left.
2551              * This would happen only after we reached the point in the loop
2552              * where if there were no expansion we should fail.  Unclear if
2553              * worth the expense */
2554
2555             while (s <= e) {
2556                 char *my_strend= (char *)strend;
2557                 if (   foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2558                                          pat_string, NULL, ln, is_utf8_pat,
2559                                          utf8_fold_flags)
2560                     && (reginfo->intuit || regtry(reginfo, &s)) )
2561                 {
2562                     goto got_it;
2563                 }
2564                 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2565             }
2566         }
2567         break;
2568
2569       case BOUNDA_tb_pb:
2570       case BOUNDA_tb_p8:
2571       case BOUND_tb_pb:  /* /d without utf8 target is /a */
2572       case BOUND_tb_p8:
2573         /* regcomp.c makes sure that these only have the traditional \b
2574          * meaning. */
2575         assert(FLAGS(c) == TRADITIONAL_BOUND);
2576
2577         FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2578         break;
2579
2580       case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2581       case BOUNDA_t8_p8:
2582         /* regcomp.c makes sure that these only have the traditional \b
2583          * meaning. */
2584         assert(FLAGS(c) == TRADITIONAL_BOUND);
2585
2586         FBC_BOUND_A_UTF8(isWORDCHAR_A);
2587         break;
2588
2589       case NBOUNDA_tb_pb:
2590       case NBOUNDA_tb_p8:
2591       case NBOUND_tb_pb: /* /d without utf8 target is /a */
2592       case NBOUND_tb_p8:
2593         /* regcomp.c makes sure that these only have the traditional \b
2594          * meaning. */
2595         assert(FLAGS(c) == TRADITIONAL_BOUND);
2596
2597         FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2598         break;
2599
2600       case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2601       case NBOUNDA_t8_p8:
2602         /* regcomp.c makes sure that these only have the traditional \b
2603          * meaning. */
2604         assert(FLAGS(c) == TRADITIONAL_BOUND);
2605
2606         FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2607         break;
2608
2609       case NBOUNDU_tb_pb:
2610       case NBOUNDU_tb_p8:
2611         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2612             FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2613             break;
2614         }
2615
2616         to_complement = 1;
2617         goto do_boundu_non_utf8;
2618
2619       case NBOUNDL_tb_pb:
2620       case NBOUNDL_tb_p8:
2621         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2622         if (FLAGS(c) == TRADITIONAL_BOUND) {
2623             FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2624             break;
2625         }
2626
2627         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2628
2629         to_complement = 1;
2630         goto do_boundu_non_utf8;
2631
2632       case BOUNDL_tb_pb:
2633       case BOUNDL_tb_p8:
2634         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2635         if (FLAGS(c) == TRADITIONAL_BOUND) {
2636             FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2637             break;
2638         }
2639
2640         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2641
2642         goto do_boundu_non_utf8;
2643
2644       case BOUNDU_tb_pb:
2645       case BOUNDU_tb_p8:
2646         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2647             FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2648             break;
2649         }
2650
2651       do_boundu_non_utf8:
2652         if (s == reginfo->strbeg) {
2653             if (reginfo->intuit || regtry(reginfo, &s))
2654             {
2655                 goto got_it;
2656             }
2657
2658             /* Didn't match.  Try at the next position (if there is one) */
2659             s++;
2660             if (UNLIKELY(s >= reginfo->strend)) {
2661                 break;
2662             }
2663         }
2664
2665         switch((bound_type) FLAGS(c)) {
2666           case TRADITIONAL_BOUND: /* Should have already been handled */
2667             assert(0);
2668             break;
2669
2670           case GCB_BOUND:
2671             /* Not utf8.  Everything is a GCB except between CR and LF */
2672             while (s < strend) {
2673                 if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2674                                       || UCHARAT(s) != '\n'))
2675                     && (reginfo->intuit || regtry(reginfo, &s)))
2676                 {
2677                     goto got_it;
2678                 }
2679                 s++;
2680             }
2681
2682             break;
2683
2684           case LB_BOUND:
2685             {
2686                 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2687                 while (s < strend) {
2688                     LB_enum after = getLB_VAL_CP((U8) *s);
2689                     if (to_complement ^ isLB(before,
2690                                              after,
2691                                              (U8*) reginfo->strbeg,
2692                                              (U8*) s,
2693                                              (U8*) reginfo->strend,
2694                                              0 /* target not utf8 */ )
2695                         && (reginfo->intuit || regtry(reginfo, &s)))
2696                     {
2697                         goto got_it;
2698                     }
2699                     before = after;
2700                     s++;
2701                 }
2702             }
2703
2704             break;
2705
2706           case SB_BOUND:
2707             {
2708                 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2709                 while (s < strend) {
2710                     SB_enum after = getSB_VAL_CP((U8) *s);
2711                     if ((to_complement ^ isSB(before,
2712                                               after,
2713                                               (U8*) reginfo->strbeg,
2714                                               (U8*) s,
2715                                               (U8*) reginfo->strend,
2716                                              0 /* target not utf8 */ ))
2717                         && (reginfo->intuit || regtry(reginfo, &s)))
2718                     {
2719                         goto got_it;
2720                     }
2721                     before = after;
2722                     s++;
2723                 }
2724             }
2725
2726             break;
2727
2728           case WB_BOUND:
2729             {
2730                 WB_enum previous = WB_UNKNOWN;
2731                 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2732                 while (s < strend) {
2733                     WB_enum after = getWB_VAL_CP((U8) *s);
2734                     if ((to_complement ^ isWB(previous,
2735                                               before,
2736                                               after,
2737                                               (U8*) reginfo->strbeg,
2738                                               (U8*) s,
2739                                               (U8*) reginfo->strend,
2740                                                0 /* target not utf8 */ ))
2741                         && (reginfo->intuit || regtry(reginfo, &s)))
2742                     {
2743                         goto got_it;
2744                     }
2745                     previous = before;
2746                     before = after;
2747                     s++;
2748                 }
2749             }
2750         }
2751
2752         /* Here are at the final position in the target string, which is a
2753          * boundary by definition, so matches, depending on other constraints.
2754          * */
2755         if (   reginfo->intuit
2756             || (s <= reginfo->strend && regtry(reginfo, &s)))
2757         {
2758             goto got_it;
2759         }
2760
2761         break;
2762
2763       case BOUNDL_t8_pb:
2764       case BOUNDL_t8_p8:
2765         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2766         if (FLAGS(c) == TRADITIONAL_BOUND) {
2767             FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2768                            isWORDCHAR_LC_utf8_safe);
2769             break;
2770         }
2771
2772         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2773
2774         to_complement = 1;
2775         goto do_boundu_utf8;
2776
2777       case NBOUNDL_t8_pb:
2778       case NBOUNDL_t8_p8:
2779         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2780         if (FLAGS(c) == TRADITIONAL_BOUND) {
2781             FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2782                             isWORDCHAR_LC_utf8_safe);
2783             break;
2784         }
2785
2786         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2787
2788         to_complement = 1;
2789         goto do_boundu_utf8;
2790
2791       case NBOUND_t8_pb:
2792       case NBOUND_t8_p8:
2793         /* regcomp.c makes sure that these only have the traditional \b
2794          * meaning. */
2795         assert(FLAGS(c) == TRADITIONAL_BOUND);
2796
2797         /* FALLTHROUGH */
2798
2799       case NBOUNDU_t8_pb:
2800       case NBOUNDU_t8_p8:
2801         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2802             FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2803                             isWORDCHAR_utf8_safe);
2804             break;
2805         }
2806
2807         to_complement = 1;
2808         goto do_boundu_utf8;
2809
2810       case BOUND_t8_pb:
2811       case BOUND_t8_p8:
2812         /* regcomp.c makes sure that these only have the traditional \b
2813          * meaning. */
2814         assert(FLAGS(c) == TRADITIONAL_BOUND);
2815
2816         /* FALLTHROUGH */
2817
2818       case BOUNDU_t8_pb:
2819       case BOUNDU_t8_p8:
2820         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2821             FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2822             break;
2823         }
2824
2825       do_boundu_utf8:
2826         if (s == reginfo->strbeg) {
2827             if (reginfo->intuit || regtry(reginfo, &s))
2828             {
2829                 goto got_it;
2830             }
2831
2832             /* Didn't match.  Try at the next position (if there is one) */
2833             s += UTF8_SAFE_SKIP(s, reginfo->strend);
2834             if (UNLIKELY(s >= reginfo->strend)) {
2835                 break;
2836             }
2837         }
2838
2839         switch((bound_type) FLAGS(c)) {
2840           case TRADITIONAL_BOUND: /* Should have already been handled */
2841             assert(0);
2842             break;
2843
2844           case GCB_BOUND:
2845             {
2846                 GCB_enum before = getGCB_VAL_UTF8(
2847                                            reghop3((U8*)s, -1,
2848                                                    (U8*)(reginfo->strbeg)),
2849                                            (U8*) reginfo->strend);
2850                 while (s < strend) {
2851                     GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2852                                                     (U8*) reginfo->strend);
2853                     if (   (to_complement ^ isGCB(before,
2854                                                   after,
2855                                                   (U8*) reginfo->strbeg,
2856                                                   (U8*) s,
2857                                                   1 /* target is utf8 */ ))
2858                         && (reginfo->intuit || regtry(reginfo, &s)))
2859                     {
2860                         goto got_it;
2861                     }
2862                     before = after;
2863                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2864                 }
2865             }
2866             break;
2867
2868           case LB_BOUND:
2869             {
2870                 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2871                                                         -1,
2872                                                         (U8*)(reginfo->strbeg)),
2873                                                    (U8*) reginfo->strend);
2874                 while (s < strend) {
2875                     LB_enum after = getLB_VAL_UTF8((U8*) s,
2876                                                    (U8*) reginfo->strend);
2877                     if (to_complement ^ isLB(before,
2878                                              after,
2879                                              (U8*) reginfo->strbeg,
2880                                              (U8*) s,
2881                                              (U8*) reginfo->strend,
2882                                              1 /* target is utf8 */ )
2883                         && (reginfo->intuit || regtry(reginfo, &s)))
2884                     {
2885                         goto got_it;
2886                     }
2887                     before = after;
2888                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2889                 }
2890             }
2891
2892             break;
2893
2894           case SB_BOUND:
2895             {
2896                 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2897                                                     -1,
2898                                                     (U8*)(reginfo->strbeg)),
2899                                                   (U8*) reginfo->strend);
2900                 while (s < strend) {
2901                     SB_enum after = getSB_VAL_UTF8((U8*) s,
2902                                                      (U8*) reginfo->strend);
2903                     if ((to_complement ^ isSB(before,
2904                                               after,
2905                                               (U8*) reginfo->strbeg,
2906                                               (U8*) s,
2907                                               (U8*) reginfo->strend,
2908                                               1 /* target is utf8 */ ))
2909                         && (reginfo->intuit || regtry(reginfo, &s)))
2910                     {
2911                         goto got_it;
2912                     }
2913                     before = after;
2914                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2915                 }
2916             }
2917
2918             break;
2919
2920           case WB_BOUND:
2921             {
2922                 /* We are at a boundary between char_sub_0 and char_sub_1.
2923                  * We also keep track of the value for char_sub_-1 as we
2924                  * loop through the line.   Context may be needed to make a
2925                  * determination, and if so, this can save having to
2926                  * recalculate it */
2927                 WB_enum previous = WB_UNKNOWN;
2928                 WB_enum before = getWB_VAL_UTF8(
2929                                           reghop3((U8*)s,
2930                                                   -1,
2931                                                   (U8*)(reginfo->strbeg)),
2932                                           (U8*) reginfo->strend);
2933                 while (s < strend) {
2934                     WB_enum after = getWB_VAL_UTF8((U8*) s,
2935                                                     (U8*) reginfo->strend);
2936                     if ((to_complement ^ isWB(previous,
2937                                               before,
2938                                               after,
2939                                               (U8*) reginfo->strbeg,
2940                                               (U8*) s,
2941                                               (U8*) reginfo->strend,
2942                                               1 /* target is utf8 */ ))
2943                         && (reginfo->intuit || regtry(reginfo, &s)))
2944                     {
2945                         goto got_it;
2946                     }
2947                     previous = before;
2948                     before = after;
2949                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2950                 }
2951             }
2952         }
2953
2954         /* Here are at the final position in the target string, which is a
2955          * boundary by definition, so matches, depending on other constraints.
2956          * */
2957
2958         if (   reginfo->intuit
2959             || (s <= reginfo->strend && regtry(reginfo, &s)))
2960         {
2961             goto got_it;
2962         }
2963         break;
2964
2965       case LNBREAK_t8_pb:
2966       case LNBREAK_t8_p8:
2967         REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
2968         break;
2969
2970       case LNBREAK_tb_pb:
2971       case LNBREAK_tb_p8:
2972         REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
2973         break;
2974
2975       /* The argument to all the POSIX node types is the class number to pass
2976        * to _generic_isCC() to build a mask for searching in PL_charclass[] */
2977
2978       case NPOSIXL_t8_pb:
2979       case NPOSIXL_t8_p8:
2980         to_complement = 1;
2981         /* FALLTHROUGH */
2982
2983       case POSIXL_t8_pb:
2984       case POSIXL_t8_p8:
2985         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2986         REXEC_FBC_UTF8_CLASS_SCAN(
2987             to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
2988                                                           (U8 *) strend)));
2989         break;
2990
2991       case NPOSIXL_tb_pb:
2992       case NPOSIXL_tb_p8:
2993         to_complement = 1;
2994         /* FALLTHROUGH */
2995
2996       case POSIXL_tb_pb:
2997       case POSIXL_tb_p8:
2998         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2999         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3000                                 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3001         break;
3002
3003       case NPOSIXA_t8_pb:
3004       case NPOSIXA_t8_p8:
3005         /* The complement of something that matches only ASCII matches all
3006          * non-ASCII, plus everything in ASCII that isn't in the class. */
3007         REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
3008                                   || ! _generic_isCC_A(*s, FLAGS(c)));
3009         break;
3010
3011       case POSIXA_t8_pb:
3012       case POSIXA_t8_p8:
3013         /* Don't need to worry about utf8, as it can match only a single
3014          * byte invariant character.  But we do anyway for performance reasons,
3015          * as otherwise we would have to examine all the continuation
3016          * characters */
3017         REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
3018         break;
3019
3020       case NPOSIXD_tb_pb:
3021       case NPOSIXD_tb_p8:
3022       case NPOSIXA_tb_pb:
3023       case NPOSIXA_tb_p8:
3024         to_complement = 1;
3025         /* FALLTHROUGH */
3026
3027       case POSIXD_tb_pb:
3028       case POSIXD_tb_p8:
3029       case POSIXA_tb_pb:
3030       case POSIXA_tb_p8:
3031         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3032                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
3033         break;
3034
3035       case NPOSIXU_tb_pb:
3036       case NPOSIXU_tb_p8:
3037         to_complement = 1;
3038         /* FALLTHROUGH */
3039
3040       case POSIXU_tb_pb:
3041       case POSIXU_tb_p8:
3042             REXEC_FBC_NON_UTF8_CLASS_SCAN(
3043                                  to_complement ^ cBOOL(_generic_isCC(*s,
3044                                                                     FLAGS(c))));
3045         break;
3046
3047       case NPOSIXD_t8_pb:
3048       case NPOSIXD_t8_p8:
3049       case NPOSIXU_t8_pb:
3050       case NPOSIXU_t8_p8:
3051         to_complement = 1;
3052         /* FALLTHROUGH */
3053
3054       case POSIXD_t8_pb:
3055       case POSIXD_t8_p8:
3056       case POSIXU_t8_pb:
3057       case POSIXU_t8_p8:
3058         classnum = (_char_class_number) FLAGS(c);
3059         switch (classnum) {
3060           default:
3061             REXEC_FBC_UTF8_CLASS_SCAN(
3062                         to_complement ^ cBOOL(_invlist_contains_cp(
3063                                                 PL_XPosix_ptrs[classnum],
3064                                                 utf8_to_uvchr_buf((U8 *) s,
3065                                                                 (U8 *) strend,
3066                                                                 NULL))));
3067             break;
3068
3069           case _CC_ENUM_SPACE:
3070             REXEC_FBC_UTF8_CLASS_SCAN(
3071                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3072             break;
3073
3074           case _CC_ENUM_BLANK:
3075             REXEC_FBC_UTF8_CLASS_SCAN(
3076                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3077             break;
3078
3079           case _CC_ENUM_XDIGIT:
3080             REXEC_FBC_UTF8_CLASS_SCAN(
3081                         to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3082             break;
3083
3084           case _CC_ENUM_VERTSPACE:
3085             REXEC_FBC_UTF8_CLASS_SCAN(
3086                         to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3087             break;
3088
3089           case _CC_ENUM_CNTRL:
3090             REXEC_FBC_UTF8_CLASS_SCAN(
3091                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3092             break;
3093         }
3094         break;
3095
3096       case AHOCORASICKC_tb_pb:
3097       case AHOCORASICKC_tb_p8:
3098       case AHOCORASICKC_t8_pb:
3099       case AHOCORASICKC_t8_p8:
3100       case AHOCORASICK_tb_pb:
3101       case AHOCORASICK_tb_p8:
3102       case AHOCORASICK_t8_pb:
3103       case AHOCORASICK_t8_p8:
3104         {
3105             DECL_TRIE_TYPE(c);
3106             /* what trie are we using right now */
3107             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3108             reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3109             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3110
3111             const char *last_start = strend - trie->minlen;
3112 #ifdef DEBUGGING
3113             const char *real_start = s;
3114 #endif
3115             STRLEN maxlen = trie->maxlen;
3116             SV *sv_points;
3117             U8 **points; /* map of where we were in the input string
3118                             when reading a given char. For ASCII this
3119                             is unnecessary overhead as the relationship
3120                             is always 1:1, but for Unicode, especially
3121                             case folded Unicode this is not true. */
3122             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3123             U8 *bitmap=NULL;
3124
3125
3126             DECLARE_AND_GET_RE_DEBUG_FLAGS;
3127
3128             /* We can't just allocate points here. We need to wrap it in
3129              * an SV so it gets freed properly if there is a croak while
3130              * running the match */
3131             ENTER;
3132             SAVETMPS;
3133             sv_points=newSV(maxlen * sizeof(U8 *));
3134             SvCUR_set(sv_points,
3135                 maxlen * sizeof(U8 *));
3136             SvPOK_on(sv_points);
3137             sv_2mortal(sv_points);
3138             points=(U8**)SvPV_nolen(sv_points );
3139             if ( trie_type != trie_utf8_fold
3140                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
3141             {
3142                 if (trie->bitmap)
3143                     bitmap=(U8*)trie->bitmap;
3144                 else
3145                     bitmap=(U8*)ANYOF_BITMAP(c);
3146             }
3147             /* this is the Aho-Corasick algorithm modified a touch
3148                to include special handling for long "unknown char" sequences.
3149                The basic idea being that we use AC as long as we are dealing
3150                with a possible matching char, when we encounter an unknown char
3151                (and we have not encountered an accepting state) we scan forward
3152                until we find a legal starting char.
3153                AC matching is basically that of trie matching, except that when
3154                we encounter a failing transition, we fall back to the current
3155                states "fail state", and try the current char again, a process
3156                we repeat until we reach the root state, state 1, or a legal
3157                transition. If we fail on the root state then we can either
3158                terminate if we have reached an accepting state previously, or
3159                restart the entire process from the beginning if we have not.
3160
3161              */
3162             while (s <= last_start) {
3163                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3164                 U8 *uc = (U8*)s;
3165                 U16 charid = 0;
3166                 U32 base = 1;
3167                 U32 state = 1;
3168                 UV uvc = 0;
3169                 STRLEN len = 0;
3170                 STRLEN foldlen = 0;
3171                 U8 *uscan = (U8*)NULL;
3172                 U8 *leftmost = NULL;
3173 #ifdef DEBUGGING
3174                 U32 accepted_word= 0;
3175 #endif
3176                 U32 pointpos = 0;
3177
3178                 while ( state && uc <= (U8*)strend ) {
3179                     int failed=0;
3180                     U32 word = aho->states[ state ].wordnum;
3181
3182                     if( state==1 ) {
3183                         if ( bitmap ) {
3184                             DEBUG_TRIE_EXECUTE_r(
3185                                 if (  uc <= (U8*)last_start
3186                                     && !BITMAP_TEST(bitmap,*uc) )
3187                                 {
3188                                     dump_exec_pos( (char *)uc, c, strend,
3189                                         real_start,
3190                                         (char *)uc, utf8_target, 0 );
3191                                     Perl_re_printf( aTHX_
3192                                         " Scanning for legal start char...\n");
3193                                 }
3194                             );
3195                             if (utf8_target) {
3196                                 while (  uc <= (U8*)last_start
3197                                        && !BITMAP_TEST(bitmap,*uc) )
3198                                 {
3199                                     uc += UTF8SKIP(uc);
3200                                 }
3201                             } else {
3202                                 while (  uc <= (U8*)last_start
3203                                        && ! BITMAP_TEST(bitmap,*uc) )
3204                                 {
3205                                     uc++;
3206                                 }
3207                             }
3208                             s= (char *)uc;
3209                         }
3210                         if (uc >(U8*)last_start) break;
3211                     }
3212
3213                     if ( word ) {
3214                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3215                                                                     % maxlen ];
3216                         if (!leftmost || lpos < leftmost) {
3217                             DEBUG_r(accepted_word=word);
3218                             leftmost= lpos;
3219                         }
3220                         if (base==0) break;
3221
3222                     }
3223                     points[pointpos++ % maxlen]= uc;
3224                     if (foldlen || uc < (U8*)strend) {
3225                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3226                                              (U8 *) strend, uscan, len, uvc,
3227                                              charid, foldlen, foldbuf,
3228                                              uniflags);
3229                         DEBUG_TRIE_EXECUTE_r({
3230                             dump_exec_pos( (char *)uc, c, strend,
3231                                         real_start, s, utf8_target, 0);
3232                             Perl_re_printf( aTHX_
3233                                 " Charid:%3u CP:%4" UVxf " ",
3234                                  charid, uvc);
3235                         });
3236                     }
3237                     else {
3238                         len = 0;
3239                         charid = 0;
3240                     }
3241
3242
3243                     do {
3244 #ifdef DEBUGGING
3245                         word = aho->states[ state ].wordnum;
3246 #endif
3247                         base = aho->states[ state ].trans.base;
3248
3249                         DEBUG_TRIE_EXECUTE_r({
3250                             if (failed)
3251                                 dump_exec_pos((char *)uc, c, strend, real_start,
3252                                     s,   utf8_target, 0 );
3253                             Perl_re_printf( aTHX_
3254                                 "%sState: %4" UVxf ", word=%" UVxf,
3255                                 failed ? " Fail transition to " : "",
3256                                 (UV)state, (UV)word);
3257                         });
3258                         if ( base ) {
3259                             U32 tmp;
3260                             I32 offset;
3261                             if (charid &&
3262                                  ( ((offset = base + charid
3263                                     - 1 - trie->uniquecharcount)) >= 0)
3264                                  && ((U32)offset < trie->lasttrans)
3265                                  && trie->trans[offset].check == state
3266                                  && (tmp=trie->trans[offset].next))
3267                             {
3268                                 DEBUG_TRIE_EXECUTE_r(
3269                                     Perl_re_printf( aTHX_ " - legal\n"));
3270                                 state = tmp;
3271                                 break;
3272                             }
3273                             else {
3274                                 DEBUG_TRIE_EXECUTE_r(
3275                                     Perl_re_printf( aTHX_ " - fail\n"));
3276                                 failed = 1;
3277                                 state = aho->fail[state];
3278                             }
3279                         }
3280                         else {
3281                             /* we must be accepting here */
3282                             DEBUG_TRIE_EXECUTE_r(
3283                                     Perl_re_printf( aTHX_ " - accepting\n"));
3284                             failed = 1;
3285                             break;
3286                         }
3287                     } while(state);
3288                     uc += len;
3289                     if (failed) {
3290                         if (leftmost)
3291                             break;
3292                         if (!state) state = 1;
3293                     }
3294                 }
3295                 if ( aho->states[ state ].wordnum ) {
3296                     U8 *lpos = points[ (pointpos
3297                                       - trie->wordinfo[aho->states[ state ]
3298                                                     .wordnum].len) % maxlen ];
3299                     if (!leftmost || lpos < leftmost) {
3300                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3301                         leftmost = lpos;
3302                     }
3303                 }
3304                 if (leftmost) {
3305                     s = (char*)leftmost;
3306                     DEBUG_TRIE_EXECUTE_r({
3307                         Perl_re_printf( aTHX_  "Matches word #%" UVxf
3308                                         " at position %" IVdf ". Trying full"
3309                                         " pattern...\n",
3310                             (UV)accepted_word, (IV)(s - real_start)
3311                         );
3312                     });
3313                     if (reginfo->intuit || regtry(reginfo, &s)) {
3314                         FREETMPS;
3315                         LEAVE;
3316                         goto got_it;
3317                     }
3318                     if (s < reginfo->strend) {
3319                         s = HOPc(s,1);
3320                     }
3321                     DEBUG_TRIE_EXECUTE_r({
3322                         Perl_re_printf( aTHX_
3323                                        "Pattern failed. Looking for new start"
3324                                        " point...\n");
3325                     });
3326                 } else {
3327                     DEBUG_TRIE_EXECUTE_r(
3328                         Perl_re_printf( aTHX_ "No match.\n"));
3329                     break;
3330                 }
3331             }
3332             FREETMPS;
3333             LEAVE;
3334         }
3335         break;
3336
3337       case EXACTFU_REQ8_t8_pb:
3338       case EXACTFUP_tb_p8:
3339       case EXACTFUP_t8_p8:
3340       case EXACTF_tb_p8:
3341       case EXACTF_t8_p8:   /* This node only generated for non-utf8 patterns */
3342       case EXACTFAA_NO_TRIE_tb_p8:
3343       case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3344                                       patterns */
3345         assert(0);
3346
3347       default:
3348         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3349     } /* End of switch on node type */
3350
3351     return 0;
3352
3353   got_it:
3354     return s;
3355 }
3356
3357 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3358  * flags have same meanings as with regexec_flags() */
3359
3360 static void
3361 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3362                             char *strbeg,
3363                             char *strend,
3364                             SV *sv,
3365                             U32 flags,
3366                             bool utf8_target)
3367 {
3368     struct regexp *const prog = ReANY(rx);
3369
3370     if (flags & REXEC_COPY_STR) {
3371 #ifdef PERL_ANY_COW
3372         if (SvCANCOW(sv)) {
3373             DEBUG_C(Perl_re_printf( aTHX_
3374                               "Copy on write: regexp capture, type %d\n",
3375                                     (int) SvTYPE(sv)));
3376             /* Create a new COW SV to share the match string and store
3377              * in saved_copy, unless the current COW SV in saved_copy
3378              * is valid and suitable for our purpose */
3379             if ((   prog->saved_copy
3380                  && SvIsCOW(prog->saved_copy)
3381                  && SvPOKp(prog->saved_copy)
3382                  && SvIsCOW(sv)
3383                  && SvPOKp(sv)
3384                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3385             {
3386                 /* just reuse saved_copy SV */
3387                 if (RXp_MATCH_COPIED(prog)) {
3388                     Safefree(prog->subbeg);
3389                     RXp_MATCH_COPIED_off(prog);
3390                 }
3391             }
3392             else {
3393                 /* create new COW SV to share string */
3394                 RXp_MATCH_COPY_FREE(prog);
3395                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3396             }
3397             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3398             assert (SvPOKp(prog->saved_copy));
3399             prog->sublen  = strend - strbeg;
3400             prog->suboffset = 0;
3401             prog->subcoffset = 0;
3402         } else
3403 #endif
3404         {
3405             SSize_t min = 0;
3406             SSize_t max = strend - strbeg;
3407             SSize_t sublen;
3408
3409             if (    (flags & REXEC_COPY_SKIP_POST)
3410                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3411                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3412             ) { /* don't copy $' part of string */
3413                 U32 n = 0;
3414                 max = -1;
3415                 /* calculate the right-most part of the string covered
3416                  * by a capture. Due to lookahead, this may be to
3417                  * the right of $&, so we have to scan all captures */
3418                 while (n <= prog->lastparen) {
3419                     if (prog->offs[n].end > max)
3420                         max = prog->offs[n].end;
3421                     n++;
3422                 }
3423                 if (max == -1)
3424                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3425                             ? prog->offs[0].start
3426                             : 0;
3427                 assert(max >= 0 && max <= strend - strbeg);
3428             }
3429
3430             if (    (flags & REXEC_COPY_SKIP_PRE)
3431                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3432                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3433             ) { /* don't copy $` part of string */
3434                 U32 n = 0;
3435                 min = max;
3436                 /* calculate the left-most part of the string covered
3437                  * by a capture. Due to lookbehind, this may be to
3438                  * the left of $&, so we have to scan all captures */
3439                 while (min && n <= prog->lastparen) {
3440                     if (   prog->offs[n].start != -1
3441                         && prog->offs[n].start < min)
3442                     {
3443                         min = prog->offs[n].start;
3444                     }
3445                     n++;
3446                 }
3447                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3448                     && min >  prog->offs[0].end
3449                 )
3450                     min = prog->offs[0].end;
3451
3452             }
3453
3454             assert(min >= 0 && min <= max && min <= strend - strbeg);
3455             sublen = max - min;
3456
3457             if (RXp_MATCH_COPIED(prog)) {
3458                 if (sublen > prog->sublen)
3459                     prog->subbeg =
3460                             (char*)saferealloc(prog->subbeg, sublen+1);
3461             }
3462             else
3463                 prog->subbeg = (char*)safemalloc(sublen+1);
3464             Copy(strbeg + min, prog->subbeg, sublen, char);
3465             prog->subbeg[sublen] = '\0';
3466             prog->suboffset = min;
3467             prog->sublen = sublen;
3468             RXp_MATCH_COPIED_on(prog);
3469         }
3470         prog->subcoffset = prog->suboffset;
3471         if (prog->suboffset && utf8_target) {
3472             /* Convert byte offset to chars.
3473              * XXX ideally should only compute this if @-/@+
3474              * has been seen, a la PL_sawampersand ??? */
3475
3476             /* If there's a direct correspondence between the
3477              * string which we're matching and the original SV,
3478              * then we can use the utf8 len cache associated with
3479              * the SV. In particular, it means that under //g,
3480              * sv_pos_b2u() will use the previously cached
3481              * position to speed up working out the new length of
3482              * subcoffset, rather than counting from the start of
3483              * the string each time. This stops
3484              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3485              * from going quadratic */
3486             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3487                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3488                                                 SV_GMAGIC|SV_CONST_RETURN);
3489             else
3490                 prog->subcoffset = utf8_length((U8*)strbeg,
3491                                     (U8*)(strbeg+prog->suboffset));
3492         }
3493     }
3494     else {
3495         RXp_MATCH_COPY_FREE(prog);
3496         prog->subbeg = strbeg;
3497         prog->suboffset = 0;
3498         prog->subcoffset = 0;
3499         prog->sublen = strend - strbeg;
3500     }
3501 }
3502
3503
3504
3505
3506 /*
3507  - regexec_flags - match a regexp against a string
3508  */
3509 I32
3510 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3511               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3512 /* stringarg: the point in the string at which to begin matching */
3513 /* strend:    pointer to null at end of string */
3514 /* strbeg:    real beginning of string */
3515 /* minend:    end of match must be >= minend bytes after stringarg. */
3516 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3517  *            itself is accessed via the pointers above */
3518 /* data:      May be used for some additional optimizations.
3519               Currently unused. */
3520 /* flags:     For optimizations. See REXEC_* in regexp.h */
3521
3522 {
3523     struct regexp *const prog = ReANY(rx);
3524     char *s;
3525     regnode *c;
3526     char *startpos;
3527     SSize_t minlen;             /* must match at least this many chars */
3528     SSize_t dontbother = 0;     /* how many characters not to try at end */
3529     const bool utf8_target = cBOOL(DO_UTF8(sv));
3530     I32 multiline;
3531     RXi_GET_DECL(prog,progi);
3532     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3533     regmatch_info *const reginfo = &reginfo_buf;
3534     regexp_paren_pair *swap = NULL;
3535     I32 oldsave;
3536     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3537
3538     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3539     PERL_UNUSED_ARG(data);
3540
3541     /* Be paranoid... */
3542     if (prog == NULL) {
3543         Perl_croak(aTHX_ "NULL regexp parameter");
3544     }
3545
3546     DEBUG_EXECUTE_r(
3547         debug_start_match(rx, utf8_target, stringarg, strend,
3548         "Matching");
3549     );
3550
3551     startpos = stringarg;
3552
3553     /* set these early as they may be used by the HOP macros below */
3554     reginfo->strbeg = strbeg;
3555     reginfo->strend = strend;
3556     reginfo->is_utf8_target = cBOOL(utf8_target);
3557
3558     if (prog->intflags & PREGf_GPOS_SEEN) {
3559         MAGIC *mg;
3560
3561         /* set reginfo->ganch, the position where \G can match */
3562
3563         reginfo->ganch =
3564             (flags & REXEC_IGNOREPOS)
3565             ? stringarg /* use start pos rather than pos() */
3566             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3567               /* Defined pos(): */
3568             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3569             : strbeg; /* pos() not defined; use start of string */
3570
3571         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3572             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3573
3574         /* in the presence of \G, we may need to start looking earlier in
3575          * the string than the suggested start point of stringarg:
3576          * if prog->gofs is set, then that's a known, fixed minimum
3577          * offset, such as
3578          * /..\G/:   gofs = 2
3579          * /ab|c\G/: gofs = 1
3580          * or if the minimum offset isn't known, then we have to go back
3581          * to the start of the string, e.g. /w+\G/
3582          */
3583
3584         if (prog->intflags & PREGf_ANCH_GPOS) {
3585             if (prog->gofs) {
3586                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3587                 if (!startpos ||
3588                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3589                 {
3590                     DEBUG_GPOS_r(Perl_re_printf( aTHX_
3591                             "fail: ganch-gofs before earliest possible start\n"));
3592                     return 0;
3593                 }
3594             }
3595             else
3596                 startpos = reginfo->ganch;
3597         }
3598         else if (prog->gofs) {
3599             startpos = HOPBACKc(startpos, prog->gofs);
3600             if (!startpos)
3601                 startpos = strbeg;
3602         }
3603         else if (prog->intflags & PREGf_GPOS_FLOAT)
3604             startpos = strbeg;
3605     }
3606
3607     minlen = prog->minlen;
3608     if ((startpos + minlen) > strend || startpos < strbeg) {
3609         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3610                         "Regex match can't succeed, so not even tried\n"));
3611         return 0;
3612     }
3613
3614     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3615      * which will call destuctors to reset PL_regmatch_state, free higher
3616      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3617      * regmatch_info_aux_eval */
3618
3619     oldsave = PL_savestack_ix;
3620
3621     s = startpos;
3622
3623     if ((prog->extflags & RXf_USE_INTUIT)
3624         && !(flags & REXEC_CHECKED))
3625     {
3626         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3627                                     flags, NULL);
3628         if (!s)
3629             return 0;
3630
3631         if (prog->extflags & RXf_CHECK_ALL) {
3632             /* we can match based purely on the result of INTUIT.
3633              * Set up captures etc just for $& and $-[0]
3634              * (an intuit-only match wont have $1,$2,..) */
3635             assert(!prog->nparens);
3636
3637             /* s/// doesn't like it if $& is earlier than where we asked it to
3638              * start searching (which can happen on something like /.\G/) */
3639             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3640                     && (s < stringarg))
3641             {
3642                 /* this should only be possible under \G */
3643                 assert(prog->intflags & PREGf_GPOS_SEEN);
3644                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3645                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3646                 goto phooey;
3647             }
3648
3649             /* match via INTUIT shouldn't have any captures.
3650              * Let @-, @+, $^N know */
3651             prog->lastparen = prog->lastcloseparen = 0;
3652             RXp_MATCH_UTF8_set(prog, utf8_target);
3653             prog->offs[0].start = s - strbeg;
3654             prog->offs[0].end = utf8_target
3655                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3656                 : s - strbeg + prog->minlenret;
3657             if ( !(flags & REXEC_NOT_FIRST) )
3658                 S_reg_set_capture_string(aTHX_ rx,
3659                                         strbeg, strend,
3660                                         sv, flags, utf8_target);
3661
3662             return 1;
3663         }
3664     }
3665
3666     multiline = prog->extflags & RXf_PMf_MULTILINE;
3667
3668     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3669         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3670                               "String too short [regexec_flags]...\n"));
3671         goto phooey;
3672     }
3673
3674     /* Check validity of program. */
3675     if (UCHARAT(progi->program) != REG_MAGIC) {
3676         Perl_croak(aTHX_ "corrupted regexp program");
3677     }
3678
3679     RXp_MATCH_TAINTED_off(prog);
3680     RXp_MATCH_UTF8_set(prog, utf8_target);
3681
3682     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3683     reginfo->intuit = 0;
3684     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3685     reginfo->warned = FALSE;
3686     reginfo->sv = sv;
3687     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3688     /* see how far we have to get to not match where we matched before */
3689     reginfo->till = stringarg + minend;
3690
3691     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3692         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3693            S_cleanup_regmatch_info_aux has executed (registered by
3694            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3695            magic belonging to this SV.
3696            Not newSVsv, either, as it does not COW.
3697         */
3698         reginfo->sv = newSV(0);
3699         SvSetSV_nosteal(reginfo->sv, sv);
3700         SAVEFREESV(reginfo->sv);
3701     }
3702
3703     /* reserve next 2 or 3 slots in PL_regmatch_state:
3704      * slot N+0: may currently be in use: skip it
3705      * slot N+1: use for regmatch_info_aux struct
3706      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3707      * slot N+3: ready for use by regmatch()
3708      */
3709
3710     {
3711         regmatch_state *old_regmatch_state;
3712         regmatch_slab  *old_regmatch_slab;
3713         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3714
3715         /* on first ever match, allocate first slab */
3716         if (!PL_regmatch_slab) {
3717             Newx(PL_regmatch_slab, 1, regmatch_slab);
3718             PL_regmatch_slab->prev = NULL;
3719             PL_regmatch_slab->next = NULL;
3720             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3721         }
3722
3723         old_regmatch_state = PL_regmatch_state;
3724         old_regmatch_slab  = PL_regmatch_slab;
3725
3726         for (i=0; i <= max; i++) {
3727             if (i == 1)
3728                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3729             else if (i ==2)
3730                 reginfo->info_aux_eval =
3731                 reginfo->info_aux->info_aux_eval =
3732                             &(PL_regmatch_state->u.info_aux_eval);
3733
3734             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3735                 PL_regmatch_state = S_push_slab(aTHX);
3736         }
3737
3738         /* note initial PL_regmatch_state position; at end of match we'll
3739          * pop back to there and free any higher slabs */
3740
3741         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3742         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3743         reginfo->info_aux->poscache = NULL;
3744
3745         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3746
3747         if ((prog->extflags & RXf_EVAL_SEEN))
3748             S_setup_eval_state(aTHX_ reginfo);
3749         else
3750             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3751     }
3752
3753     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3754         /* We have to be careful. If the previous successful match
3755            was from this regex we don't want a subsequent partially
3756            successful match to clobber the old results.
3757            So when we detect this possibility we add a swap buffer
3758            to the re, and switch the buffer each match. If we fail,
3759            we switch it back; otherwise we leave it swapped.
3760         */
3761         swap = prog->offs;
3762         /* avoid leak if we die, or clean up anyway if match completes */
3763         SAVEFREEPV(swap);
3764         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3765         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3766             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3767             0,
3768             PTR2UV(prog),
3769             PTR2UV(swap),
3770             PTR2UV(prog->offs)
3771         ));
3772     }
3773
3774     if (prog->recurse_locinput)
3775         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3776
3777     /* Simplest case: anchored match (but not \G) need be tried only once,
3778      * or with MBOL, only at the beginning of each line.
3779      *
3780      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3781      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3782      * match at the start of the string then it won't match anywhere else
3783      * either; while with /.*.../, if it doesn't match at the beginning,
3784      * the earliest it could match is at the start of the next line */
3785
3786     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3787         char *end;
3788
3789         if (regtry(reginfo, &s))
3790             goto got_it;
3791
3792         if (!(prog->intflags & PREGf_ANCH_MBOL))
3793             goto phooey;
3794
3795         /* didn't match at start, try at other newline positions */
3796
3797         if (minlen)
3798             dontbother = minlen - 1;
3799         end = HOP3c(strend, -dontbother, strbeg) - 1;
3800
3801         /* skip to next newline */
3802
3803         while (s <= end) { /* note it could be possible to match at the end of the string */
3804             /* NB: newlines are the same in unicode as they are in latin */
3805             if (*s++ != '\n')
3806                 continue;
3807             if (prog->check_substr || prog->check_utf8) {
3808             /* note that with PREGf_IMPLICIT, intuit can only fail
3809              * or return the start position, so it's of limited utility.
3810              * Nevertheless, I made the decision that the potential for
3811              * quick fail was still worth it - DAPM */
3812                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3813                 if (!s)
3814                     goto phooey;
3815             }
3816             if (regtry(reginfo, &s))
3817                 goto got_it;
3818         }
3819         goto phooey;
3820     } /* end anchored search */
3821
3822     /* anchored \G match */
3823     if (prog->intflags & PREGf_ANCH_GPOS)
3824     {
3825         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3826         assert(prog->intflags & PREGf_GPOS_SEEN);
3827         /* For anchored \G, the only position it can match from is
3828          * (ganch-gofs); we already set startpos to this above; if intuit
3829          * moved us on from there, we can't possibly succeed */
3830         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3831         if (s == startpos && regtry(reginfo, &s))
3832             goto got_it;
3833         goto phooey;
3834     }
3835
3836     /* Messy cases:  unanchored match. */
3837
3838     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3839         /* we have /x+whatever/ */
3840         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3841         char ch;
3842 #ifdef DEBUGGING
3843         int did_match = 0;
3844 #endif
3845         if (utf8_target) {
3846             if (! prog->anchored_utf8) {
3847                 to_utf8_substr(prog);
3848             }
3849             ch = SvPVX_const(prog->anchored_utf8)[0];
3850             REXEC_FBC_UTF8_SCAN(
3851                 if (*s == ch) {
3852                     DEBUG_EXECUTE_r( did_match = 1 );
3853                     if (regtry(reginfo, &s)) goto got_it;
3854                     s += UTF8_SAFE_SKIP(s, strend);
3855                     while (s < strend && *s == ch)
3856                         s += UTF8SKIP(s);
3857                 }
3858             );
3859
3860         }
3861         else {
3862             if (! prog->anchored_substr) {
3863                 if (! to_byte_substr(prog)) {
3864                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3865                 }
3866             }
3867             ch = SvPVX_const(prog->anchored_substr)[0];
3868             REXEC_FBC_NON_UTF8_SCAN(
3869                 if (*s == ch) {
3870                     DEBUG_EXECUTE_r( did_match = 1 );
3871                     if (regtry(reginfo, &s)) goto got_it;
3872                     s++;
3873                     while (s < strend && *s == ch)
3874                         s++;
3875                 }
3876             );
3877         }
3878         DEBUG_EXECUTE_r(if (!did_match)
3879                 Perl_re_printf( aTHX_
3880                                   "Did not find anchored character...\n")
3881                );
3882     }
3883     else if (prog->anchored_substr != NULL
3884               || prog->anchored_utf8 != NULL
3885               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3886                   && prog->float_max_offset < strend - s)) {
3887         SV *must;
3888         SSize_t back_max;
3889         SSize_t back_min;
3890         char *last;
3891         char *last1;            /* Last position checked before */
3892 #ifdef DEBUGGING
3893         int did_match = 0;
3894 #endif
3895         if (prog->anchored_substr || prog->anchored_utf8) {
3896             if (utf8_target) {
3897                 if (! prog->anchored_utf8) {
3898                     to_utf8_substr(prog);
3899                 }
3900                 must = prog->anchored_utf8;
3901             }
3902             else {
3903                 if (! prog->anchored_substr) {
3904                     if (! to_byte_substr(prog)) {
3905                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3906                     }
3907                 }
3908                 must = prog->anchored_substr;
3909             }
3910             back_max = back_min = prog->anchored_offset;
3911         } else {
3912             if (utf8_target) {
3913                 if (! prog->float_utf8) {
3914                     to_utf8_substr(prog);
3915                 }
3916                 must = prog->float_utf8;
3917             }
3918             else {
3919                 if (! prog->float_substr) {
3920                     if (! to_byte_substr(prog)) {
3921                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3922                     }
3923                 }
3924                 must = prog->float_substr;
3925             }
3926             back_max = prog->float_max_offset;
3927             back_min = prog->float_min_offset;
3928         }
3929
3930         if (back_min<0) {
3931             last = strend;
3932         } else {
3933             last = HOP3c(strend,        /* Cannot start after this */
3934                   -(SSize_t)(CHR_SVLEN(must)
3935                          - (SvTAIL(must) != 0) + back_min), strbeg);
3936         }
3937         if (s > reginfo->strbeg)
3938             last1 = HOPc(s, -1);
3939         else
3940             last1 = s - 1;      /* bogus */
3941
3942         /* XXXX check_substr already used to find "s", can optimize if
3943            check_substr==must. */
3944         dontbother = 0;
3945         strend = HOPc(strend, -dontbother);
3946         while ( (s <= last) &&
3947                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3948                                   (unsigned char*)strend, must,
3949                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3950             DEBUG_EXECUTE_r( did_match = 1 );
3951             if (HOPc(s, -back_max) > last1) {
3952                 last1 = HOPc(s, -back_min);
3953                 s = HOPc(s, -back_max);
3954             }
3955             else {
3956                 char * const t = (last1 >= reginfo->strbeg)
3957                                     ? HOPc(last1, 1) : last1 + 1;
3958
3959                 last1 = HOPc(s, -back_min);
3960                 s = t;
3961             }
3962             if (utf8_target) {
3963                 while (s <= last1) {
3964                     if (regtry(reginfo, &s))
3965                         goto got_it;
3966                     if (s >= last1) {
3967                         s++; /* to break out of outer loop */
3968                         break;
3969                     }
3970                     s += UTF8SKIP(s);
3971                 }
3972             }
3973             else {
3974                 while (s <= last1) {
3975                     if (regtry(reginfo, &s))
3976                         goto got_it;
3977                     s++;
3978                 }
3979             }
3980         }
3981         DEBUG_EXECUTE_r(if (!did_match) {
3982             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3983                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3984             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3985                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3986                                ? "anchored" : "floating"),
3987                 quoted, RE_SV_TAIL(must));
3988         });
3989         goto phooey;
3990     }
3991     else if ( (c = progi->regstclass) ) {
3992         if (minlen) {
3993             const OPCODE op = OP(progi->regstclass);
3994             /* don't bother with what can't match */
3995             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3996                 strend = HOPc(strend, -(minlen - 1));
3997         }
3998         DEBUG_EXECUTE_r({
3999             SV * const prop = sv_newmortal();
4000             regprop(prog, prop, c, reginfo, NULL);
4001             {
4002                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4003                     s,strend-s,PL_dump_re_max_len);
4004                 Perl_re_printf( aTHX_
4005                     "Matching stclass %.*s against %s (%d bytes)\n",
4006                     (int)SvCUR(prop), SvPVX_const(prop),
4007                      quoted, (int)(strend - s));
4008             }
4009         });
4010         if (find_byclass(prog, c, s, strend, reginfo))
4011             goto got_it;
4012         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
4013     }
4014     else {
4015         dontbother = 0;
4016         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4017             /* Trim the end. */
4018             char *last= NULL;
4019             SV* float_real;
4020             STRLEN len;
4021             const char *little;
4022
4023             if (utf8_target) {
4024                 if (! prog->float_utf8) {
4025                     to_utf8_substr(prog);
4026                 }
4027                 float_real = prog->float_utf8;
4028             }
4029             else {
4030                 if (! prog->float_substr) {
4031                     if (! to_byte_substr(prog)) {
4032                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4033                     }
4034                 }
4035                 float_real = prog->float_substr;
4036             }
4037
4038             little = SvPV_const(float_real, len);
4039             if (SvTAIL(float_real)) {
4040                     /* This means that float_real contains an artificial \n on
4041                      * the end due to the presence of something like this:
4042                      * /foo$/ where we can match both "foo" and "foo\n" at the
4043                      * end of the string.  So we have to compare the end of the
4044                      * string first against the float_real without the \n and
4045                      * then against the full float_real with the string.  We
4046                      * have to watch out for cases where the string might be
4047                      * smaller than the float_real or the float_real without
4048                      * the \n. */
4049                     char *checkpos= strend - len;
4050                     DEBUG_OPTIMISE_r(
4051                         Perl_re_printf( aTHX_
4052                             "%sChecking for float_real.%s\n",
4053                             PL_colors[4], PL_colors[5]));
4054                     if (checkpos + 1 < strbeg) {
4055                         /* can't match, even if we remove the trailing \n
4056                          * string is too short to match */
4057                         DEBUG_EXECUTE_r(
4058                             Perl_re_printf( aTHX_
4059                                 "%sString shorter than required trailing substring, cannot match.%s\n",
4060                                 PL_colors[4], PL_colors[5]));
4061                         goto phooey;
4062                     } else if (memEQ(checkpos + 1, little, len - 1)) {
4063                         /* can match, the end of the string matches without the
4064                          * "\n" */
4065                         last = checkpos + 1;
4066                     } else if (checkpos < strbeg) {
4067                         /* cant match, string is too short when the "\n" is
4068                          * included */
4069                         DEBUG_EXECUTE_r(
4070                             Perl_re_printf( aTHX_
4071                                 "%sString does not contain required trailing substring, cannot match.%s\n",
4072                                 PL_colors[4], PL_colors[5]));
4073                         goto phooey;
4074                     } else if (!multiline) {
4075                         /* non multiline match, so compare with the "\n" at the
4076                          * end of the string */
4077                         if (memEQ(checkpos, little, len)) {
4078                             last= checkpos;
4079                         } else {
4080                             DEBUG_EXECUTE_r(
4081                                 Perl_re_printf( aTHX_
4082                                     "%sString does not contain required trailing substring, cannot match.%s\n",
4083                                     PL_colors[4], PL_colors[5]));
4084                             goto phooey;
4085                         }
4086                     } else {
4087                         /* multiline match, so we have to search for a place
4088                          * where the full string is located */
4089                         goto find_last;
4090                     }
4091             } else {
4092                   find_last:
4093                     if (len)
4094                         last = rninstr(s, strend, little, little + len);
4095                     else
4096                         last = strend;  /* matching "$" */
4097             }
4098             if (!last) {
4099                 /* at one point this block contained a comment which was
4100                  * probably incorrect, which said that this was a "should not
4101                  * happen" case.  Even if it was true when it was written I am
4102                  * pretty sure it is not anymore, so I have removed the comment
4103                  * and replaced it with this one. Yves */
4104                 DEBUG_EXECUTE_r(
4105                     Perl_re_printf( aTHX_
4106                         "%sString does not contain required substring, cannot match.%s\n",
4107                         PL_colors[4], PL_colors[5]
4108                     ));
4109                 goto phooey;
4110             }
4111             dontbother = strend - last + prog->float_min_offset;
4112         }
4113         if (minlen && (dontbother < minlen))
4114             dontbother = minlen - 1;
4115         strend -= dontbother;              /* this one's always in bytes! */
4116         /* We don't know much -- general case. */
4117         if (utf8_target) {
4118             for (;;) {
4119                 if (regtry(reginfo, &s))
4120                     goto got_it;
4121                 if (s >= strend)
4122                     break;
4123                 s += UTF8SKIP(s);
4124             };
4125         }
4126         else {
4127             do {
4128                 if (regtry(reginfo, &s))
4129                     goto got_it;
4130             } while (s++ < strend);
4131         }
4132     }
4133
4134     /* Failure. */
4135     goto phooey;
4136
4137   got_it:
4138     /* s/// doesn't like it if $& is earlier than where we asked it to
4139      * start searching (which can happen on something like /.\G/) */
4140     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
4141             && (prog->offs[0].start < stringarg - strbeg))
4142     {
4143         /* this should only be possible under \G */
4144         assert(prog->intflags & PREGf_GPOS_SEEN);
4145         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4146             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4147         goto phooey;
4148     }
4149
4150     /* clean up; this will trigger destructors that will free all slabs
4151      * above the current one, and cleanup the regmatch_info_aux
4152      * and regmatch_info_aux_eval sructs */
4153
4154     LEAVE_SCOPE(oldsave);
4155
4156     if (RXp_PAREN_NAMES(prog))
4157         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4158
4159     /* make sure $`, $&, $', and $digit will work later */
4160     if ( !(flags & REXEC_NOT_FIRST) )
4161         S_reg_set_capture_string(aTHX_ rx,
4162                                     strbeg, reginfo->strend,
4163                                     sv, flags, utf8_target);
4164
4165     return 1;
4166
4167   phooey:
4168     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
4169                           PL_colors[4], PL_colors[5]));
4170
4171     if (swap) {
4172         /* we failed :-( roll it back.
4173          * Since the swap buffer will be freed on scope exit which follows
4174          * shortly, restore the old captures by copying 'swap's original
4175          * data to the new offs buffer
4176          */
4177         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4178             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4179             0,
4180             PTR2UV(prog),
4181             PTR2UV(prog->offs),
4182             PTR2UV(swap)
4183         ));
4184
4185         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
4186     }
4187
4188     /* clean up; this will trigger destructors that will free all slabs
4189      * above the current one, and cleanup the regmatch_info_aux
4190      * and regmatch_info_aux_eval sructs */
4191
4192     LEAVE_SCOPE(oldsave);
4193
4194     return 0;
4195 }
4196
4197
4198 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4199  * Do inc before dec, in case old and new rex are the same */
4200 #define SET_reg_curpm(Re2)                          \
4201     if (reginfo->info_aux_eval) {                   \
4202         (void)ReREFCNT_inc(Re2);                    \
4203         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
4204         PM_SETRE((PL_reg_curpm), (Re2));            \
4205     }
4206
4207
4208 /*
4209  - regtry - try match at specific point
4210  */
4211 STATIC bool                     /* 0 failure, 1 success */
4212 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4213 {
4214     CHECKPOINT lastcp;
4215     REGEXP *const rx = reginfo->prog;
4216     regexp *const prog = ReANY(rx);
4217     SSize_t result;
4218 #ifdef DEBUGGING
4219     U32 depth = 0; /* used by REGCP_SET */
4220 #endif
4221     RXi_GET_DECL(prog,progi);
4222     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4223
4224     PERL_ARGS_ASSERT_REGTRY;
4225
4226     reginfo->cutpoint=NULL;
4227
4228     prog->offs[0].start = *startposp - reginfo->strbeg;
4229     prog->lastparen = 0;
4230     prog->lastcloseparen = 0;
4231
4232     /* XXXX What this code is doing here?!!!  There should be no need
4233        to do this again and again, prog->lastparen should take care of
4234        this!  --ilya*/
4235
4236     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4237      * Actually, the code in regcppop() (which Ilya may be meaning by
4238      * prog->lastparen), is not needed at all by the test suite
4239      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4240      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4241      * Meanwhile, this code *is* needed for the
4242      * above-mentioned test suite tests to succeed.  The common theme
4243      * on those tests seems to be returning null fields from matches.
4244      * --jhi updated by dapm */
4245
4246     /* After encountering a variant of the issue mentioned above I think
4247      * the point Ilya was making is that if we properly unwind whenever
4248      * we set lastparen to a smaller value then we should not need to do
4249      * this every time, only when needed. So if we have tests that fail if
4250      * we remove this, then it suggests somewhere else we are improperly
4251      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4252      * places it is called, and related regcp() routines. - Yves */
4253 #if 1
4254     if (prog->nparens) {
4255         regexp_paren_pair *pp = prog->offs;
4256         I32 i;
4257         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4258             ++pp;
4259             pp->start = -1;
4260             pp->end = -1;
4261         }
4262     }
4263 #endif
4264     REGCP_SET(lastcp);
4265     result = regmatch(reginfo, *startposp, progi->program + 1);
4266     if (result != -1) {
4267         prog->offs[0].end = result;
4268         return 1;
4269     }
4270     if (reginfo->cutpoint)
4271         *startposp= reginfo->cutpoint;
4272     REGCP_UNWIND(lastcp);
4273     return 0;
4274 }
4275
4276 /* this is used to determine how far from the left messages like
4277    'failed...' are printed in regexec.c. It should be set such that
4278    messages are inline with the regop output that created them.
4279 */
4280 #define REPORT_CODE_OFF 29
4281 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4282 #ifdef DEBUGGING
4283 int
4284 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4285 {
4286     va_list ap;
4287     int result;
4288     PerlIO *f= Perl_debug_log;
4289     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4290     va_start(ap, depth);
4291     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4292     result = PerlIO_vprintf(f, fmt, ap);
4293     va_end(ap);
4294     return result;
4295 }
4296 #endif /* DEBUGGING */
4297
4298 /* grab a new slab and return the first slot in it */
4299
4300 STATIC regmatch_state *
4301 S_push_slab(pTHX)
4302 {
4303     regmatch_slab *s = PL_regmatch_slab->next;
4304     if (!s) {
4305         Newx(s, 1, regmatch_slab);
4306         s->prev = PL_regmatch_slab;
4307         s->next = NULL;
4308         PL_regmatch_slab->next = s;
4309     }
4310     PL_regmatch_slab = s;
4311     return SLAB_FIRST(s);
4312 }
4313
4314 #ifdef DEBUGGING
4315
4316 STATIC void
4317 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4318     const char *start, const char *end, const char *blurb)
4319 {
4320     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4321
4322     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4323
4324     if (!PL_colorset)
4325             reginitcolors();
4326     {
4327         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4328             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4329
4330         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4331             start, end - start, PL_dump_re_max_len);
4332
4333         Perl_re_printf( aTHX_
4334             "%s%s REx%s %s against %s\n",
4335                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4336
4337         if (utf8_target||utf8_pat)
4338             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4339                 utf8_pat ? "pattern" : "",
4340                 utf8_pat && utf8_target ? " and " : "",
4341                 utf8_target ? "string" : ""
4342             );
4343     }
4344 }
4345
4346 STATIC void
4347 S_dump_exec_pos(pTHX_ const char *locinput,
4348                       const regnode *scan,
4349                       const char *loc_regeol,
4350                       const char *loc_bostr,
4351                       const char *loc_reg_starttry,
4352                       const bool utf8_target,
4353                       const U32 depth
4354                 )
4355 {
4356     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4357     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4358     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4359     /* The part of the string before starttry has one color
4360        (pref0_len chars), between starttry and current
4361        position another one (pref_len - pref0_len chars),
4362        after the current position the third one.
4363        We assume that pref0_len <= pref_len, otherwise we
4364        decrease pref0_len.  */
4365     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4366         ? (5 + taill) - l : locinput - loc_bostr;
4367     int pref0_len;
4368
4369     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4370
4371     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4372         pref_len++;
4373     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4374     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4375         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4376               ? (5 + taill) - pref_len : loc_regeol - locinput);
4377     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4378         l--;
4379     if (pref0_len < 0)
4380         pref0_len = 0;
4381     if (pref0_len > pref_len)
4382         pref0_len = pref_len;
4383     {
4384         const int is_uni = utf8_target ? 1 : 0;
4385
4386         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4387             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4388
4389         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4390                     (locinput - pref_len + pref0_len),
4391                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4392
4393         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4394                     locinput, loc_regeol - locinput, 10, 0, 1);
4395
4396         const STRLEN tlen=len0+len1+len2;
4397         Perl_re_printf( aTHX_
4398                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4399                     (IV)(locinput - loc_bostr),
4400                     len0, s0,
4401                     len1, s1,
4402                     (docolor ? "" : "> <"),
4403                     len2, s2,
4404                     (int)(tlen > 19 ? 0 :  19 - tlen),
4405                     "",
4406                     depth);
4407     }
4408 }
4409
4410 #endif
4411
4412 /* reg_check_named_buff_matched()
4413  * Checks to see if a named buffer has matched. The data array of
4414  * buffer numbers corresponding to the buffer is expected to reside
4415  * in the regexp->data->data array in the slot stored in the ARG() of
4416  * node involved. Note that this routine doesn't actually care about the
4417  * name, that information is not preserved from compilation to execution.
4418  * Returns the index of the leftmost defined buffer with the given name
4419  * or 0 if non of the buffers matched.
4420  */
4421 STATIC I32
4422 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4423 {
4424     I32 n;
4425     RXi_GET_DECL(rex,rexi);
4426     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4427     I32 *nums=(I32*)SvPVX(sv_dat);
4428
4429     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4430
4431     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4432         if ((I32)rex->lastparen >= nums[n] &&
4433             rex->offs[nums[n]].end != -1)
4434         {
4435             return nums[n];
4436         }
4437     }
4438     return 0;
4439 }
4440
4441 static bool
4442 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4443                           struct next_matchable_info * m,
4444                           regmatch_info *reginfo)
4445 {
4446     /* This function determines various characteristics about every possible
4447      * initial match of the passed-in EXACTish <text_node>, and stores them in
4448      * <*m>.
4449      *
4450      * That includes a match string and a parallel mask, such that if you AND
4451      * the target string with the mask and compare with the match string,
4452      * you'll have a pretty good idea, perhaps even perfect, if that portion of
4453      * the target matches or not.
4454      *
4455      * The motivation behind this function is to allow the caller to set up
4456      * tight loops for matching.  Consider patterns like '.*B' or '.*?B' where
4457      * B is an arbitrary EXACTish node.  To find the end of .*, we look for the
4458      * beginning oF B, which is the passed in <text_node>  That's where this
4459      * function comes in.  The values it returns can quickly be used to rule
4460      * out many, or all, cases of possible matches not actually being the
4461      * beginning of B, <text_node>.  It is also used in regrepeat() where we
4462      * have 'A*', for arbitrary 'A'.  This sets up criteria to more efficiently
4463      * determine where the span of 'A's stop.
4464      *
4465      * If <text_node> is of type EXACT, there is only one possible character
4466      * that can match its first character, and so the situation is quite
4467      * simple.  But things can get much more complicated if folding is
4468      * involved.  It may be that the first character of an EXACTFish node
4469      * doesn't participate in any possible fold, e.g., punctuation, so it can
4470      * be matched only by itself.  The vast majority of characters that are in
4471      * folds match just two things, their lower and upper-case equivalents.
4472      * But not all are like that; some have multiple possible matches, or match
4473      * sequences of more than one character.  This function sorts all that out.
4474      *
4475      * It returns information about all possibilities of what the first
4476      * character(s) of <text_node> could look like.  Again, if <text_node> is a
4477      * plain EXACT node, that's just the actual first bytes of the first
4478      * character; but otherwise it is the bytes, that when masked, match all
4479      * possible combinations of all the initial bytes of all the characters
4480      * that could match, folded.  (Actually, this is a slight over promise.  It
4481      * handles only up to the initial 5 bytes, which is enough for all Unicode
4482      * characters, but not for all non-Unicode ones.)
4483      *
4484      * Here's an example to clarify.  Suppose the first character of
4485      * <text_node> is the letter 'C', and we are under /i matching.  That means
4486      * 'c' also matches.  The representations of these two characters differ in
4487      * just one bit, so the mask would be a zero in that position and ones in
4488      * the other 7.  And the returned string would be the AND of these two
4489      * characters, and would be one byte long, since these characters are each
4490      * a single byte.  ANDing the target <text_node> with this mask will yield
4491      * the returned string if and only if <text_node> begins with one of these
4492      * two characters.  So, the function would also return that the definitive
4493      * length matched is 1 byte.
4494      *
4495      * Now, suppose instead of the letter 'C',  <text_node> begins with the
4496      * letter 'F'.  The situation is much more complicated because there are
4497      * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4498      * begins with 'f', and hence could match.  We add these into the returned
4499      * string and mask, but the result isn't definitive; the caller has to
4500      * check further if its AND and compare pass.  But the failure of that
4501      * compare will quickly rule out most possible inputs.
4502      *
4503      * Much of this could be done in regcomp.c at compile time, except for
4504      * locale-dependent, and UTF-8 target dependent data.  Extra data fields
4505      * could be used for one or the other eventualities.
4506      *
4507      * If this function determines that no possible character in the target
4508      * string can match, it returns FALSE; otherwise TRUE.  (The FALSE
4509      * situation occurs if the first character in <text_node> requires UTF-8 to
4510      * represent, and the target string isn't in UTF-8.)
4511      *
4512      * Some analysis is in GH #18414, located at the time of this writing at:
4513      * https://github.com/Perl/perl5/issues/18414
4514      */
4515
4516     const bool utf8_target = reginfo->is_utf8_target;
4517     bool utf8_pat = reginfo->is_utf8_pat;
4518
4519     PERL_UINT_FAST8_T i;
4520
4521     /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4522      */
4523     U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4524     U8 lengths[MAX_MATCHES] = { 0 };
4525
4526     U8 index_of_longest = 0;
4527
4528     U8 *pat = (U8*)STRING(text_node);
4529     Size_t pat_len = STR_LEN(text_node);
4530     U8 op = OP(text_node);
4531
4532     U8 byte_mask[5]  = {0};
4533     U8 byte_anded[5] = {0};
4534
4535     /* There are some folds in Unicode to multiple characters.  This will hold
4536      * such characters that could fold to the beginning of 'text_node' */
4537     UV multi_fold_from = 0;
4538
4539     /* We may have to create a modified copy of the pattern */
4540     U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4541
4542     m->max_length = 0;
4543     m->min_length = 255;
4544     m->count = 0;
4545
4546     /* Even if the first character in the node can match something in Latin1,
4547      * if there is anything in the node that can't, the match must fail */
4548     if (! utf8_target && isEXACT_REQ8(op)) {
4549         return FALSE;
4550     }
4551
4552 /* Define a temporary op for use in this function, using an existing one that
4553  * should never be a real op during execution */
4554 #define TURKISH  PSEUDO
4555
4556     /* What to do about these two nodes had to be deferred to runtime (which is
4557      * now).  If the extra information we now have so indicates, turn them into
4558      * EXACTFU nodes */
4559     if (   (op == EXACTF && utf8_target)
4560         || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4561     {
4562         if (op == EXACTFL && PL_in_utf8_turkic_locale) {
4563             op = TURKISH;
4564         }
4565         else {
4566             op = EXACTFU;
4567         }
4568
4569         /* And certain situations are better handled if we create a modified
4570          * version of the pattern */
4571         if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4572                            specific problematic characters */
4573             if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4574
4575                 /* The node could start with characters that are the first ones
4576                  * of a multi-character fold. */
4577                 multi_fold_from
4578                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4579                 if (multi_fold_from) {
4580
4581                     /* Here, they do form a sequence that matches the fold of a
4582                      * single character.  That single character then is a
4583                      * possible match.  Below we will look again at this, but
4584                      * the code below is expecting every character in the
4585                      * pattern to be folded, which the input isn't required to
4586                      * be in this case.  So, just fold the single character,
4587                      * and the result will be in the expected form. */
4588                     _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4589                                        FOLD_FLAGS_FULL);
4590                     pat = mod_pat;
4591                 }
4592                          /* Turkish has a couple extra possibilities. */
4593                 else if (   UNLIKELY(op == TURKISH)
4594                          &&  pat_len >= 3
4595                          &&  isALPHA_FOLD_EQ(pat[0], 'f')
4596                          && (   memBEGINs(pat + 1, pat_len - 1,
4597                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4598                              || (   pat_len >= 4
4599                                  && isALPHA_FOLD_EQ(pat[1], 'f')
4600                                  && memBEGINs(pat + 2, pat_len - 2,
4601                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4602                 ))) {
4603                     /* The macros for finding a multi-char fold don't include
4604                      * the Turkish possibilities, in which U+130 folds to 'i'.
4605                      * Hard-code these.  It's very unlikely that Unicode will
4606                      * ever add any others.  */
4607                     if (pat[1] == 'f') {
4608                         pat_len = 3;
4609                         Copy("ffi", mod_pat, pat_len, U8);
4610                     }
4611                     else {
4612                         pat_len = 2;
4613                         Copy("fi", mod_pat, pat_len, U8);
4614                     }
4615                     pat = mod_pat;
4616                 }
4617                 else if (    UTF8_IS_DOWNGRADEABLE_START(*pat)
4618                          &&  LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4619                          &&  LIKELY(memNEs(pat, pat_len,
4620                                            LATIN_SMALL_LETTER_SHARP_S_UTF8))
4621                          && (LIKELY(op != TURKISH || *pat != 'I')))
4622                 {
4623                     /* For all cases of things between 0-255, except the ones
4624                      * in the conditional above, the fold is just the lower
4625                      * case, which is faster than the more general case. */
4626                     mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4627                                                                      pat[1]));
4628                     pat_len = 1;
4629                     pat = mod_pat;
4630                     utf8_pat = FALSE;
4631                 }
4632                 else {  /* Code point above 255, or needs special handling */
4633                     _to_utf8_fold_flags(pat, pat + pat_len,
4634                                         mod_pat, &pat_len,
4635                                         FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4636                     pat = mod_pat;
4637                 }
4638             }
4639         }
4640         else if /* Below is not a UTF-8 pattern; there's a somewhat different
4641                    set of problematic characters */
4642                 ((multi_fold_from
4643                           = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4644         {
4645             /* We may have to canonicalize a multi-char fold, as in the UTF-8
4646              * case */
4647             _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4648                                FOLD_FLAGS_FULL);
4649             pat = mod_pat;
4650         }
4651         else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4652             mod_pat[0] = mod_pat[1] = 's';
4653             pat_len = 2;
4654             utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4655                                        chars, and speeds copying */
4656             pat = mod_pat;
4657         }
4658         else if (LIKELY(op != TURKISH || *pat != 'I')) {
4659             mod_pat[0] = toLOWER_L1(*pat);
4660             pat_len = 1;
4661             pat = mod_pat;
4662         }
4663     }
4664     else if /* Below isn't a node that we convert to UTF-8 */
4665             (     utf8_target
4666              && ! utf8_pat
4667              &&   op == EXACTFAA_NO_TRIE
4668              &&  *pat == LATIN_SMALL_LETTER_SHARP_S)
4669     {
4670         /* A very special case.  Folding U+DF goes to U+17F under /iaa.  We
4671          * did this at compile time when the pattern was UTF-8 , but otherwise
4672          * we couldn't do it earlier, because it requires a UTF-8 target for
4673          * this match to be legal. */
4674         pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4675         Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4676              LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4677         pat = mod_pat;
4678         utf8_pat = TRUE;
4679     }
4680
4681     /* Here, we have taken care of the initial work for a few very problematic
4682      * situations, possibly creating a modified pattern.
4683      *
4684      * Now ready for the general case.  We build up all the possible things
4685      * that could match the first character of the pattern into the elements of
4686      * 'matches[]'
4687      *
4688      * Everything generally matches at least itself.  But if there is a
4689      * UTF8ness mismatch, we have to convert to that of the target string. */
4690     if (UTF8_IS_INVARIANT(*pat)) {  /* Immaterial if either is in UTF-8 */
4691         matches[0][0] = pat[0];
4692         lengths[0] = 1;
4693         m->count++;
4694     }
4695     else if (utf8_target) {
4696         if (utf8_pat) {
4697             lengths[0] = UTF8SKIP(pat);
4698             Copy(pat, matches[0], lengths[0], U8);
4699             m->count++;
4700         }
4701         else {  /* target is UTF-8, pattern isn't */
4702             matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4703             matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4704             lengths[0] = 2;
4705             m->count++;
4706         }
4707     }
4708     else if (! utf8_pat) {  /* Neither is UTF-8 */
4709         matches[0][0] = pat[0];
4710         lengths[0] = 1;
4711         m->count++;
4712     }
4713     else     /* target isn't UTF-8; pattern is.  No match possible unless the
4714                 pattern's first character can fit in a byte */
4715          if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4716     {
4717         matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4718         lengths[0] = 1;
4719         m->count++;
4720     }
4721
4722     /* Here we have taken care of any necessary node-type changes */
4723
4724     if (m->count) {
4725         m->max_length = lengths[0];
4726         m->min_length = lengths[0];
4727     }
4728
4729     /* For non-folding nodes, there are no other possible candidate matches,
4730      * but for foldable ones, we have to look further. */
4731     if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4732         UV folded;  /* The first character in the pattern, folded */
4733         U32 first_fold_from;    /* A character that folds to it */
4734         const U32 * remaining_fold_froms;   /* The remaining characters that
4735                                                fold to it, if any */
4736         Size_t folds_to_count;  /* The total number of characters that fold to
4737                                    'folded' */
4738
4739         /* If the node begins with a sequence of more than one character that
4740          * together form the fold of a single character, it is called a
4741          * 'multi-character fold', and the normal functions don't handle this
4742          * case.  We set 'multi_fold_from' to the single folded-from character,
4743          * which is handled in an extra iteration below */
4744         if (utf8_pat) {
4745             folded = valid_utf8_to_uvchr(pat, NULL);
4746             multi_fold_from
4747                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4748         }
4749         else {
4750             folded = *pat;
4751
4752             /* This may generate illegal combinations for things like EXACTF,
4753              * but rather than repeat the logic and exclude them here, all such
4754              * illegalities are checked for and skipped below in the loop */
4755             multi_fold_from
4756                         = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4757         }
4758
4759         /* Everything matches at least itself; initialize to that because the
4760          * only the branches below that set it are the ones where the number
4761          * isn't 1. */
4762         folds_to_count = 1;
4763
4764         /* There are a few special cases for locale-dependent nodes, where the
4765          * run-time context was needed before we could know what matched */
4766         if (UNLIKELY(op == EXACTFL) && folded < 256)  {
4767             first_fold_from = PL_fold_locale[folded];
4768         }
4769         else if (   op == EXACTFL && utf8_target && utf8_pat
4770                  && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4771                                             LATIN_SMALL_LETTER_LONG_S_UTF8))
4772         {
4773             first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4774         }
4775         else if (UNLIKELY(    op == TURKISH
4776                           && (   isALPHA_FOLD_EQ(folded, 'i')
4777                               || inRANGE(folded,
4778                                          LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4779                                          LATIN_SMALL_LETTER_DOTLESS_I))))
4780         {   /* Turkish folding requires special handling */
4781             if (folded == 'i')
4782                 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4783             else if (folded == 'I')
4784                 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4785             else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4786                 first_fold_from = 'i';
4787             else first_fold_from = 'I';
4788         }
4789         else {
4790             /* Here, isn't a special case: use the generic function to
4791              * calculate what folds to this */
4792           redo_multi:
4793             /* Look up what code points (besides itself) fold to 'folded';
4794              * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4795             folds_to_count = _inverse_folds(folded, &first_fold_from,
4796                                                        &remaining_fold_froms);
4797         }
4798
4799         /* Add each character that folds to 'folded' to the list of them,
4800          * subject to limitations based on the node type and target UTF8ness.
4801          * If there was a character that folded to multiple characters, do an
4802          * extra iteration for it.  (Note the extra iteration if there is a
4803          * multi-character fold) */
4804         for (i = 0; i < folds_to_count
4805                       + UNLIKELY(multi_fold_from != 0); i++)
4806         {
4807             UV fold_from = 0;
4808
4809             if (i >= folds_to_count) {  /* Final iteration: handle the
4810                                            multi-char */
4811                 fold_from = multi_fold_from;
4812             }
4813             else if (i == 0) {
4814                 fold_from = first_fold_from;
4815             }
4816             else if (i < folds_to_count) {
4817                 fold_from = remaining_fold_froms[i-1];
4818             }
4819
4820             if (folded == fold_from) {  /* We already added the character
4821                                            itself */
4822                 continue;
4823             }
4824
4825             /* EXACTF doesn't have any non-ascii folds */
4826             if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4827                 continue;
4828             }
4829
4830             /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4831              * */
4832             if (    isASCII(folded) != isASCII(fold_from)
4833                 &&  inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4834
4835             {
4836                 continue;
4837             }
4838
4839             /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4840              * locale, but those have been converted to EXACTFU above) */
4841             if (   op == EXACTFL
4842                 && (folded < 256) != (fold_from < 256))
4843             {
4844                 continue;
4845             }
4846
4847             /* If this triggers, it likely is because of the unlikely case
4848              * where a new Unicode standard has changed what MAX_MATCHES should
4849              * be set to */
4850             assert(m->count < MAX_MATCHES);
4851
4852             /* Add this character to the list of possible matches */
4853             if (utf8_target) {
4854                 uvchr_to_utf8(matches[m->count], fold_from);
4855                 lengths[m->count] = UVCHR_SKIP(fold_from);
4856                 m->count++;
4857             }
4858             else { /* Non-UTF8 target: no code point above 255 can appear in it
4859                     */
4860                 if (fold_from > 255) {
4861                     continue;
4862                 }
4863
4864                 matches[m->count][0] = fold_from;
4865                 lengths[m->count] = 1;
4866                 m->count++;
4867             }
4868
4869             /* Update min and mlengths */
4870             if (m->min_length > lengths[m->count-1]) {
4871                 m->min_length = lengths[m->count-1];
4872             }
4873
4874             if (m->max_length < lengths[m->count-1]) {
4875                 index_of_longest = m->count - 1;
4876                 m->max_length = lengths[index_of_longest];
4877             }
4878         } /* looped through each potential fold */
4879
4880         /* If there is something that folded to an initial multi-character
4881          * fold, repeat, using it.  This catches some edge cases.  An example
4882          * of one is /ss/i when UTF-8 encoded.  The function
4883          * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
4884          * (LATIN SMALL SHARP S).  If it returned a list of characters, this
4885          * code wouldn't be needed.  But since it doesn't, we have to look what
4886          * folds to the U+DF.  In this case, U+1E9E does, and has to be added.
4887          * */
4888         if (multi_fold_from) {
4889             folded = multi_fold_from;
4890             multi_fold_from = 0;
4891             goto redo_multi;
4892         }
4893     } /* End of finding things that participate in this fold */
4894
4895     if (m->count == 0) {    /* If nothing found, can't match */
4896         m->min_length = 0;
4897         return FALSE;
4898     }
4899
4900     /* Have calculated all possible matches.  Now calculate the mask and AND
4901      * values */
4902     m->initial_exact = 0;
4903     m->initial_definitive = 0;
4904
4905     {
4906         unsigned int mask_ones = 0;
4907         unsigned int possible_ones = 0;
4908         U8 j;
4909
4910         /* For each byte that is in all possible matches ... */
4911         for (j = 0; j < MIN(m->min_length, 5); j++) {
4912
4913             /* Initialize the accumulator for this byte */
4914             byte_mask[j] = 0xFF;
4915             byte_anded[j] = matches[0][j];
4916
4917             /* Then the rest of the rows (folds).  The mask is based on, like,
4918              * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
4919              * where they differ. */
4920             for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
4921                 byte_mask[j]  &= ~ (byte_anded[j] ^ matches[i][j]);
4922                 byte_anded[j] &= matches[i][j];
4923             }
4924
4925             /* Keep track of the number of initial mask bytes that are all one
4926              * bits.  The code calling this can use this number to know that
4927              * a string that matches this number of bytes in the pattern is an
4928              * exact match of that pattern for this number of bytes.  But also
4929              * counted are the number of initial bytes that in total have a
4930              * single zero bit.  If a string matches those, masked, it must be
4931              * one of two possibilites, both of which this function has
4932              * determined are legal.  (But if that single 0 is one of the
4933              * initial bits for masking a UTF-8 start byte, that could
4934              * incorrectly lead to different length strings appearing to be
4935              * equivalent, so only do this optimization when the matchables are
4936              * all the same length.  This was uncovered by testing
4937              * /\x{029E}/i.) */
4938             if (m->min_length == m->max_length) {
4939                 mask_ones += PL_bitcount[byte_mask[j]];
4940                 possible_ones += 8;
4941                 if (mask_ones + 1 >= possible_ones) {
4942                     m->initial_definitive++;
4943                     if (mask_ones >= possible_ones) {
4944                         m->initial_exact++;
4945                     }
4946                 }
4947             }
4948         }
4949     }
4950
4951     /* The first byte is separate for speed */
4952     m->first_byte_mask = byte_mask[0];
4953     m->first_byte_anded = byte_anded[0];
4954
4955     /* Then pack up to the next 4 bytes into a word */
4956     m->mask32 = m->anded32 = 0;
4957     for (i = 1; i < MIN(m->min_length, 5); i++) {
4958         U8 which = i;
4959         U8 shift = (which - 1) * 8;
4960         m->mask32  |= (U32) byte_mask[i]  << shift;
4961         m->anded32 |= (U32) byte_anded[i] << shift;
4962     }
4963
4964     /* Finally, take the match strings and place them sequentially into a
4965      * one-dimensional array.  (This is done to save significant space in the
4966      * structure.) Sort so the longest (presumably the least likely) is last.
4967      * XXX When this gets moved to regcomp, may want to fully sort shortest
4968      * first, but above we generally used the folded code point first, and
4969      * those tend to be no longer than their upper case values, so this is
4970      * already pretty well sorted by size.
4971      *
4972      * If the asserts fail, it's most likely because a new version of the
4973      * Unicode standard requires more space; simply increase the declaration
4974      * size. */
4975     {
4976         U8 cur_pos = 0;
4977         U8 output_index = 0;
4978
4979         if (m->count > 1) { /* No need to sort a single entry */
4980             for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
4981
4982                 /* Keep the same order for all but the longest.  (If the
4983                  * asserts fail, it could be because m->matches is declared too
4984                  * short, either because of a new Unicode release, or an
4985                  * overlooked test case, or it could be a bug.) */
4986                 if (i != index_of_longest) {
4987                     assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
4988                     Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
4989                     cur_pos += lengths[i];
4990                     m->lengths[output_index++] = lengths[i];
4991                 }
4992             }
4993         }
4994
4995         assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
4996         Copy(matches[index_of_longest], m->matches + cur_pos,
4997              lengths[index_of_longest], U8);
4998
4999         /* Place the longest match last */
5000         m->lengths[output_index] = lengths[index_of_longest];
5001     }
5002
5003
5004     return TRUE;
5005 }
5006
5007 PERL_STATIC_FORCE_INLINE    /* We want speed at the expense of size */
5008 bool
5009 S_test_EXACTISH_ST(const char * loc,
5010                    struct next_matchable_info info)
5011 {
5012     /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5013      * bytes starting at 'loc' can match based on 'next_matchable_info' */
5014
5015     U32 input32 = 0;
5016
5017     /* Check the first byte */
5018     if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5019         return FALSE;
5020
5021     /* Pack the next up-to-4 bytes into a 32 bit word */
5022     switch (info.min_length) {
5023         default:
5024             input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5025             /* FALLTHROUGH */
5026         case 4:
5027             input32 |= (U8) loc[3] << 2 * 8;
5028             /* FALLTHROUGH */
5029         case 3:
5030             input32 |= (U8) loc[2] << 1 * 8;
5031             /* FALLTHROUGH */
5032         case 2:
5033             input32 |= (U8) loc[1];
5034             break;
5035         case 1:
5036             return TRUE;    /* We already tested and passed the 0th byte */
5037         case 0:
5038             ASSUME(0);
5039     }
5040
5041     /* And AND that with the mask and compare that with the assembled ANDED
5042      * values */
5043     return (input32 & info.mask32) == info.anded32;
5044 }
5045
5046 STATIC bool
5047 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5048 {
5049     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5050      * between the inputs.  See https://www.unicode.org/reports/tr29/. */
5051
5052     PERL_ARGS_ASSERT_ISGCB;
5053
5054     switch (GCB_table[before][after]) {
5055         case GCB_BREAKABLE:
5056             return TRUE;
5057
5058         case GCB_NOBREAK:
5059             return FALSE;
5060
5061         case GCB_RI_then_RI:
5062             {
5063                 int RI_count = 1;
5064                 U8 * temp_pos = (U8 *) curpos;
5065
5066                 /* Do not break within emoji flag sequences. That is, do not
5067                  * break between regional indicator (RI) symbols if there is an
5068                  * odd number of RI characters before the break point.
5069                  *  GB12   sot (RI RI)* RI × RI
5070                  *  GB13 [^RI] (RI RI)* RI × RI */
5071
5072                 while (backup_one_GCB(strbeg,
5073                                     &temp_pos,
5074                                     utf8_target) == GCB_Regional_Indicator)
5075                 {
5076                     RI_count++;
5077                 }
5078
5079                 return RI_count % 2 != 1;
5080             }
5081
5082         case GCB_EX_then_EM:
5083
5084             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
5085             {
5086                 U8 * temp_pos = (U8 *) curpos;
5087                 GCB_enum prev;
5088
5089                 do {
5090                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5091                 }
5092                 while (prev == GCB_Extend);
5093
5094                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5095             }
5096
5097         case GCB_Maybe_Emoji_NonBreak:
5098
5099             {
5100
5101             /* Do not break within emoji modifier sequences or emoji zwj sequences.
5102               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5103               */
5104                 U8 * temp_pos = (U8 *) curpos;
5105                 GCB_enum prev;
5106
5107                 do {
5108                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5109                 }
5110                 while (prev == GCB_Extend);
5111
5112                 return prev != GCB_ExtPict_XX;
5113             }
5114
5115         default:
5116             break;
5117     }
5118
5119 #ifdef DEBUGGING
5120     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5121                                   before, after, GCB_table[before][after]);
5122     assert(0);
5123 #endif
5124     return TRUE;
5125 }
5126
5127 STATIC GCB_enum
5128 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5129 {
5130     GCB_enum gcb;
5131
5132     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5133
5134     if (*curpos < strbeg) {
5135         return GCB_EDGE;
5136     }
5137
5138     if (utf8_target) {
5139         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5140         U8 * prev_prev_char_pos;
5141
5142         if (! prev_char_pos) {
5143             return GCB_EDGE;
5144         }
5145
5146         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5147             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5148             *curpos = prev_char_pos;
5149             prev_char_pos = prev_prev_char_pos;
5150         }
5151         else {
5152             *curpos = (U8 *) strbeg;
5153             return GCB_EDGE;
5154         }
5155     }
5156     else {
5157         if (*curpos - 2 < strbeg) {
5158             *curpos = (U8 *) strbeg;
5159             return GCB_EDGE;
5160         }
5161         (*curpos)--;
5162         gcb = getGCB_VAL_CP(*(*curpos - 1));
5163     }
5164
5165     return gcb;
5166 }
5167
5168 /* Combining marks attach to most classes that precede them, but this defines
5169  * the exceptions (from TR14) */
5170 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
5171                                      || prev == LB_Mandatory_Break      \
5172                                      || prev == LB_Carriage_Return      \
5173                                      || prev == LB_Line_Feed            \
5174                                      || prev == LB_Next_Line            \
5175                                      || prev == LB_Space                \
5176                                      || prev == LB_ZWSpace))
5177
5178 STATIC bool
5179 S_isLB(pTHX_ LB_enum before,
5180              LB_enum after,
5181              const U8 * const strbeg,
5182              const U8 * const curpos,
5183              const U8 * const strend,
5184              const bool utf8_target)
5185 {
5186     U8 * temp_pos = (U8 *) curpos;
5187     LB_enum prev = before;
5188
5189     /* Is the boundary between 'before' and 'after' line-breakable?
5190      * Most of this is just a table lookup of a generated table from Unicode
5191      * rules.  But some rules require context to decide, and so have to be
5192      * implemented in code */
5193
5194     PERL_ARGS_ASSERT_ISLB;
5195
5196     /* Rule numbers in the comments below are as of Unicode 9.0 */
5197
5198   redo:
5199     before = prev;
5200     switch (LB_table[before][after]) {
5201         case LB_BREAKABLE:
5202             return TRUE;
5203
5204         case LB_NOBREAK:
5205         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5206             return FALSE;
5207
5208         case LB_SP_foo + LB_BREAKABLE:
5209         case LB_SP_foo + LB_NOBREAK:
5210         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5211
5212             /* When we have something following a SP, we have to look at the
5213              * context in order to know what to do.
5214              *
5215              * SP SP should not reach here because LB7: Do not break before
5216              * spaces.  (For two spaces in a row there is nothing that
5217              * overrides that) */
5218             assert(after != LB_Space);
5219
5220             /* Here we have a space followed by a non-space.  Mostly this is a
5221              * case of LB18: "Break after spaces".  But there are complications
5222              * as the handling of spaces is somewhat tricky.  They are in a
5223              * number of rules, which have to be applied in priority order, but
5224              * something earlier in the string can cause a rule to be skipped
5225              * and a lower priority rule invoked.  A prime example is LB7 which
5226              * says don't break before a space.  But rule LB8 (lower priority)
5227              * says that the first break opportunity after a ZW is after any
5228              * span of spaces immediately after it.  If a ZW comes before a SP
5229              * in the input, rule LB8 applies, and not LB7.  Other such rules
5230              * involve combining marks which are rules 9 and 10, but they may
5231              * override higher priority rules if they come earlier in the
5232              * string.  Since we're doing random access into the middle of the
5233              * string, we have to look for rules that should get applied based
5234              * on both string position and priority.  Combining marks do not
5235              * attach to either ZW nor SP, so we don't have to consider them
5236              * until later.
5237              *
5238              * To check for LB8, we have to find the first non-space character
5239              * before this span of spaces */
5240             do {
5241                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5242             }
5243             while (prev == LB_Space);
5244
5245             /* LB8 Break before any character following a zero-width space,
5246              * even if one or more spaces intervene.
5247              *      ZW SP* ÷
5248              * So if we have a ZW just before this span, and to get here this
5249              * is the final space in the span. */
5250             if (prev == LB_ZWSpace) {
5251                 return TRUE;
5252             }
5253
5254             /* Here, not ZW SP+.  There are several rules that have higher
5255              * priority than LB18 and can be resolved now, as they don't depend
5256              * on anything earlier in the string (except ZW, which we have
5257              * already handled).  One of these rules is LB11 Do not break
5258              * before Word joiner, but we have specially encoded that in the
5259              * lookup table so it is caught by the single test below which
5260              * catches the other ones. */
5261             if (LB_table[LB_Space][after] - LB_SP_foo
5262                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5263             {
5264                 return FALSE;
5265             }
5266
5267             /* If we get here, we have to XXX consider combining marks. */
5268             if (prev == LB_Combining_Mark) {
5269
5270                 /* What happens with these depends on the character they
5271                  * follow.  */
5272                 do {
5273                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5274                 }
5275                 while (prev == LB_Combining_Mark);
5276
5277                 /* Most times these attach to and inherit the characteristics
5278                  * of that character, but not always, and when not, they are to
5279                  * be treated as AL by rule LB10. */
5280                 if (! LB_CM_ATTACHES_TO(prev)) {
5281                     prev = LB_Alphabetic;
5282                 }
5283             }
5284
5285             /* Here, we have the character preceding the span of spaces all set
5286              * up.  We follow LB18: "Break after spaces" unless the table shows
5287              * that is overriden */
5288             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5289
5290         case LB_CM_ZWJ_foo:
5291
5292             /* We don't know how to treat the CM except by looking at the first
5293              * non-CM character preceding it.  ZWJ is treated as CM */
5294             do {
5295                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5296             }
5297             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5298
5299             /* Here, 'prev' is that first earlier non-CM character.  If the CM
5300              * attatches to it, then it inherits the behavior of 'prev'.  If it
5301              * doesn't attach, it is to be treated as an AL */
5302             if (! LB_CM_ATTACHES_TO(prev)) {
5303                 prev = LB_Alphabetic;
5304             }
5305
5306             goto redo;
5307
5308         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5309         case LB_HY_or_BA_then_foo + LB_NOBREAK:
5310
5311             /* LB21a Don't break after Hebrew + Hyphen.
5312              * HL (HY | BA) × */
5313
5314             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5315                                                           == LB_Hebrew_Letter)
5316             {
5317                 return FALSE;
5318             }
5319
5320             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5321
5322         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5323         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5324
5325             /* LB25a (PR | PO) × ( OP | HY )? NU */
5326             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5327                 return FALSE;
5328             }
5329
5330             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5331                                                                 == LB_BREAKABLE;
5332
5333         case LB_SY_or_IS_then_various + LB_BREAKABLE:
5334         case LB_SY_or_IS_then_various + LB_NOBREAK:
5335         {
5336             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5337
5338             LB_enum temp = prev;
5339             do {
5340                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5341             }
5342             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5343             if (temp == LB_Numeric) {
5344                 return FALSE;
5345             }
5346
5347             return LB_table[prev][after] - LB_SY_or_IS_then_various
5348                                                                == LB_BREAKABLE;
5349         }
5350
5351         case LB_various_then_PO_or_PR + LB_BREAKABLE:
5352         case LB_various_then_PO_or_PR + LB_NOBREAK:
5353         {
5354             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5355
5356             LB_enum temp = prev;
5357             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5358             {
5359                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5360             }
5361             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5362                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5363             }
5364             if (temp == LB_Numeric) {
5365                 return FALSE;
5366             }
5367             return LB_various_then_PO_or_PR;
5368         }
5369
5370         case LB_RI_then_RI + LB_NOBREAK:
5371         case LB_RI_then_RI + LB_BREAKABLE:
5372             {
5373                 int RI_count = 1;
5374
5375                 /* LB30a Break between two regional indicator symbols if and
5376                  * only if there are an even number of regional indicators
5377                  * preceding the position of the break.
5378                  *
5379                  *    sot (RI RI)* RI × RI
5380                  *  [^RI] (RI RI)* RI × RI */
5381
5382                 while (backup_one_LB(strbeg,
5383                                      &temp_pos,
5384                                      utf8_target) == LB_Regional_Indicator)
5385                 {
5386                     RI_count++;
5387                 }
5388
5389                 return RI_count % 2 == 0;
5390             }
5391
5392         default:
5393             break;
5394     }
5395
5396 #ifdef DEBUGGING
5397     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5398                                   before, after, LB_table[before][after]);
5399     assert(0);
5400 #endif
5401     return TRUE;
5402 }
5403
5404 STATIC LB_enum
5405 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5406 {
5407
5408     LB_enum lb;
5409
5410     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5411
5412     if (*curpos >= strend) {
5413         return LB_EDGE;
5414     }
5415
5416     if (utf8_target) {
5417         *curpos += UTF8SKIP(*curpos);
5418         if (*curpos >= strend) {
5419             return LB_EDGE;
5420         }
5421         lb = getLB_VAL_UTF8(*curpos, strend);
5422     }
5423     else {
5424         (*curpos)++;
5425         if (*curpos >= strend) {
5426             return LB_EDGE;
5427         }
5428         lb = getLB_VAL_CP(**curpos);
5429     }
5430
5431     return lb;
5432 }
5433
5434 STATIC LB_enum
5435 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5436 {
5437     LB_enum lb;
5438
5439     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5440
5441     if (*curpos < strbeg) {
5442         return LB_EDGE;
5443     }
5444
5445     if (utf8_target) {
5446         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5447         U8 * prev_prev_char_pos;
5448
5449         if (! prev_char_pos) {
5450             return LB_EDGE;
5451         }
5452
5453         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5454             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5455             *curpos = prev_char_pos;
5456             prev_char_pos = prev_prev_char_pos;
5457         }
5458         else {
5459             *curpos = (U8 *) strbeg;
5460             return LB_EDGE;
5461         }
5462     }
5463     else {
5464         if (*curpos - 2 < strbeg) {
5465             *curpos = (U8 *) strbeg;
5466             return LB_EDGE;
5467         }
5468         (*curpos)--;
5469         lb = getLB_VAL_CP(*(*curpos - 1));
5470     }
5471
5472     return lb;
5473 }
5474
5475 STATIC bool
5476 S_isSB(pTHX_ SB_enum before,
5477              SB_enum after,
5478              const U8 * const strbeg,
5479              const U8 * const curpos,
5480              const U8 * const strend,
5481              const bool utf8_target)
5482 {
5483     /* returns a boolean indicating if there is a Sentence Boundary Break
5484      * between the inputs.  See https://www.unicode.org/reports/tr29/ */
5485
5486     U8 * lpos = (U8 *) curpos;
5487     bool has_para_sep = FALSE;
5488     bool has_sp = FALSE;
5489
5490     PERL_ARGS_ASSERT_ISSB;
5491
5492     /* Break at the start and end of text.
5493         SB1.  sot  ÷
5494         SB2.  ÷  eot
5495       But unstated in Unicode is don't break if the text is empty */
5496     if (before == SB_EDGE || after == SB_EDGE) {
5497         return before != after;
5498     }
5499
5500     /* SB 3: Do not break within CRLF. */
5501     if (before == SB_CR && after == SB_LF) {
5502         return FALSE;
5503     }
5504
5505     /* Break after paragraph separators.  CR and LF are considered
5506      * so because Unicode views text as like word processing text where there
5507      * are no newlines except between paragraphs, and the word processor takes
5508      * care of wrapping without there being hard line-breaks in the text *./
5509        SB4.  Sep | CR | LF  ÷ */
5510     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5511         return TRUE;
5512     }
5513
5514     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5515      * (See Section 6.2, Replacing Ignore Rules.)
5516         SB5.  X (Extend | Format)*  →  X */
5517     if (after == SB_Extend || after == SB_Format) {
5518
5519         /* Implied is that the these characters attach to everything
5520          * immediately prior to them except for those separator-type
5521          * characters.  And the rules earlier have already handled the case
5522          * when one of those immediately precedes the extend char */
5523         return FALSE;
5524     }
5525
5526     if (before == SB_Extend || before == SB_Format) {
5527         U8 * temp_pos = lpos;
5528         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5529         if (   backup != SB_EDGE
5530             && backup != SB_Sep
5531             && backup != SB_CR
5532             && backup != SB_LF)
5533         {
5534             before = backup;
5535             lpos = temp_pos;
5536         }
5537
5538         /* Here, both 'before' and 'backup' are these types; implied is that we
5539          * don't break between them */
5540         if (backup == SB_Extend || backup == SB_Format) {
5541             return FALSE;
5542         }
5543     }
5544
5545     /* Do not break after ambiguous terminators like period, if they are
5546      * immediately followed by a number or lowercase letter, if they are
5547      * between uppercase letters, if the first following letter (optionally
5548      * after certain punctuation) is lowercase, or if they are followed by
5549      * "continuation" punctuation such as comma, colon, or semicolon. For
5550      * example, a period may be an abbreviation or numeric period, and thus may
5551      * not mark the end of a sentence.
5552
5553      * SB6. ATerm  ×  Numeric */
5554     if (before == SB_ATerm && after == SB_Numeric) {
5555         return FALSE;
5556     }
5557
5558     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5559     if (before == SB_ATerm && after == SB_Upper) {
5560         U8 * temp_pos = lpos;
5561         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5562         if (backup == SB_Upper || backup == SB_Lower) {
5563             return FALSE;
5564         }
5565     }
5566
5567     /* The remaining rules that aren't the final one, all require an STerm or
5568      * an ATerm after having backed up over some Close* Sp*, and in one case an
5569      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5570      * So do that backup now, setting flags if either Sp or a paragraph
5571      * separator are found */
5572
5573     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5574         has_para_sep = TRUE;
5575         before = backup_one_SB(strbeg, &lpos, utf8_target);
5576     }
5577
5578     if (before == SB_Sp) {
5579         has_sp = TRUE;
5580         do {
5581             before = backup_one_SB(strbeg, &lpos, utf8_target);
5582         }
5583         while (before == SB_Sp);
5584     }
5585
5586     while (before == SB_Close) {
5587         before = backup_one_SB(strbeg, &lpos, utf8_target);
5588     }
5589
5590     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5591      * most cases an STerm */
5592     if (before == SB_STerm || before == SB_ATerm) {
5593
5594         /* So, here the lhs matches
5595          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5596          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5597          * The rules that apply here are:
5598          *
5599          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5600                                            | LF | STerm | ATerm) )* Lower
5601            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5602            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5603            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5604            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5605          */
5606
5607         /* And all but SB11 forbid having seen a paragraph separator */
5608         if (! has_para_sep) {
5609             if (before == SB_ATerm) {          /* SB8 */
5610                 U8 * rpos = (U8 *) curpos;
5611                 SB_enum later = after;
5612
5613                 while (    later != SB_OLetter
5614                         && later != SB_Upper
5615                         && later != SB_Lower
5616                         && later != SB_Sep
5617                         && later != SB_CR
5618                         && later != SB_LF
5619                         && later != SB_STerm
5620                         && later != SB_ATerm
5621                         && later != SB_EDGE)
5622                 {
5623                     later = advance_one_SB(&rpos, strend, utf8_target);
5624                 }
5625                 if (later == SB_Lower) {
5626                     return FALSE;
5627                 }
5628             }
5629
5630             if (   after == SB_SContinue    /* SB8a */
5631                 || after == SB_STerm
5632                 || after == SB_ATerm)
5633             {
5634                 return FALSE;
5635             }
5636
5637             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5638                 if (   after == SB_Close
5639                     || after == SB_Sp
5640                     || after == SB_Sep
5641                     || after == SB_CR
5642                     || after == SB_LF)
5643                 {
5644                     return FALSE;
5645                 }
5646             }
5647
5648             /* SB10.  This and SB9 could probably be combined some way, but khw
5649              * has decided to follow the Unicode rule book precisely for
5650              * simplified maintenance */
5651             if (   after == SB_Sp
5652                 || after == SB_Sep
5653                 || after == SB_CR
5654                 || after == SB_LF)
5655             {
5656                 return FALSE;
5657             }
5658         }
5659
5660         /* SB11.  */
5661         return TRUE;
5662     }
5663
5664     /* Otherwise, do not break.
5665     SB12.  Any  ×  Any */
5666
5667     return FALSE;
5668 }
5669
5670 STATIC SB_enum
5671 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5672 {
5673     SB_enum sb;
5674
5675     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5676
5677     if (*curpos >= strend) {
5678         return SB_EDGE;
5679     }
5680
5681     if (utf8_target) {
5682         do {
5683             *curpos += UTF8SKIP(*curpos);
5684             if (*curpos >= strend) {
5685                 return SB_EDGE;
5686             }
5687             sb = getSB_VAL_UTF8(*curpos, strend);
5688         } while (sb == SB_Extend || sb == SB_Format);
5689     }
5690     else {
5691         do {
5692             (*curpos)++;
5693             if (*curpos >= strend) {
5694                 return SB_EDGE;
5695             }
5696             sb = getSB_VAL_CP(**curpos);
5697         } while (sb == SB_Extend || sb == SB_Format);
5698     }
5699
5700     return sb;
5701 }
5702
5703 STATIC SB_enum
5704 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5705 {
5706     SB_enum sb;
5707
5708     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5709
5710     if (*curpos < strbeg) {
5711         return SB_EDGE;
5712     }
5713
5714     if (utf8_target) {
5715         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5716         if (! prev_char_pos) {
5717             return SB_EDGE;
5718         }
5719
5720         /* Back up over Extend and Format.  curpos is always just to the right
5721          * of the characater whose value we are getting */
5722         do {
5723             U8 * prev_prev_char_pos;
5724             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5725                                                                       strbeg)))
5726             {
5727                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5728                 *curpos = prev_char_pos;
5729                 prev_char_pos = prev_prev_char_pos;
5730             }
5731             else {
5732                 *curpos = (U8 *) strbeg;
5733                 return SB_EDGE;
5734             }
5735         } while (sb == SB_Extend || sb == SB_Format);
5736     }
5737     else {
5738         do {
5739             if (*curpos - 2 < strbeg) {
5740                 *curpos = (U8 *) strbeg;
5741                 return SB_EDGE;
5742             }
5743             (*curpos)--;
5744             sb = getSB_VAL_CP(*(*curpos - 1));
5745         } while (sb == SB_Extend || sb == SB_Format);
5746     }
5747
5748     return sb;
5749 }
5750
5751 STATIC bool
5752 S_isWB(pTHX_ WB_enum previous,
5753              WB_enum before,
5754              WB_enum after,
5755              const U8 * const strbeg,
5756              const U8 * const curpos,
5757              const U8 * const strend,
5758              const bool utf8_target)
5759 {
5760     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5761      *  a Unicode word break, using their published algorithm, but tailored for
5762      *  Perl by treating spans of white space as one unit.  Context may be
5763      *  needed to make this determination.  If the value for the character
5764      *  before 'before' is known, it is passed as 'previous'; otherwise that
5765      *  should be set to WB_UNKNOWN.  The other input parameters give the
5766      *  boundaries and current position in the matching of the string.  That
5767      *  is, 'curpos' marks the position where the character whose wb value is
5768      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5769
5770     U8 * before_pos = (U8 *) curpos;
5771     U8 * after_pos = (U8 *) curpos;
5772     WB_enum prev = before;
5773     WB_enum next;
5774
5775     PERL_ARGS_ASSERT_ISWB;
5776
5777     /* Rule numbers in the comments below are as of Unicode 9.0 */
5778
5779   redo:
5780     before = prev;
5781     switch (WB_table[before][after]) {
5782         case WB_BREAKABLE:
5783             return TRUE;
5784
5785         case WB_NOBREAK:
5786             return FALSE;
5787
5788         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5789             next = advance_one_WB(&after_pos, strend, utf8_target,
5790                                  FALSE /* Don't skip Extend nor Format */ );
5791             /* A space immediately preceeding an Extend or Format is attached
5792              * to by them, and hence gets separated from previous spaces.
5793              * Otherwise don't break between horizontal white space */
5794             return next == WB_Extend || next == WB_Format;
5795
5796         /* WB4 Ignore Format and Extend characters, except when they appear at
5797          * the beginning of a region of text.  This code currently isn't
5798          * general purpose, but it works as the rules are currently and likely
5799          * to be laid out.  The reason it works is that when 'they appear at
5800          * the beginning of a region of text', the rule is to break before
5801          * them, just like any other character.  Therefore, the default rule
5802          * applies and we don't have to look in more depth.  Should this ever
5803          * change, we would have to have 2 'case' statements, like in the rules
5804          * below, and backup a single character (not spacing over the extend
5805          * ones) and then see if that is one of the region-end characters and
5806          * go from there */
5807         case WB_Ex_or_FO_or_ZWJ_then_foo:
5808             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5809             goto redo;
5810
5811         case WB_DQ_then_HL + WB_BREAKABLE:
5812         case WB_DQ_then_HL + WB_NOBREAK:
5813
5814             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5815
5816             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5817                                                             == WB_Hebrew_Letter)
5818             {
5819                 return FALSE;
5820             }
5821
5822              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5823
5824         case WB_HL_then_DQ + WB_BREAKABLE:
5825         case WB_HL_then_DQ + WB_NOBREAK:
5826
5827             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5828
5829             if (advance_one_WB(&after_pos, strend, utf8_target,
5830                                        TRUE /* Do skip Extend and Format */ )
5831                                                             == WB_Hebrew_Letter)
5832             {
5833                 return FALSE;
5834             }
5835
5836             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5837
5838         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5839         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5840
5841             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5842              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5843
5844             next = advance_one_WB(&after_pos, strend, utf8_target,
5845                                        TRUE /* Do skip Extend and Format */ );
5846
5847             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5848             {
5849                 return FALSE;
5850             }
5851
5852             return WB_table[before][after]
5853                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5854
5855         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5856         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5857
5858             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5859              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5860
5861             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5862             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5863             {
5864                 return FALSE;
5865             }
5866
5867             return WB_table[before][after]
5868                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5869
5870         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5871         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5872
5873             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5874              * */
5875
5876             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5877                                                             == WB_Numeric)
5878             {
5879                 return FALSE;
5880             }
5881
5882             return WB_table[before][after]
5883                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5884
5885         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5886         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5887
5888             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5889
5890             if (advance_one_WB(&after_pos, strend, utf8_target,
5891                                        TRUE /* Do skip Extend and Format */ )
5892                                                             == WB_Numeric)
5893             {
5894                 return FALSE;
5895             }
5896
5897             return WB_table[before][after]
5898                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5899
5900         case WB_RI_then_RI + WB_NOBREAK:
5901         case WB_RI_then_RI + WB_BREAKABLE:
5902             {
5903                 int RI_count = 1;
5904
5905                 /* Do not break within emoji flag sequences. That is, do not
5906                  * break between regional indicator (RI) symbols if there is an
5907                  * odd number of RI characters before the potential break
5908                  * point.
5909                  *
5910                  * WB15   sot (RI RI)* RI × RI
5911                  * WB16 [^RI] (RI RI)* RI × RI */
5912
5913                 while (backup_one_WB(&previous,
5914                                      strbeg,
5915                                      &before_pos,
5916                                      utf8_target) == WB_Regional_Indicator)
5917                 {
5918                     RI_count++;
5919                 }
5920
5921                 return RI_count % 2 != 1;
5922             }
5923
5924         default:
5925             break;
5926     }
5927
5928 #ifdef DEBUGGING
5929     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5930                                   before, after, WB_table[before][after]);
5931     assert(0);
5932 #endif
5933     return TRUE;
5934 }
5935
5936 STATIC WB_enum
5937 S_advance_one_WB(pTHX_ U8 ** curpos,
5938                        const U8 * const strend,
5939                        const bool utf8_target,
5940                        const bool skip_Extend_Format)
5941 {
5942     WB_enum wb;
5943
5944     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5945
5946     if (*curpos >= strend) {
5947         return WB_EDGE;
5948     }
5949
5950     if (utf8_target) {
5951
5952         /* Advance over Extend and Format */
5953         do {
5954             *curpos += UTF8SKIP(*curpos);
5955             if (*curpos >= strend) {
5956                 return WB_EDGE;
5957             }
5958             wb = getWB_VAL_UTF8(*curpos, strend);
5959         } while (    skip_Extend_Format
5960                  && (wb == WB_Extend || wb == WB_Format));
5961     }
5962     else {
5963         do {
5964             (*curpos)++;
5965             if (*curpos >= strend) {
5966                 return WB_EDGE;
5967             }
5968             wb = getWB_VAL_CP(**curpos);
5969         } while (    skip_Extend_Format
5970                  && (wb == WB_Extend || wb == WB_Format));
5971     }
5972
5973     return wb;
5974 }
5975
5976 STATIC WB_enum
5977 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5978 {
5979     WB_enum wb;
5980
5981     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5982
5983     /* If we know what the previous character's break value is, don't have
5984         * to look it up */
5985     if (*previous != WB_UNKNOWN) {
5986         wb = *previous;
5987
5988         /* But we need to move backwards by one */
5989         if (utf8_target) {
5990             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5991             if (! *curpos) {
5992                 *previous = WB_EDGE;
5993                 *curpos = (U8 *) strbeg;
5994             }
5995             else {
5996                 *previous = WB_UNKNOWN;
5997             }
5998         }
5999         else {
6000             (*curpos)--;
6001             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6002         }
6003
6004         /* And we always back up over these three types */
6005         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6006             return wb;
6007         }
6008     }
6009
6010     if (*curpos < strbeg) {
6011         return WB_EDGE;
6012     }
6013
6014     if (utf8_target) {
6015         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6016         if (! prev_char_pos) {
6017             return WB_EDGE;
6018         }
6019
6020         /* Back up over Extend and Format.  curpos is always just to the right
6021          * of the characater whose value we are getting */
6022         do {
6023             U8 * prev_prev_char_pos;
6024             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6025                                                    -1,
6026                                                    strbeg)))
6027             {
6028                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6029                 *curpos = prev_char_pos;
6030                 prev_char_pos = prev_prev_char_pos;
6031             }
6032             else {
6033                 *curpos = (U8 *) strbeg;
6034                 return WB_EDGE;
6035             }
6036         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6037     }
6038     else {
6039         do {
6040             if (*curpos - 2 < strbeg) {
6041                 *curpos = (U8 *) strbeg;
6042                 return WB_EDGE;
6043             }
6044             (*curpos)--;
6045             wb = getWB_VAL_CP(*(*curpos - 1));
6046         } while (wb == WB_Extend || wb == WB_Format);
6047     }
6048
6049     return wb;
6050 }
6051
6052 /* Macros for regmatch(), using its internal variables */
6053 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6054 #define NEXTCHR_IS_EOS (nextbyte < 0)
6055
6056 #define SET_nextchr \
6057     nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6058
6059 #define SET_locinput(p) \
6060     locinput = (p);  \
6061     SET_nextchr
6062
6063 #define sayYES goto yes
6064 #define sayNO goto no
6065 #define sayNO_SILENT goto no_silent
6066
6067 /* we dont use STMT_START/END here because it leads to
6068    "unreachable code" warnings, which are bogus, but distracting. */
6069 #define CACHEsayNO \
6070     if (ST.cache_mask) \
6071        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6072     sayNO
6073
6074 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
6075 (                                                           \
6076     (   ( st )                                         ) && \
6077     (   ( st )->u.eval.close_paren                     ) && \
6078     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6079 )
6080
6081 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
6082 (                                                           \
6083     (   ( st )                                         ) && \
6084     (   ( st )->u.eval.close_paren                     ) && \
6085     (   ( expr )                                       ) && \
6086     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6087 )
6088
6089
6090 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6091     (st)->u.eval.close_paren = ( (expr) + 1 )
6092
6093 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6094     (st)->u.eval.close_paren = 0
6095
6096 /* push a new state then goto it */
6097
6098 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
6099     pushinput = input; \
6100     pusheol = eol; \
6101     pushsr0 = sr0; \
6102     scan = node; \
6103     st->resume_state = state; \
6104     goto push_state;
6105
6106 /* push a new state with success backtracking, then goto it */
6107
6108 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
6109     pushinput = input; \
6110     pusheol = eol;     \
6111     pushsr0 = sr0; \
6112     scan = node; \
6113     st->resume_state = state; \
6114     goto push_yes_state;
6115
6116 #define DEBUG_STATE_pp(pp)                                  \
6117     DEBUG_STATE_r({                                         \
6118         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
6119         Perl_re_printf( aTHX_                               \
6120             "%*s" pp " %s%s%s%s%s\n",                       \
6121             INDENT_CHARS(depth), "",                        \
6122             PL_reg_name[st->resume_state],                  \
6123             ((st==yes_state||st==mark_state) ? "[" : ""),   \
6124             ((st==yes_state) ? "Y" : ""),                   \
6125             ((st==mark_state) ? "M" : ""),                  \
6126             ((st==yes_state||st==mark_state) ? "]" : "")    \
6127         );                                                  \
6128     });
6129
6130 /*
6131
6132 regmatch() - main matching routine
6133
6134 This is basically one big switch statement in a loop. We execute an op,
6135 set 'next' to point the next op, and continue. If we come to a point which
6136 we may need to backtrack to on failure such as (A|B|C), we push a
6137 backtrack state onto the backtrack stack. On failure, we pop the top
6138 state, and re-enter the loop at the state indicated. If there are no more
6139 states to pop, we return failure.
6140
6141 Sometimes we also need to backtrack on success; for example /A+/, where
6142 after successfully matching one A, we need to go back and try to
6143 match another one; similarly for lookahead assertions: if the assertion
6144 completes successfully, we backtrack to the state just before the assertion
6145 and then carry on.  In these cases, the pushed state is marked as
6146 'backtrack on success too'. This marking is in fact done by a chain of
6147 pointers, each pointing to the previous 'yes' state. On success, we pop to
6148 the nearest yes state, discarding any intermediate failure-only states.
6149 Sometimes a yes state is pushed just to force some cleanup code to be
6150 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6151 it to free the inner regex.
6152
6153 Note that failure backtracking rewinds the cursor position, while
6154 success backtracking leaves it alone.
6155
6156 A pattern is complete when the END op is executed, while a subpattern
6157 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6158 ops trigger the "pop to last yes state if any, otherwise return true"
6159 behaviour.
6160
6161 A common convention in this function is to use A and B to refer to the two
6162 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6163 the subpattern to be matched possibly multiple times, while B is the entire
6164 rest of the pattern. Variable and state names reflect this convention.
6165
6166 The states in the main switch are the union of ops and failure/success of
6167 substates associated with that op.  For example, IFMATCH is the op
6168 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6169 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6170 successfully matched A and IFMATCH_A_fail is a state saying that we have
6171 just failed to match A. Resume states always come in pairs. The backtrack
6172 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6173 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6174 on success or failure.
6175
6176 The struct that holds a backtracking state is actually a big union, with
6177 one variant for each major type of op. The variable st points to the
6178 top-most backtrack struct. To make the code clearer, within each
6179 block of code we #define ST to alias the relevant union.
6180
6181 Here's a concrete example of a (vastly oversimplified) IFMATCH
6182 implementation:
6183
6184     switch (state) {
6185     ....
6186
6187 #define ST st->u.ifmatch
6188
6189     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6190         ST.foo = ...; // some state we wish to save
6191         ...
6192         // push a yes backtrack state with a resume value of
6193         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6194         // first node of A:
6195         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6196         // NOTREACHED
6197
6198     case IFMATCH_A: // we have successfully executed A; now continue with B
6199         next = B;
6200         bar = ST.foo; // do something with the preserved value
6201         break;
6202
6203     case IFMATCH_A_fail: // A failed, so the assertion failed
6204         ...;   // do some housekeeping, then ...
6205         sayNO; // propagate the failure
6206
6207 #undef ST
6208
6209     ...
6210     }
6211
6212 For any old-timers reading this who are familiar with the old recursive
6213 approach, the code above is equivalent to:
6214
6215     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6216     {
6217         int foo = ...
6218         ...
6219         if (regmatch(A)) {
6220             next = B;
6221             bar = foo;
6222             break;
6223         }
6224         ...;   // do some housekeeping, then ...
6225         sayNO; // propagate the failure
6226     }
6227
6228 The topmost backtrack state, pointed to by st, is usually free. If you
6229 want to claim it, populate any ST.foo fields in it with values you wish to
6230 save, then do one of
6231
6232         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6233         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6234
6235 which sets that backtrack state's resume value to 'resume_state', pushes a
6236 new free entry to the top of the backtrack stack, then goes to 'node'.
6237 On backtracking, the free slot is popped, and the saved state becomes the
6238 new free state. An ST.foo field in this new top state can be temporarily
6239 accessed to retrieve values, but once the main loop is re-entered, it
6240 becomes available for reuse.
6241
6242 Note that the depth of the backtrack stack constantly increases during the
6243 left-to-right execution of the pattern, rather than going up and down with
6244 the pattern nesting. For example the stack is at its maximum at Z at the
6245 end of the pattern, rather than at X in the following:
6246
6247     /(((X)+)+)+....(Y)+....Z/
6248
6249 The only exceptions to this are lookahead/behind assertions and the cut,
6250 (?>A), which pop all the backtrack states associated with A before
6251 continuing.
6252
6253 Backtrack state structs are allocated in slabs of about 4K in size.
6254 PL_regmatch_state and st always point to the currently active state,
6255 and PL_regmatch_slab points to the slab currently containing
6256 PL_regmatch_state.  The first time regmatch() is called, the first slab is
6257 allocated, and is never freed until interpreter destruction. When the slab
6258 is full, a new one is allocated and chained to the end. At exit from
6259 regmatch(), slabs allocated since entry are freed.
6260
6261 In order to work with variable length lookbehinds, an upper limit is placed on
6262 lookbehinds which is set to where the match position is at the end of where the
6263 lookbehind would get to.  Nothing in the lookbehind should match above that,
6264 except we should be able to look beyond if for things like \b, which need the
6265 next character in the string to be able to determine if this is a boundary or
6266 not.  We also can't match the end of string/line unless we are also at the end
6267 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6268 that match a width, we have to add a condition that they are within the legal
6269 bounds of our window into the string.
6270
6271 */
6272
6273 /* returns -1 on failure, $+[0] on success */
6274 STATIC SSize_t
6275 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6276 {
6277     const bool utf8_target = reginfo->is_utf8_target;
6278     const U32 uniflags = UTF8_ALLOW_DEFAULT;
6279     REGEXP *rex_sv = reginfo->prog;
6280     regexp *rex = ReANY(rex_sv);
6281     RXi_GET_DECL(rex,rexi);
6282     /* the current state. This is a cached copy of PL_regmatch_state */
6283     regmatch_state *st;
6284     /* cache heavy used fields of st in registers */
6285     regnode *scan;
6286     regnode *next;
6287     U32 n = 0;  /* general value; init to avoid compiler warning */
6288     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
6289     SSize_t endref = 0; /* offset of end of backref when ln is start */
6290     char *locinput = startpos;
6291     char *loceol = reginfo->strend;
6292     char *pushinput; /* where to continue after a PUSH */
6293     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
6294     U8   *pushsr0;   /* save starting pos of script run */
6295     PERL_INT_FAST16_T nextbyte;   /* is always set to UCHARAT(locinput), or -1
6296                                      at EOS */
6297
6298     bool result = 0;        /* return value of S_regmatch */
6299     U32 depth = 0;            /* depth of backtrack stack */
6300     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6301     const U32 max_nochange_depth =
6302         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6303         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6304     regmatch_state *yes_state = NULL; /* state to pop to on success of
6305                                                             subpattern */
6306     /* mark_state piggy backs on the yes_state logic so that when we unwind
6307        the stack on success we can update the mark_state as we go */
6308     regmatch_state *mark_state = NULL; /* last mark state we have seen */
6309     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6310     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
6311     U32 state_num;
6312     bool no_final = 0;      /* prevent failure from backtracking? */
6313     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
6314     char *startpoint = locinput;
6315     SV *popmark = NULL;     /* are we looking for a mark? */
6316     SV *sv_commit = NULL;   /* last mark name seen in failure */
6317     SV *sv_yes_mark = NULL; /* last mark name we have seen
6318                                during a successful match */
6319     U32 lastopen = 0;       /* last open we saw */
6320     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6321     SV* const oreplsv = GvSVn(PL_replgv);
6322     /* these three flags are set by various ops to signal information to
6323      * the very next op. They have a useful lifetime of exactly one loop
6324      * iteration, and are not preserved or restored by state pushes/pops
6325      */
6326     bool sw = 0;            /* the condition value in (?(cond)a|b) */
6327     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
6328     int logical = 0;        /* the following EVAL is:
6329                                 0: (?{...})
6330                                 1: (?(?{...})X|Y)
6331                                 2: (??{...})
6332                                or the following IFMATCH/UNLESSM is:
6333                                 false: plain (?=foo)
6334                                 true:  used as a condition: (?(?=foo))
6335                             */
6336     PAD* last_pad = NULL;
6337     dMULTICALL;
6338     U8 gimme = G_SCALAR;
6339     CV *caller_cv = NULL;       /* who called us */
6340     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
6341     U32 maxopenparen = 0;       /* max '(' index seen so far */
6342     int to_complement;  /* Invert the result? */
6343     _char_class_number classnum;
6344     bool is_utf8_pat = reginfo->is_utf8_pat;
6345     bool match = FALSE;
6346     I32 orig_savestack_ix = PL_savestack_ix;
6347     U8 * script_run_begin = NULL;
6348
6349 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6350 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6351 #  define SOLARIS_BAD_OPTIMIZER
6352     const U32 *pl_charclass_dup = PL_charclass;
6353 #  define PL_charclass pl_charclass_dup
6354 #endif
6355
6356 #ifdef DEBUGGING
6357     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6358 #endif
6359
6360     /* protect against undef(*^R) */
6361     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6362
6363     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6364     multicall_oldcatch = 0;
6365     PERL_UNUSED_VAR(multicall_cop);
6366
6367     PERL_ARGS_ASSERT_REGMATCH;
6368
6369     st = PL_regmatch_state;
6370
6371     /* Note that nextbyte is a byte even in UTF */
6372     SET_nextchr;
6373     scan = prog;
6374
6375     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6376             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6377             Perl_re_printf( aTHX_ "regmatch start\n" );
6378     }));
6379
6380     while (scan != NULL) {
6381         next = scan + NEXT_OFF(scan);
6382         if (next == scan)
6383             next = NULL;
6384         state_num = OP(scan);
6385
6386       reenter_switch:
6387         DEBUG_EXECUTE_r(
6388             if (state_num <= REGNODE_MAX) {
6389                 SV * const prop = sv_newmortal();
6390                 regnode *rnext = regnext(scan);
6391
6392                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6393                 regprop(rex, prop, scan, reginfo, NULL);
6394                 Perl_re_printf( aTHX_
6395                     "%*s%" IVdf ":%s(%" IVdf ")\n",
6396                     INDENT_CHARS(depth), "",
6397                     (IV)(scan - rexi->program),
6398                     SvPVX_const(prop),
6399                     (PL_regkind[OP(scan)] == END || !rnext) ?
6400                         0 : (IV)(rnext - rexi->program));
6401             }
6402         );
6403
6404         to_complement = 0;
6405
6406         SET_nextchr;
6407         assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6408
6409         switch (state_num) {
6410         case SBOL: /*  /^../ and /\A../  */
6411             if (locinput == reginfo->strbeg)
6412                 break;
6413             sayNO;
6414
6415         case MBOL: /*  /^../m  */
6416             if (locinput == reginfo->strbeg ||
6417                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6418             {
6419                 break;
6420             }
6421             sayNO;
6422
6423         case GPOS: /*  \G  */
6424             if (locinput == reginfo->ganch)
6425                 break;
6426             sayNO;
6427
6428         case KEEPS: /*   \K  */
6429             /* update the startpoint */
6430             st->u.keeper.val = rex->offs[0].start;
6431             rex->offs[0].start = locinput - reginfo->strbeg;
6432             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6433                             script_run_begin);
6434             NOT_REACHED; /* NOTREACHED */
6435
6436         case KEEPS_next_fail:
6437             /* rollback the start point change */
6438             rex->offs[0].start = st->u.keeper.val;
6439             sayNO_SILENT;
6440             NOT_REACHED; /* NOTREACHED */
6441
6442         case MEOL: /* /..$/m  */
6443             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6444                 sayNO;
6445             break;
6446
6447         case SEOL: /* /..$/  */
6448             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6449                 sayNO;
6450             if (reginfo->strend - locinput > 1)
6451                 sayNO;
6452             break;
6453
6454         case EOS: /*  \z  */
6455             if (!NEXTCHR_IS_EOS)
6456                 sayNO;
6457             break;
6458
6459         case SANY: /*  /./s  */
6460             if (NEXTCHR_IS_EOS || locinput >= loceol)
6461                 sayNO;
6462             goto increment_locinput;
6463
6464         case REG_ANY: /*  /./  */
6465             if (   NEXTCHR_IS_EOS
6466                 || locinput >= loceol
6467                 || nextbyte == '\n')
6468             {
6469                 sayNO;
6470             }
6471             goto increment_locinput;
6472
6473
6474 #undef  ST
6475 #define ST st->u.trie
6476         case TRIEC: /* (ab|cd) with known charclass */
6477             /* In this case the charclass data is available inline so
6478                we can fail fast without a lot of extra overhead.
6479              */
6480             if ( !   NEXTCHR_IS_EOS
6481                 &&   locinput < loceol
6482                 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6483             {
6484                 DEBUG_EXECUTE_r(
6485                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6486                               depth, PL_colors[4], PL_colors[5])
6487                 );
6488                 sayNO_SILENT;
6489                 NOT_REACHED; /* NOTREACHED */
6490             }
6491             /* FALLTHROUGH */
6492         case TRIE:  /* (ab|cd)  */
6493             /* the basic plan of execution of the trie is:
6494              * At the beginning, run though all the states, and
6495              * find the longest-matching word. Also remember the position
6496              * of the shortest matching word. For example, this pattern:
6497              *    1  2 3 4    5
6498              *    ab|a|x|abcd|abc
6499              * when matched against the string "abcde", will generate
6500              * accept states for all words except 3, with the longest
6501              * matching word being 4, and the shortest being 2 (with
6502              * the position being after char 1 of the string).
6503              *
6504              * Then for each matching word, in word order (i.e. 1,2,4,5),
6505              * we run the remainder of the pattern; on each try setting
6506              * the current position to the character following the word,
6507              * returning to try the next word on failure.
6508              *
6509              * We avoid having to build a list of words at runtime by
6510              * using a compile-time structure, wordinfo[].prev, which
6511              * gives, for each word, the previous accepting word (if any).
6512              * In the case above it would contain the mappings 1->2, 2->0,
6513              * 3->0, 4->5, 5->1.  We can use this table to generate, from
6514              * the longest word (4 above), a list of all words, by
6515              * following the list of prev pointers; this gives us the
6516              * unordered list 4,5,1,2. Then given the current word we have
6517              * just tried, we can go through the list and find the
6518              * next-biggest word to try (so if we just failed on word 2,
6519              * the next in the list is 4).
6520              *
6521              * Since at runtime we don't record the matching position in
6522              * the string for each word, we have to work that out for
6523              * each word we're about to process. The wordinfo table holds
6524              * the character length of each word; given that we recorded
6525              * at the start: the position of the shortest word and its
6526              * length in chars, we just need to move the pointer the
6527              * difference between the two char lengths. Depending on
6528              * Unicode status and folding, that's cheap or expensive.
6529              *
6530              * This algorithm is optimised for the case where are only a
6531              * small number of accept states, i.e. 0,1, or maybe 2.
6532              * With lots of accepts states, and having to try all of them,
6533              * it becomes quadratic on number of accept states to find all
6534              * the next words.
6535              */
6536
6537             {
6538                 /* what type of TRIE am I? (utf8 makes this contextual) */
6539                 DECL_TRIE_TYPE(scan);
6540
6541                 /* what trie are we using right now */
6542                 reg_trie_data * const trie
6543                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6544                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6545                 U32 state = trie->startstate;
6546
6547                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6548                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6549                     if (utf8_target
6550                         && ! NEXTCHR_IS_EOS
6551                         && UTF8_IS_ABOVE_LATIN1(nextbyte)
6552                         && scan->flags == EXACTL)
6553                     {
6554                         /* We only output for EXACTL, as we let the folder
6555                          * output this message for EXACTFLU8 to avoid
6556                          * duplication */
6557                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6558                                                                reginfo->strend);
6559                     }
6560                 }
6561                 if (   trie->bitmap
6562                     && (     NEXTCHR_IS_EOS
6563                         ||   locinput >= loceol
6564                         || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6565                 {
6566                     if (trie->states[ state ].wordnum) {
6567                          DEBUG_EXECUTE_r(
6568                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
6569                                           depth, PL_colors[4], PL_colors[5])
6570                         );
6571                         if (!trie->jump)
6572                             break;
6573                     } else {
6574                         DEBUG_EXECUTE_r(
6575                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6576                                           depth, PL_colors[4], PL_colors[5])
6577                         );
6578                         sayNO_SILENT;
6579                    }
6580                 }
6581
6582             {
6583                 U8 *uc = ( U8* )locinput;
6584
6585                 STRLEN len = 0;
6586                 STRLEN foldlen = 0;
6587                 U8 *uscan = (U8*)NULL;
6588                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6589                 U32 charcount = 0; /* how many input chars we have matched */
6590                 U32 accepted = 0; /* have we seen any accepting states? */
6591
6592                 ST.jump = trie->jump;
6593                 ST.me = scan;
6594                 ST.firstpos = NULL;
6595                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6596                 ST.nextword = 0;
6597
6598                 /* fully traverse the TRIE; note the position of the
6599                    shortest accept state and the wordnum of the longest
6600                    accept state */
6601
6602                 while ( state && uc <= (U8*)(loceol) ) {
6603                     U32 base = trie->states[ state ].trans.base;
6604                     UV uvc = 0;
6605                     U16 charid = 0;
6606                     U16 wordnum;
6607                     wordnum = trie->states[ state ].wordnum;
6608
6609                     if (wordnum) { /* it's an accept state */
6610                         if (!accepted) {
6611                             accepted = 1;
6612                             /* record first match position */
6613                             if (ST.longfold) {
6614                                 ST.firstpos = (U8*)locinput;
6615                                 ST.firstchars = 0;
6616                             }
6617                             else {
6618                                 ST.firstpos = uc;
6619                                 ST.firstchars = charcount;
6620                             }
6621                         }
6622                         if (!ST.nextword || wordnum < ST.nextword)
6623                             ST.nextword = wordnum;
6624                         ST.topword = wordnum;
6625                     }
6626
6627                     DEBUG_TRIE_EXECUTE_r({
6628                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6629                                 /* HERE */
6630                                 PerlIO_printf( Perl_debug_log,
6631                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6632                                     INDENT_CHARS(depth), "", PL_colors[4],
6633                                     (UV)state, (accepted ? 'Y' : 'N'));
6634                     });
6635
6636                     /* read a char and goto next state */
6637                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6638                         I32 offset;
6639                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6640                                              (U8 *) loceol, uscan,
6641                                              len, uvc, charid, foldlen,
6642                                              foldbuf, uniflags);
6643                         charcount++;
6644                         if (foldlen>0)
6645                             ST.longfold = TRUE;
6646                         if (charid &&
6647                              ( ((offset =
6648                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6649
6650                              && ((U32)offset < trie->lasttrans)
6651                              && trie->trans[offset].check == state)
6652                         {
6653                             state = trie->trans[offset].next;
6654                         }
6655                         else {
6656                             state = 0;
6657                         }
6658                         uc += len;
6659
6660                     }
6661                     else {
6662                         state = 0;
6663                     }
6664                     DEBUG_TRIE_EXECUTE_r(
6665                         Perl_re_printf( aTHX_
6666                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6667                             charid, uvc, (UV)state, PL_colors[5] );
6668                     );
6669                 }
6670                 if (!accepted)
6671                    sayNO;
6672
6673                 /* calculate total number of accept states */
6674                 {
6675                     U16 w = ST.topword;
6676                     accepted = 0;
6677                     while (w) {
6678                         w = trie->wordinfo[w].prev;
6679                         accepted++;
6680                     }
6681                     ST.accepted = accepted;
6682                 }
6683
6684                 DEBUG_EXECUTE_r(
6685                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6686                         depth,
6687                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6688                 );
6689                 goto trie_first_try; /* jump into the fail handler */
6690             }}
6691             NOT_REACHED; /* NOTREACHED */
6692
6693         case TRIE_next_fail: /* we failed - try next alternative */
6694         {
6695             U8 *uc;
6696             if ( ST.jump ) {
6697                 /* undo any captures done in the tail part of a branch,
6698                  * e.g.
6699                  *    /(?:X(.)(.)|Y(.)).../
6700                  * where the trie just matches X then calls out to do the
6701                  * rest of the branch */
6702                 REGCP_UNWIND(ST.cp);
6703                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6704             }
6705             if (!--ST.accepted) {
6706                 DEBUG_EXECUTE_r({
6707                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6708                         depth,
6709                         PL_colors[4],
6710                         PL_colors[5] );
6711                 });
6712                 sayNO_SILENT;
6713             }
6714             {
6715                 /* Find next-highest word to process.  Note that this code
6716                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6717                 U16 min = 0;
6718                 U16 word;
6719                 U16 const nextword = ST.nextword;
6720                 reg_trie_wordinfo * const wordinfo
6721                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6722                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6723                     if (word > nextword && (!min || word < min))
6724                         min = word;
6725                 }
6726                 ST.nextword = min;
6727             }
6728
6729           trie_first_try:
6730             if (do_cutgroup) {
6731                 do_cutgroup = 0;
6732                 no_final = 0;
6733             }
6734
6735             if ( ST.jump ) {
6736                 ST.lastparen = rex->lastparen;
6737                 ST.lastcloseparen = rex->lastcloseparen;
6738                 REGCP_SET(ST.cp);
6739             }
6740
6741             /* find start char of end of current word */
6742             {
6743                 U32 chars; /* how many chars to skip */
6744                 reg_trie_data * const trie
6745                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6746
6747                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6748                             >=  ST.firstchars);
6749                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6750                             - ST.firstchars;
6751                 uc = ST.firstpos;
6752
6753                 if (ST.longfold) {
6754                     /* the hard option - fold each char in turn and find
6755                      * its folded length (which may be different */
6756                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6757                     STRLEN foldlen;
6758                     STRLEN len;
6759                     UV uvc;
6760                     U8 *uscan;
6761
6762                     while (chars) {
6763                         if (utf8_target) {
6764                             /* XXX This assumes the length is well-formed, as
6765                              * does the UTF8SKIP below */
6766                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6767                                                     uniflags);
6768                             uc += len;
6769                         }
6770                         else {
6771                             uvc = *uc;
6772                             uc++;
6773                         }
6774                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6775                         uscan = foldbuf;
6776                         while (foldlen) {
6777                             if (!--chars)
6778                                 break;
6779                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6780                                                  uniflags);
6781                             uscan += len;
6782                             foldlen -= len;
6783                         }
6784                     }
6785                 }
6786                 else {
6787                     if (utf8_target)
6788                         uc = utf8_hop(uc, chars);
6789                     else
6790                         uc += chars;
6791                 }
6792             }
6793
6794             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6795                             ? ST.jump[ST.nextword]
6796                             : NEXT_OFF(ST.me));
6797
6798             DEBUG_EXECUTE_r({
6799                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6800                     depth,
6801                     PL_colors[4],
6802                     ST.nextword,
6803                     PL_colors[5]
6804                     );
6805             });
6806
6807             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6808                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6809                                 script_run_begin);
6810                 NOT_REACHED; /* NOTREACHED */
6811             }
6812             /* only one choice left - just continue */
6813             DEBUG_EXECUTE_r({
6814                 AV *const trie_words
6815                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6816                 SV ** const tmp = trie_words
6817                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6818                 SV *sv= tmp ? sv_newmortal() : NULL;
6819
6820                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6821                     depth, PL_colors[4],
6822                     ST.nextword,
6823                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6824                             PL_colors[0], PL_colors[1],
6825                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6826                         )
6827                     : "not compiled under -Dr",
6828                     PL_colors[5] );
6829             });
6830
6831             locinput = (char*)uc;
6832             continue; /* execute rest of RE */
6833             /* NOTREACHED */
6834         }
6835 #undef  ST
6836
6837         case LEXACT_REQ8:
6838             if (! utf8_target) {
6839                 sayNO;
6840             }
6841             /* FALLTHROUGH */
6842
6843         case LEXACT:
6844         {
6845             char *s;
6846
6847             s = STRINGl(scan);
6848             ln = STR_LENl(scan);
6849             goto join_short_long_exact;
6850
6851         case EXACTL:             /*  /abc/l       */
6852             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6853
6854             /* Complete checking would involve going through every character
6855              * matched by the string to see if any is above latin1.  But the
6856              * comparision otherwise might very well be a fast assembly
6857              * language routine, and I (khw) don't think slowing things down
6858              * just to check for this warning is worth it.  So this just checks
6859              * the first character */
6860             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6861                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6862             }
6863             goto do_exact;
6864         case EXACT_REQ8:
6865             if (! utf8_target) {
6866                 sayNO;
6867             }
6868             /* FALLTHROUGH */
6869
6870         case EXACT:             /*  /abc/        */
6871           do_exact:
6872             s = STRINGs(scan);
6873             ln = STR_LENs(scan);
6874
6875           join_short_long_exact:
6876             if (utf8_target != is_utf8_pat) {
6877                 /* The target and the pattern have differing utf8ness. */
6878                 char *l = locinput;
6879                 const char * const e = s + ln;
6880
6881                 if (utf8_target) {
6882                     /* The target is utf8, the pattern is not utf8.
6883                      * Above-Latin1 code points can't match the pattern;
6884                      * invariants match exactly, and the other Latin1 ones need
6885                      * to be downgraded to a single byte in order to do the
6886                      * comparison.  (If we could be confident that the target
6887                      * is not malformed, this could be refactored to have fewer
6888                      * tests by just assuming that if the first bytes match, it
6889                      * is an invariant, but there are tests in the test suite
6890                      * dealing with (??{...}) which violate this) */
6891                     while (s < e) {
6892                         if (   l >= loceol
6893                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6894                         {
6895                             sayNO;
6896                         }
6897                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6898                             if (*l != *s) {
6899                                 sayNO;
6900                             }
6901                             l++;
6902                         }
6903                         else {
6904                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6905                             {
6906                                 sayNO;
6907                             }
6908                             l += 2;
6909                         }
6910                         s++;
6911                     }
6912                 }
6913                 else {
6914                     /* The target is not utf8, the pattern is utf8. */
6915                     while (s < e) {
6916                         if (   l >= loceol
6917                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6918                         {
6919                             sayNO;
6920                         }
6921                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6922                             if (*s != *l) {
6923                                 sayNO;
6924                             }
6925                             s++;
6926                         }
6927                         else {
6928                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6929                             {
6930                                 sayNO;
6931                             }
6932                             s += 2;
6933                         }
6934                         l++;
6935                     }
6936                 }
6937                 locinput = l;
6938             }
6939             else {
6940                 /* The target and the pattern have the same utf8ness. */
6941                 /* Inline the first character, for speed. */
6942                 if (   loceol - locinput < ln
6943                     || UCHARAT(s) != nextbyte
6944                     || (ln > 1 && memNE(s, locinput, ln)))
6945                 {
6946                     sayNO;
6947                 }
6948                 locinput += ln;
6949             }
6950             break;
6951             }
6952
6953         case EXACTFL:            /*  /abc/il      */
6954           {
6955             re_fold_t folder;
6956             const U8 * fold_array;
6957             const char * s;
6958             U32 fold_utf8_flags;
6959
6960             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6961             folder = foldEQ_locale;
6962             fold_array = PL_fold_locale;
6963             fold_utf8_flags = FOLDEQ_LOCALE;
6964             goto do_exactf;
6965
6966         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6967                                       is effectively /u; hence to match, target
6968                                       must be UTF-8. */
6969             if (! utf8_target) {
6970                 sayNO;
6971             }
6972             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6973                                              | FOLDEQ_S2_FOLDS_SANE;
6974             folder = foldEQ_latin1_s2_folded;
6975             fold_array = PL_fold_latin1;
6976             goto do_exactf;
6977
6978         case EXACTFU_REQ8:      /* /abc/iu with something in /abc/ > 255 */
6979             if (! utf8_target) {
6980                 sayNO;
6981             }
6982             assert(is_utf8_pat);
6983             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6984             goto do_exactf;
6985
6986         case EXACTFUP:          /*  /foo/iu, and something is problematic in
6987                                     'foo' so can't take shortcuts. */
6988             assert(! is_utf8_pat);
6989             folder = foldEQ_latin1;
6990             fold_array = PL_fold_latin1;
6991             fold_utf8_flags = 0;
6992             goto do_exactf;
6993
6994         case EXACTFU:            /*  /abc/iu      */
6995             folder = foldEQ_latin1_s2_folded;
6996             fold_array = PL_fold_latin1;
6997             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6998             goto do_exactf;
6999
7000         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
7001                                    patterns */
7002             assert(! is_utf8_pat);
7003             /* FALLTHROUGH */
7004         case EXACTFAA:            /*  /abc/iaa     */
7005             folder = foldEQ_latin1_s2_folded;
7006             fold_array = PL_fold_latin1;
7007             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7008             if (is_utf8_pat || ! utf8_target) {
7009
7010                 /* The possible presence of a MICRO SIGN in the pattern forbids
7011                  * us to view a non-UTF-8 pattern as folded when there is a
7012                  * UTF-8 target */
7013                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7014                                   |FOLDEQ_S2_FOLDS_SANE;
7015             }
7016             goto do_exactf;
7017
7018
7019         case EXACTF:             /*  /abc/i    This node only generated for
7020                                                non-utf8 patterns */
7021             assert(! is_utf8_pat);
7022             folder = foldEQ;
7023             fold_array = PL_fold;
7024             fold_utf8_flags = 0;
7025
7026           do_exactf:
7027             s = STRINGs(scan);
7028             ln = STR_LENs(scan);
7029
7030             if (   utf8_target
7031                 || is_utf8_pat
7032                 || state_num == EXACTFUP
7033                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7034             {
7035               /* Either target or the pattern are utf8, or has the issue where
7036                * the fold lengths may differ. */
7037                 const char * const l = locinput;
7038                 char *e = loceol;
7039
7040                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
7041                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
7042                 {
7043                     sayNO;
7044                 }
7045                 locinput = e;
7046                 break;
7047             }
7048
7049             /* Neither the target nor the pattern are utf8 */
7050             if (UCHARAT(s) != nextbyte
7051                 && !NEXTCHR_IS_EOS
7052                 && UCHARAT(s) != fold_array[nextbyte])
7053             {
7054                 sayNO;
7055             }
7056             if (loceol - locinput < ln)
7057                 sayNO;
7058             if (ln > 1 && ! folder(locinput, s, ln))
7059                 sayNO;
7060             locinput += ln;
7061             break;
7062         }
7063
7064         case NBOUNDL: /*  /\B/l  */
7065             to_complement = 1;
7066             /* FALLTHROUGH */
7067
7068         case BOUNDL:  /*  /\b/l  */
7069         {
7070             bool b1, b2;
7071             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7072
7073             if (FLAGS(scan) != TRADITIONAL_BOUND) {
7074                 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7075                 goto boundu;
7076             }
7077
7078             if (utf8_target) {
7079                 if (locinput == reginfo->strbeg)
7080                     b1 = isWORDCHAR_LC('\n');
7081                 else {
7082                     U8 *p = reghop3((U8*)locinput, -1,
7083                                     (U8*)(reginfo->strbeg));
7084                     b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7085                 }
7086                 b2 = (NEXTCHR_IS_EOS)
7087                     ? isWORDCHAR_LC('\n')
7088                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7089                                               (U8*) reginfo->strend);
7090             }
7091             else { /* Here the string isn't utf8 */
7092                 b1 = (locinput == reginfo->strbeg)
7093                      ? isWORDCHAR_LC('\n')
7094                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
7095                 b2 = (NEXTCHR_IS_EOS)
7096                     ? isWORDCHAR_LC('\n')
7097                     : isWORDCHAR_LC(nextbyte);
7098             }
7099             if (to_complement ^ (b1 == b2)) {
7100                 sayNO;
7101             }
7102             break;
7103         }
7104
7105         case NBOUND:  /*  /\B/   */
7106             to_complement = 1;
7107             /* FALLTHROUGH */
7108
7109         case BOUND:   /*  /\b/   */
7110             if (utf8_target) {
7111                 goto bound_utf8;
7112             }
7113             goto bound_ascii_match_only;
7114
7115         case NBOUNDA: /*  /\B/a  */
7116             to_complement = 1;
7117             /* FALLTHROUGH */
7118
7119         case BOUNDA:  /*  /\b/a  */
7120         {
7121             bool b1, b2;
7122
7123           bound_ascii_match_only:
7124             /* Here the string isn't utf8, or is utf8 and only ascii characters
7125              * are to match \w.  In the latter case looking at the byte just
7126              * prior to the current one may be just the final byte of a
7127              * multi-byte character.  This is ok.  There are two cases:
7128              * 1) it is a single byte character, and then the test is doing
7129              *    just what it's supposed to.
7130              * 2) it is a multi-byte character, in which case the final byte is
7131              *    never mistakable for ASCII, and so the test will say it is
7132              *    not a word character, which is the correct answer. */
7133             b1 = (locinput == reginfo->strbeg)
7134                  ? isWORDCHAR_A('\n')
7135                  : isWORDCHAR_A(UCHARAT(locinput - 1));
7136             b2 = (NEXTCHR_IS_EOS)
7137                 ? isWORDCHAR_A('\n')
7138                 : isWORDCHAR_A(nextbyte);
7139             if (to_complement ^ (b1 == b2)) {
7140                 sayNO;
7141             }
7142             break;
7143         }
7144
7145         case NBOUNDU: /*  /\B/u  */
7146             to_complement = 1;
7147             /* FALLTHROUGH */
7148
7149         case BOUNDU:  /*  /\b/u  */
7150
7151           boundu:
7152             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7153                 match = FALSE;
7154             }
7155             else if (utf8_target) {
7156               bound_utf8:
7157                 switch((bound_type) FLAGS(scan)) {
7158                     case TRADITIONAL_BOUND:
7159                     {
7160                         bool b1, b2;
7161                         if (locinput == reginfo->strbeg) {
7162                             b1 = 0 /* isWORDCHAR_L1('\n') */;
7163                         }
7164                         else {
7165                             U8 *p = reghop3((U8*)locinput, -1,
7166                                             (U8*)(reginfo->strbeg));
7167
7168                             b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7169                         }
7170                         b2 = (NEXTCHR_IS_EOS)
7171                             ? 0 /* isWORDCHAR_L1('\n') */
7172                             : isWORDCHAR_utf8_safe((U8*)locinput,
7173                                                    (U8*) reginfo->strend);
7174                         match = cBOOL(b1 != b2);
7175                         break;
7176                     }
7177                     case GCB_BOUND:
7178                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7179                             match = TRUE; /* GCB always matches at begin and
7180                                              end */
7181                         }
7182                         else {
7183                             /* Find the gcb values of previous and current
7184                              * chars, then see if is a break point */
7185                             match = isGCB(getGCB_VAL_UTF8(
7186                                                 reghop3((U8*)locinput,
7187                                                         -1,
7188                                                         (U8*)(reginfo->strbeg)),
7189                                                 (U8*) reginfo->strend),
7190                                           getGCB_VAL_UTF8((U8*) locinput,
7191                                                         (U8*) reginfo->strend),
7192                                           (U8*) reginfo->strbeg,
7193                                           (U8*) locinput,
7194                                           utf8_target);
7195                         }
7196                         break;
7197
7198                     case LB_BOUND:
7199                         if (locinput == reginfo->strbeg) {
7200                             match = FALSE;
7201                         }
7202                         else if (NEXTCHR_IS_EOS) {
7203                             match = TRUE;
7204                         }
7205                         else {
7206                             match = isLB(getLB_VAL_UTF8(
7207                                                 reghop3((U8*)locinput,
7208                                                         -1,
7209                                                         (U8*)(reginfo->strbeg)),
7210                                                 (U8*) reginfo->strend),
7211                                           getLB_VAL_UTF8((U8*) locinput,
7212                                                         (U8*) reginfo->strend),
7213                                           (U8*) reginfo->strbeg,
7214                                           (U8*) locinput,
7215                                           (U8*) reginfo->strend,
7216                                           utf8_target);
7217                         }
7218                         break;
7219
7220                     case SB_BOUND: /* Always matches at begin and end */
7221                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7222                             match = TRUE;
7223                         }
7224                         else {
7225                             match = isSB(getSB_VAL_UTF8(
7226                                                 reghop3((U8*)locinput,
7227                                                         -1,
7228                                                         (U8*)(reginfo->strbeg)),
7229                                                 (U8*) reginfo->strend),
7230                                           getSB_VAL_UTF8((U8*) locinput,
7231                                                         (U8*) reginfo->strend),
7232                                           (U8*) reginfo->strbeg,
7233                                           (U8*) locinput,
7234                                           (U8*) reginfo->strend,
7235                                           utf8_target);
7236                         }
7237                         break;
7238
7239                     case WB_BOUND:
7240                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7241                             match = TRUE;
7242                         }
7243                         else {
7244                             match = isWB(WB_UNKNOWN,
7245                                          getWB_VAL_UTF8(
7246                                                 reghop3((U8*)locinput,
7247                                                         -1,
7248                                                         (U8*)(reginfo->strbeg)),
7249                                                 (U8*) reginfo->strend),
7250                                           getWB_VAL_UTF8((U8*) locinput,
7251                                                         (U8*) reginfo->strend),
7252                                           (U8*) reginfo->strbeg,
7253                                           (U8*) locinput,
7254                                           (U8*) reginfo->strend,
7255                                           utf8_target);
7256                         }
7257                         break;
7258                 }
7259             }
7260             else {  /* Not utf8 target */
7261                 switch((bound_type) FLAGS(scan)) {
7262                     case TRADITIONAL_BOUND:
7263                     {
7264                         bool b1, b2;
7265                         b1 = (locinput == reginfo->strbeg)
7266                             ? 0 /* isWORDCHAR_L1('\n') */
7267                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
7268                         b2 = (NEXTCHR_IS_EOS)
7269                             ? 0 /* isWORDCHAR_L1('\n') */
7270                             : isWORDCHAR_L1(nextbyte);
7271                         match = cBOOL(b1 != b2);
7272                         break;
7273                     }
7274
7275                     case GCB_BOUND:
7276                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7277                             match = TRUE; /* GCB always matches at begin and
7278                                              end */
7279                         }
7280                         else {  /* Only CR-LF combo isn't a GCB in 0-255
7281                                    range */
7282                             match =    UCHARAT(locinput - 1) != '\r'
7283                                     || UCHARAT(locinput) != '\n';
7284                         }
7285                         break;
7286
7287                     case LB_BOUND:
7288                         if (locinput == reginfo->strbeg) {
7289                             match = FALSE;
7290                         }
7291                         else if (NEXTCHR_IS_EOS) {
7292                             match = TRUE;
7293                         }
7294                         else {
7295                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7296                                          getLB_VAL_CP(UCHARAT(locinput)),
7297                                          (U8*) reginfo->strbeg,
7298                                          (U8*) locinput,
7299                                          (U8*) reginfo->strend,
7300                                          utf8_target);
7301                         }
7302                         break;
7303
7304                     case SB_BOUND: /* Always matches at begin and end */
7305                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7306                             match = TRUE;
7307                         }
7308                         else {
7309                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7310                                          getSB_VAL_CP(UCHARAT(locinput)),
7311                                          (U8*) reginfo->strbeg,
7312                                          (U8*) locinput,
7313                                          (U8*) reginfo->strend,
7314                                          utf8_target);
7315                         }
7316                         break;
7317
7318                     case WB_BOUND:
7319                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7320                             match = TRUE;
7321                         }
7322                         else {
7323                             match = isWB(WB_UNKNOWN,
7324                                          getWB_VAL_CP(UCHARAT(locinput -1)),
7325                                          getWB_VAL_CP(UCHARAT(locinput)),
7326                                          (U8*) reginfo->strbeg,
7327                                          (U8*) locinput,
7328                                          (U8*) reginfo->strend,
7329                                          utf8_target);
7330                         }
7331                         break;
7332                 }
7333             }
7334
7335             if (to_complement ^ ! match) {
7336                 sayNO;
7337             }
7338             break;
7339
7340         case ANYOFPOSIXL:
7341         case ANYOFL:  /*  /[abc]/l      */
7342             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7343             CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7344
7345             /* FALLTHROUGH */
7346         case ANYOFD:  /*   /[abc]/d       */
7347         case ANYOF:  /*   /[abc]/       */
7348             if (NEXTCHR_IS_EOS || locinput >= loceol)
7349                 sayNO;
7350             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7351                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7352             {
7353                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7354                     sayNO;
7355                 }
7356                 locinput++;
7357             }
7358             else {
7359                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7360                                                                    utf8_target))
7361                 {
7362                     sayNO;
7363                 }
7364                 goto increment_locinput;
7365             }
7366             break;
7367
7368         case ANYOFM:
7369             if (   NEXTCHR_IS_EOS
7370                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
7371                 || locinput >= loceol)
7372             {
7373                 sayNO;
7374             }
7375             locinput++; /* ANYOFM is always single byte */
7376             break;
7377
7378         case NANYOFM:
7379             if (   NEXTCHR_IS_EOS
7380                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
7381                 || locinput >= loceol)
7382             {
7383                 sayNO;
7384             }
7385             goto increment_locinput;
7386             break;
7387
7388         case ANYOFH:
7389             if (   ! utf8_target
7390                 ||   NEXTCHR_IS_EOS
7391                 ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7392                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7393                                                                    utf8_target))
7394             {
7395                 sayNO;
7396             }
7397             goto increment_locinput;
7398             break;
7399
7400         case ANYOFHb:
7401             if (   ! utf8_target
7402                 ||   NEXTCHR_IS_EOS
7403                 ||   ANYOF_FLAGS(scan) != (U8) *locinput
7404                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7405                                                                   utf8_target))
7406             {
7407                 sayNO;
7408             }
7409             goto increment_locinput;
7410             break;
7411
7412         case ANYOFHr:
7413             if (   ! utf8_target
7414                 ||   NEXTCHR_IS_EOS
7415                 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7416                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7417                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7418                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7419                                                                    utf8_target))
7420             {
7421                 sayNO;
7422             }
7423             goto increment_locinput;
7424             break;
7425
7426         case ANYOFHs:
7427             if (   ! utf8_target
7428                 ||   NEXTCHR_IS_EOS
7429                 ||   loceol - locinput < FLAGS(scan)
7430                 ||   memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7431                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7432                                                                    utf8_target))
7433             {
7434                 sayNO;
7435             }
7436             goto increment_locinput;
7437             break;
7438
7439         case ANYOFR:
7440             if (NEXTCHR_IS_EOS) {
7441                 sayNO;
7442             }
7443
7444             if (utf8_target) {
7445                 if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7446                    || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7447                                                 (U8 *) reginfo->strend,
7448                                                 NULL),
7449                                     ANYOFRbase(scan), ANYOFRdelta(scan)))
7450                 {
7451                     sayNO;
7452                 }
7453             }
7454             else {
7455                 if (! withinCOUNT((U8) *locinput,
7456                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7457                 {
7458                     sayNO;
7459                 }
7460             }
7461             goto increment_locinput;
7462             break;
7463
7464         case ANYOFRb:
7465             if (NEXTCHR_IS_EOS) {
7466                 sayNO;
7467             }
7468
7469             if (utf8_target) {
7470                 if (     ANYOF_FLAGS(scan) != (U8) *locinput
7471                     || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7472                                                 (U8 *) reginfo->strend,
7473                                                 NULL),
7474                                      ANYOFRbase(scan), ANYOFRdelta(scan)))
7475                 {
7476                     sayNO;
7477                 }
7478             }
7479             else {
7480                 if (! withinCOUNT((U8) *locinput,
7481                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7482                 {
7483                     sayNO;
7484                 }
7485             }
7486             goto increment_locinput;
7487             break;
7488
7489         /* The argument (FLAGS) to all the POSIX node types is the class number
7490          * */
7491
7492         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
7493             to_complement = 1;
7494             /* FALLTHROUGH */
7495
7496         case POSIXL:    /* \w or [:punct:] etc. under /l */
7497             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7498             if (NEXTCHR_IS_EOS || locinput >= loceol)
7499                 sayNO;
7500
7501             /* Use isFOO_lc() for characters within Latin1.  (Note that
7502              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7503              * wouldn't be invariant) */
7504             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7505                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7506                     sayNO;
7507                 }
7508
7509                 locinput++;
7510                 break;
7511             }
7512
7513             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7514                 /* An above Latin-1 code point, or malformed */
7515                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7516                                                        reginfo->strend);
7517                 goto utf8_posix_above_latin1;
7518             }
7519
7520             /* Here is a UTF-8 variant code point below 256 and the target is
7521              * UTF-8 */
7522             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7523                                             EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7524                                             *(locinput + 1))))))
7525             {
7526                 sayNO;
7527             }
7528
7529             goto increment_locinput;
7530
7531         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
7532             to_complement = 1;
7533             /* FALLTHROUGH */
7534
7535         case POSIXD:    /* \w or [:punct:] etc. under /d */
7536             if (utf8_target) {
7537                 goto utf8_posix;
7538             }
7539             goto posixa;
7540
7541         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
7542
7543             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7544                 sayNO;
7545             }
7546
7547             /* All UTF-8 variants match */
7548             if (! UTF8_IS_INVARIANT(nextbyte)) {
7549                 goto increment_locinput;
7550             }
7551
7552             to_complement = 1;
7553             goto join_nposixa;
7554
7555         case POSIXA:    /* \w or [:punct:] etc. under /a */
7556
7557           posixa:
7558             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7559              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7560              * character is a single byte */
7561
7562             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7563                 sayNO;
7564             }
7565
7566           join_nposixa:
7567
7568             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte,
7569                                                                 FLAGS(scan)))))
7570             {
7571                 sayNO;
7572             }
7573
7574             /* Here we are either not in utf8, or we matched a utf8-invariant,
7575              * so the next char is the next byte */
7576             locinput++;
7577             break;
7578
7579         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
7580             to_complement = 1;
7581             /* FALLTHROUGH */
7582
7583         case POSIXU:    /* \w or [:punct:] etc. under /u */
7584           utf8_posix:
7585             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7586                 sayNO;
7587             }
7588
7589             /* Use _generic_isCC() for characters within Latin1.  (Note that
7590              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7591              * wouldn't be invariant) */
7592             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7593                 if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte,
7594                                                            FLAGS(scan)))))
7595                 {
7596                     sayNO;
7597                 }
7598                 locinput++;
7599             }
7600             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7601                 if (! (to_complement
7602                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7603                                                                *(locinput + 1)),
7604                                              FLAGS(scan)))))
7605                 {
7606                     sayNO;
7607                 }
7608                 locinput += 2;
7609             }
7610             else {  /* Handle above Latin-1 code points */
7611               utf8_posix_above_latin1:
7612                 classnum = (_char_class_number) FLAGS(scan);
7613                 switch (classnum) {
7614                     default:
7615                         if (! (to_complement
7616                            ^ cBOOL(_invlist_contains_cp(
7617                                       PL_XPosix_ptrs[classnum],
7618                                       utf8_to_uvchr_buf((U8 *) locinput,
7619                                                         (U8 *) reginfo->strend,
7620                                                         NULL)))))
7621                         {
7622                             sayNO;
7623                         }
7624                         break;
7625                     case _CC_ENUM_SPACE:
7626                         if (! (to_complement
7627                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
7628                         {
7629                             sayNO;
7630                         }
7631                         break;
7632                     case _CC_ENUM_BLANK:
7633                         if (! (to_complement
7634                                         ^ cBOOL(is_HORIZWS_high(locinput))))
7635                         {
7636                             sayNO;
7637                         }
7638                         break;
7639                     case _CC_ENUM_XDIGIT:
7640                         if (! (to_complement
7641                                         ^ cBOOL(is_XDIGIT_high(locinput))))
7642                         {
7643                             sayNO;
7644                         }
7645                         break;
7646                     case _CC_ENUM_VERTSPACE:
7647                         if (! (to_complement
7648                                         ^ cBOOL(is_VERTWS_high(locinput))))
7649                         {
7650                             sayNO;
7651                         }
7652                         break;
7653                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
7654                     case _CC_ENUM_ASCII:
7655                         if (! to_complement) {
7656                             sayNO;
7657                         }
7658                         break;
7659                 }
7660                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7661             }
7662             break;
7663
7664         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7665                        a Unicode extended Grapheme Cluster */
7666             if (NEXTCHR_IS_EOS || locinput >= loceol)
7667                 sayNO;
7668             if  (! utf8_target) {
7669
7670                 /* Match either CR LF  or '.', as all the other possibilities
7671                  * require utf8 */
7672                 locinput++;         /* Match the . or CR */
7673                 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7674                                        match the LF */
7675                     && locinput <  loceol
7676                     && UCHARAT(locinput) == '\n')
7677                 {
7678                     locinput++;
7679                 }
7680             }
7681             else {
7682
7683                 /* Get the gcb type for the current character */
7684                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7685                                                        (U8*) reginfo->strend);
7686
7687                 /* Then scan through the input until we get to the first
7688                  * character whose type is supposed to be a gcb with the
7689                  * current character.  (There is always a break at the
7690                  * end-of-input) */
7691                 locinput += UTF8SKIP(locinput);
7692                 while (locinput < loceol) {
7693                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7694                                                          (U8*) reginfo->strend);
7695                     if (isGCB(prev_gcb, cur_gcb,
7696                               (U8*) reginfo->strbeg, (U8*) locinput,
7697                               utf8_target))
7698                     {
7699                         break;
7700                     }
7701
7702                     prev_gcb = cur_gcb;
7703                     locinput += UTF8SKIP(locinput);
7704                 }
7705
7706
7707             }
7708             break;
7709
7710         case REFFLN:  /*  /\g{name}/il  */
7711         {   /* The capture buffer cases.  The ones beginning with N for the
7712                named buffers just convert to the equivalent numbered and
7713                pretend they were called as the corresponding numbered buffer
7714                op.  */
7715             /* don't initialize these in the declaration, it makes C++
7716                unhappy */
7717             const char *s;
7718             char type;
7719             re_fold_t folder;
7720             const U8 *fold_array;
7721             UV utf8_fold_flags;
7722
7723             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7724             folder = foldEQ_locale;
7725             fold_array = PL_fold_locale;
7726             type = REFFL;
7727             utf8_fold_flags = FOLDEQ_LOCALE;
7728             goto do_nref;
7729
7730         case REFFAN:  /*  /\g{name}/iaa  */
7731             folder = foldEQ_latin1;
7732             fold_array = PL_fold_latin1;
7733             type = REFFA;
7734             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7735             goto do_nref;
7736
7737         case REFFUN:  /*  /\g{name}/iu  */
7738             folder = foldEQ_latin1;
7739             fold_array = PL_fold_latin1;
7740             type = REFFU;
7741             utf8_fold_flags = 0;
7742             goto do_nref;
7743
7744         case REFFN:  /*  /\g{name}/i  */
7745             folder = foldEQ;
7746             fold_array = PL_fold;
7747             type = REFF;
7748             utf8_fold_flags = 0;
7749             goto do_nref;
7750
7751         case REFN:  /*  /\g{name}/   */
7752             type = REF;
7753             folder = NULL;
7754             fold_array = NULL;
7755             utf8_fold_flags = 0;
7756           do_nref:
7757
7758             /* For the named back references, find the corresponding buffer
7759              * number */
7760             n = reg_check_named_buff_matched(rex,scan);
7761
7762             if ( ! n ) {
7763                 sayNO;
7764             }
7765             goto do_nref_ref_common;
7766
7767         case REFFL:  /*  /\1/il  */
7768             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7769             folder = foldEQ_locale;
7770             fold_array = PL_fold_locale;
7771             utf8_fold_flags = FOLDEQ_LOCALE;
7772             goto do_ref;
7773
7774         case REFFA:  /*  /\1/iaa  */
7775             folder = foldEQ_latin1;
7776             fold_array = PL_fold_latin1;
7777             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7778             goto do_ref;
7779
7780         case REFFU:  /*  /\1/iu  */
7781             folder = foldEQ_latin1;
7782             fold_array = PL_fold_latin1;
7783             utf8_fold_flags = 0;
7784             goto do_ref;
7785
7786         case REFF:  /*  /\1/i  */
7787             folder = foldEQ;
7788             fold_array = PL_fold;
7789             utf8_fold_flags = 0;
7790             goto do_ref;
7791
7792         case REF:  /*  /\1/    */
7793             folder = NULL;
7794             fold_array = NULL;
7795             utf8_fold_flags = 0;
7796
7797           do_ref:
7798             type = OP(scan);
7799             n = ARG(scan);  /* which paren pair */
7800
7801           do_nref_ref_common:
7802             ln = rex->offs[n].start;
7803             endref = rex->offs[n].end;
7804             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7805             if (rex->lastparen < n || ln == -1 || endref == -1)
7806                 sayNO;                  /* Do not match unless seen CLOSEn. */
7807             if (ln == endref)
7808                 break;
7809
7810             s = reginfo->strbeg + ln;
7811             if (type != REF     /* REF can do byte comparison */
7812                 && (utf8_target || type == REFFU || type == REFFL))
7813             {
7814                 char * limit = loceol;
7815
7816                 /* This call case insensitively compares the entire buffer
7817                     * at s, with the current input starting at locinput, but
7818                     * not going off the end given by loceol, and
7819                     * returns in <limit> upon success, how much of the
7820                     * current input was matched */
7821                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7822                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7823                 {
7824                     sayNO;
7825                 }
7826                 locinput = limit;
7827                 break;
7828             }
7829
7830             /* Not utf8:  Inline the first character, for speed. */
7831             if ( ! NEXTCHR_IS_EOS
7832                 && locinput < loceol
7833                 && UCHARAT(s) != nextbyte
7834                 && (   type == REF
7835                     || UCHARAT(s) != fold_array[nextbyte]))
7836             {
7837                 sayNO;
7838             }
7839             ln = endref - ln;
7840             if (locinput + ln > loceol)
7841                 sayNO;
7842             if (ln > 1 && (type == REF
7843                            ? memNE(s, locinput, ln)
7844                            : ! folder(locinput, s, ln)))
7845                 sayNO;
7846             locinput += ln;
7847             break;
7848         }
7849
7850         case NOTHING: /* null op; e.g. the 'nothing' following
7851                        * the '*' in m{(a+|b)*}' */
7852             break;
7853         case TAIL: /* placeholder while compiling (A|B|C) */
7854             break;
7855
7856 #undef  ST
7857 #define ST st->u.eval
7858 #define CUR_EVAL cur_eval->u.eval
7859
7860         {
7861             SV *ret;
7862             REGEXP *re_sv;
7863             regexp *re;
7864             regexp_internal *rei;
7865             regnode *startpoint;
7866             U32 arg;
7867
7868         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7869             arg= (U32)ARG(scan);
7870             if (cur_eval && cur_eval->locinput == locinput) {
7871                 if ( ++nochange_depth > max_nochange_depth )
7872                     Perl_croak(aTHX_
7873                         "Pattern subroutine nesting without pos change"
7874                         " exceeded limit in regex");
7875             } else {
7876                 nochange_depth = 0;
7877             }
7878             re_sv = rex_sv;
7879             re = rex;
7880             rei = rexi;
7881             startpoint = scan + ARG2L(scan);
7882             EVAL_CLOSE_PAREN_SET( st, arg );
7883             /* Detect infinite recursion
7884              *
7885              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7886              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7887              * So we track the position in the string we are at each time
7888              * we recurse and if we try to enter the same routine twice from
7889              * the same position we throw an error.
7890              */
7891             if ( rex->recurse_locinput[arg] == locinput ) {
7892                 /* FIXME: we should show the regop that is failing as part
7893                  * of the error message. */
7894                 Perl_croak(aTHX_ "Infinite recursion in regex");
7895             } else {
7896                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7897                 rex->recurse_locinput[arg]= locinput;
7898
7899                 DEBUG_r({
7900                     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7901                     DEBUG_STACK_r({
7902                         Perl_re_exec_indentf( aTHX_
7903                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7904                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7905                         );
7906                     });
7907                 });
7908             }
7909
7910             /* Save all the positions seen so far. */
7911             ST.cp = regcppush(rex, 0, maxopenparen);
7912             REGCP_SET(ST.lastcp);
7913
7914             /* and then jump to the code we share with EVAL */
7915             goto eval_recurse_doit;
7916             /* NOTREACHED */
7917
7918         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7919             if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7920                 if ( ++nochange_depth > max_nochange_depth )
7921                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7922             } else {
7923                 nochange_depth = 0;
7924             }
7925             {
7926                 /* execute the code in the {...} */
7927
7928                 dSP;
7929                 IV before;
7930                 OP * const oop = PL_op;
7931                 COP * const ocurcop = PL_curcop;
7932                 OP *nop;
7933                 CV *newcv;
7934
7935                 /* save *all* paren positions */
7936                 regcppush(rex, 0, maxopenparen);
7937                 REGCP_SET(ST.lastcp);
7938
7939                 if (!caller_cv)
7940                     caller_cv = find_runcv(NULL);
7941
7942                 n = ARG(scan);
7943
7944                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7945                     newcv = (ReANY(
7946                                     (REGEXP*)(rexi->data->data[n])
7947                             ))->qr_anoncv;
7948                     nop = (OP*)rexi->data->data[n+1];
7949                 }
7950                 else if (rexi->data->what[n] == 'l') { /* literal code */
7951                     newcv = caller_cv;
7952                     nop = (OP*)rexi->data->data[n];
7953                     assert(CvDEPTH(newcv));
7954                 }
7955                 else {
7956                     /* literal with own CV */
7957                     assert(rexi->data->what[n] == 'L');
7958                     newcv = rex->qr_anoncv;
7959                     nop = (OP*)rexi->data->data[n];
7960                 }
7961
7962                 /* Some notes about MULTICALL and the context and save stacks.
7963                  *
7964                  * In something like
7965                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7966                  * since codeblocks don't introduce a new scope (so that
7967                  * local() etc accumulate), at the end of a successful
7968                  * match there will be a SAVEt_CLEARSV on the savestack
7969                  * for each of $x, $y, $z. If the three code blocks above
7970                  * happen to have come from different CVs (e.g. via
7971                  * embedded qr//s), then we must ensure that during any
7972                  * savestack unwinding, PL_comppad always points to the
7973                  * right pad at each moment. We achieve this by
7974                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7975                  * there is a change of pad.
7976                  * In theory whenever we call a code block, we should
7977                  * push a CXt_SUB context, then pop it on return from
7978                  * that code block. This causes a bit of an issue in that
7979                  * normally popping a context also clears the savestack
7980                  * back to cx->blk_oldsaveix, but here we specifically
7981                  * don't want to clear the save stack on exit from the
7982                  * code block.
7983                  * Also for efficiency we don't want to keep pushing and
7984                  * popping the single SUB context as we backtrack etc.
7985                  * So instead, we push a single context the first time
7986                  * we need, it, then hang onto it until the end of this
7987                  * function. Whenever we encounter a new code block, we
7988                  * update the CV etc if that's changed. During the times
7989                  * in this function where we're not executing a code
7990                  * block, having the SUB context still there is a bit
7991                  * naughty - but we hope that no-one notices.
7992                  * When the SUB context is initially pushed, we fake up
7993                  * cx->blk_oldsaveix to be as if we'd pushed this context
7994                  * on first entry to S_regmatch rather than at some random
7995                  * point during the regexe execution. That way if we
7996                  * croak, popping the context stack will ensure that
7997                  * *everything* SAVEd by this function is undone and then
7998                  * the context popped, rather than e.g., popping the
7999                  * context (and restoring the original PL_comppad) then
8000                  * popping more of the savestack and restoring a bad
8001                  * PL_comppad.
8002                  */
8003
8004                 /* If this is the first EVAL, push a MULTICALL. On
8005                  * subsequent calls, if we're executing a different CV, or
8006                  * if PL_comppad has got messed up from backtracking
8007                  * through SAVECOMPPADs, then refresh the context.
8008                  */
8009                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8010                 {
8011                     U8 flags = (CXp_SUB_RE |
8012                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8013                     SAVECOMPPAD();
8014                     if (last_pushed_cv) {
8015                         CHANGE_MULTICALL_FLAGS(newcv, flags);
8016                     }
8017                     else {
8018                         PUSH_MULTICALL_FLAGS(newcv, flags);
8019                     }
8020                     /* see notes above */
8021                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8022
8023                     last_pushed_cv = newcv;
8024                 }
8025                 else {
8026                     /* these assignments are just to silence compiler
8027                      * warnings */
8028                     multicall_cop = NULL;
8029                 }
8030                 last_pad = PL_comppad;
8031
8032                 /* the initial nextstate you would normally execute
8033                  * at the start of an eval (which would cause error
8034                  * messages to come from the eval), may be optimised
8035                  * away from the execution path in the regex code blocks;
8036                  * so manually set PL_curcop to it initially */
8037                 {
8038                     OP *o = cUNOPx(nop)->op_first;
8039                     assert(o->op_type == OP_NULL);
8040                     if (o->op_targ == OP_SCOPE) {
8041                         o = cUNOPo->op_first;
8042                     }
8043                     else {
8044                         assert(o->op_targ == OP_LEAVE);
8045                         o = cUNOPo->op_first;
8046                         assert(o->op_type == OP_ENTER);
8047                         o = OpSIBLING(o);
8048                     }
8049
8050                     if (o->op_type != OP_STUB) {
8051                         assert(    o->op_type == OP_NEXTSTATE
8052                                 || o->op_type == OP_DBSTATE
8053                                 || (o->op_type == OP_NULL
8054                                     &&  (  o->op_targ == OP_NEXTSTATE
8055                                         || o->op_targ == OP_DBSTATE
8056                                         )
8057                                     )
8058                         );
8059                         PL_curcop = (COP*)o;
8060                     }
8061                 }
8062                 nop = nop->op_next;
8063
8064                 DEBUG_STATE_r( Perl_re_printf( aTHX_
8065                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8066
8067                 rex->offs[0].end = locinput - reginfo->strbeg;
8068                 if (reginfo->info_aux_eval->pos_magic)
8069                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8070                                   reginfo->sv, reginfo->strbeg,
8071                                   locinput - reginfo->strbeg);
8072
8073                 if (sv_yes_mark) {
8074                     SV *sv_mrk = get_sv("REGMARK", 1);
8075                     sv_setsv(sv_mrk, sv_yes_mark);
8076                 }
8077
8078                 /* we don't use MULTICALL here as we want to call the
8079                  * first op of the block of interest, rather than the
8080                  * first op of the sub. Also, we don't want to free
8081                  * the savestack frame */
8082                 before = (IV)(SP-PL_stack_base);
8083                 PL_op = nop;
8084                 CALLRUNOPS(aTHX);                       /* Scalar context. */
8085                 SPAGAIN;
8086                 if ((IV)(SP-PL_stack_base) == before)
8087                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
8088                 else {
8089                     ret = POPs;
8090                     PUTBACK;
8091                 }
8092
8093                 /* before restoring everything, evaluate the returned
8094                  * value, so that 'uninit' warnings don't use the wrong
8095                  * PL_op or pad. Also need to process any magic vars
8096                  * (e.g. $1) *before* parentheses are restored */
8097
8098                 PL_op = NULL;
8099
8100                 re_sv = NULL;
8101                 if (logical == 0) {       /*   (?{})/   */
8102                     SV *replsv = save_scalar(PL_replgv);
8103                     sv_setsv(replsv, ret); /* $^R */
8104                     SvSETMAGIC(replsv);
8105                 }
8106                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
8107                     sw = cBOOL(SvTRUE_NN(ret));
8108                     logical = 0;
8109                 }
8110                 else {                   /*  /(??{})  */
8111                     /*  if its overloaded, let the regex compiler handle
8112                      *  it; otherwise extract regex, or stringify  */
8113                     if (SvGMAGICAL(ret))
8114                         ret = sv_mortalcopy(ret);
8115                     if (!SvAMAGIC(ret)) {
8116                         SV *sv = ret;
8117                         if (SvROK(sv))
8118                             sv = SvRV(sv);
8119                         if (SvTYPE(sv) == SVt_REGEXP)
8120                             re_sv = (REGEXP*) sv;
8121                         else if (SvSMAGICAL(ret)) {
8122                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8123                             if (mg)
8124                                 re_sv = (REGEXP *) mg->mg_obj;
8125                         }
8126
8127                         /* force any undef warnings here */
8128                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8129                             ret = sv_mortalcopy(ret);
8130                             (void) SvPV_force_nolen(ret);
8131                         }
8132                     }
8133
8134                 }
8135
8136                 /* *** Note that at this point we don't restore
8137                  * PL_comppad, (or pop the CxSUB) on the assumption it may
8138                  * be used again soon. This is safe as long as nothing
8139                  * in the regexp code uses the pad ! */
8140                 PL_op = oop;
8141                 PL_curcop = ocurcop;
8142                 regcp_restore(rex, ST.lastcp, &maxopenparen);
8143                 PL_curpm_under = PL_curpm;
8144                 PL_curpm = PL_reg_curpm;
8145
8146                 if (logical != 2) {
8147                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8148                                     script_run_begin);
8149                     /* NOTREACHED */
8150                 }
8151             }
8152
8153                 /* only /(??{})/  from now on */
8154                 logical = 0;
8155                 {
8156                     /* extract RE object from returned value; compiling if
8157                      * necessary */
8158
8159                     if (re_sv) {
8160                         re_sv = reg_temp_copy(NULL, re_sv);
8161                     }
8162                     else {
8163                         U32 pm_flags = 0;
8164
8165                         if (SvUTF8(ret) && IN_BYTES) {
8166                             /* In use 'bytes': make a copy of the octet
8167                              * sequence, but without the flag on */
8168                             STRLEN len;
8169                             const char *const p = SvPV(ret, len);
8170                             ret = newSVpvn_flags(p, len, SVs_TEMP);
8171                         }
8172                         if (rex->intflags & PREGf_USE_RE_EVAL)
8173                             pm_flags |= PMf_USE_RE_EVAL;
8174
8175                         /* if we got here, it should be an engine which
8176                          * supports compiling code blocks and stuff */
8177                         assert(rex->engine && rex->engine->op_comp);
8178                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
8179                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8180                                     rex->engine, NULL, NULL,
8181                                     /* copy /msixn etc to inner pattern */
8182                                     ARG2L(scan),
8183                                     pm_flags);
8184
8185                         if (!(SvFLAGS(ret)
8186                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
8187                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8188                             /* This isn't a first class regexp. Instead, it's
8189                                caching a regexp onto an existing, Perl visible
8190                                scalar.  */
8191                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8192                         }
8193                     }
8194                     SAVEFREESV(re_sv);
8195                     re = ReANY(re_sv);
8196                 }
8197                 RXp_MATCH_COPIED_off(re);
8198                 re->subbeg = rex->subbeg;
8199                 re->sublen = rex->sublen;
8200                 re->suboffset = rex->suboffset;
8201                 re->subcoffset = rex->subcoffset;
8202                 re->lastparen = 0;
8203                 re->lastcloseparen = 0;
8204                 rei = RXi_GET(re);
8205                 DEBUG_EXECUTE_r(
8206                     debug_start_match(re_sv, utf8_target, locinput,
8207                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
8208                 );
8209                 startpoint = rei->program + 1;
8210                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8211                                              * close_paren only for GOSUB */
8212                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8213                 /* Save all the seen positions so far. */
8214                 ST.cp = regcppush(rex, 0, maxopenparen);
8215                 REGCP_SET(ST.lastcp);
8216                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8217                 maxopenparen = 0;
8218                 /* run the pattern returned from (??{...}) */
8219
8220               eval_recurse_doit: /* Share code with GOSUB below this line
8221                             * At this point we expect the stack context to be
8222                             * set up correctly */
8223
8224                 /* invalidate the S-L poscache. We're now executing a
8225                  * different set of WHILEM ops (and their associated
8226                  * indexes) against the same string, so the bits in the
8227                  * cache are meaningless. Setting maxiter to zero forces
8228                  * the cache to be invalidated and zeroed before reuse.
8229                  * XXX This is too dramatic a measure. Ideally we should
8230                  * save the old cache and restore when running the outer
8231                  * pattern again */
8232                 reginfo->poscache_maxiter = 0;
8233
8234                 /* the new regexp might have a different is_utf8_pat than we do */
8235                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8236
8237                 ST.prev_rex = rex_sv;
8238                 ST.prev_curlyx = cur_curlyx;
8239                 rex_sv = re_sv;
8240                 SET_reg_curpm(rex_sv);
8241                 rex = re;
8242                 rexi = rei;
8243                 cur_curlyx = NULL;
8244                 ST.B = next;
8245                 ST.prev_eval = cur_eval;
8246                 cur_eval = st;
8247                 /* now continue from first node in postoned RE */
8248                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8249                                     loceol, script_run_begin);
8250                 NOT_REACHED; /* NOTREACHED */
8251         }
8252
8253         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8254             /* note: this is called twice; first after popping B, then A */
8255             DEBUG_STACK_r({
8256                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
8257                     depth, cur_eval, ST.prev_eval);
8258             });
8259
8260 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8261             if ( cur_eval && CUR_EVAL.close_paren ) {\
8262                 DEBUG_STACK_r({ \
8263                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8264                         depth,    \
8265                         CUR_EVAL.close_paren - 1,\
8266                         cur_eval, \
8267                         VAL);     \
8268                 });               \
8269                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8270             }
8271
8272             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8273
8274             rex_sv = ST.prev_rex;
8275             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8276             SET_reg_curpm(rex_sv);
8277             rex = ReANY(rex_sv);
8278             rexi = RXi_GET(rex);
8279             {
8280                 /* preserve $^R across LEAVE's. See Bug 121070. */
8281                 SV *save_sv= GvSV(PL_replgv);
8282                 SV *replsv;
8283                 SvREFCNT_inc(save_sv);
8284                 regcpblow(ST.cp); /* LEAVE in disguise */
8285                 /* don't move this initialization up */
8286                 replsv = GvSV(PL_replgv);
8287                 sv_setsv(replsv, save_sv);
8288                 SvSETMAGIC(replsv);
8289                 SvREFCNT_dec(save_sv);
8290             }
8291             cur_eval = ST.prev_eval;
8292             cur_curlyx = ST.prev_curlyx;
8293
8294             /* Invalidate cache. See "invalidate" comment above. */
8295             reginfo->poscache_maxiter = 0;
8296             if ( nochange_depth )
8297                 nochange_depth--;
8298
8299             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8300             sayYES;
8301
8302
8303         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8304             REGCP_UNWIND(ST.lastcp);
8305             sayNO;
8306
8307         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8308             /* note: this is called twice; first after popping B, then A */
8309             DEBUG_STACK_r({
8310                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8311                     depth, cur_eval, ST.prev_eval);
8312             });
8313
8314             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8315
8316             rex_sv = ST.prev_rex;
8317             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8318             SET_reg_curpm(rex_sv);
8319             rex = ReANY(rex_sv);
8320             rexi = RXi_GET(rex);
8321
8322             REGCP_UNWIND(ST.lastcp);
8323             regcppop(rex, &maxopenparen);
8324             cur_eval = ST.prev_eval;
8325             cur_curlyx = ST.prev_curlyx;
8326
8327             /* Invalidate cache. See "invalidate" comment above. */
8328             reginfo->poscache_maxiter = 0;
8329             if ( nochange_depth )
8330                 nochange_depth--;
8331
8332             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8333             sayNO_SILENT;
8334 #undef ST
8335
8336         case OPEN: /*  (  */
8337             n = ARG(scan);  /* which paren pair */
8338             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
8339             if (n > maxopenparen)
8340                 maxopenparen = n;
8341             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8342                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8343                 depth,
8344                 PTR2UV(rex),
8345                 PTR2UV(rex->offs),
8346                 (UV)n,
8347                 (IV)rex->offs[n].start_tmp,
8348                 (UV)maxopenparen
8349             ));
8350             lastopen = n;
8351             break;
8352
8353         case SROPEN: /*  (*SCRIPT_RUN:  */
8354             script_run_begin = (U8 *) locinput;
8355             break;
8356
8357
8358         case CLOSE:  /*  )  */
8359             n = ARG(scan);  /* which paren pair */
8360             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8361                              locinput - reginfo->strbeg);
8362             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8363                 goto fake_end;
8364
8365             break;
8366
8367         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
8368
8369             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8370             {
8371                 sayNO;
8372             }
8373
8374             break;
8375
8376
8377         case ACCEPT:  /*  (*ACCEPT)  */
8378             if (scan->flags)
8379                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8380             if (ARG2L(scan)){
8381                 regnode *cursor;
8382                 for (cursor=scan;
8383                      cursor && OP(cursor)!=END;
8384                      cursor=regnext(cursor))
8385                 {
8386                     if ( OP(cursor)==CLOSE ){
8387                         n = ARG(cursor);
8388                         if ( n <= lastopen ) {
8389                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8390                                              locinput - reginfo->strbeg);
8391                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8392                                 break;
8393                         }
8394                     }
8395                 }
8396             }
8397             goto fake_end;
8398             /* NOTREACHED */
8399
8400         case GROUPP:  /*  (?(1))  */
8401             n = ARG(scan);  /* which paren pair */
8402             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
8403             break;
8404
8405         case GROUPPN:  /*  (?(<name>))  */
8406             /* reg_check_named_buff_matched returns 0 for no match */
8407             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8408             break;
8409
8410         case INSUBP:   /*  (?(R))  */
8411             n = ARG(scan);
8412             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8413              * of SCAN is already set up as matches a eval.close_paren */
8414             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8415             break;
8416
8417         case DEFINEP:  /*  (?(DEFINE))  */
8418             sw = 0;
8419             break;
8420
8421         case IFTHEN:   /*  (?(cond)A|B)  */
8422             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8423             if (sw)
8424                 next = NEXTOPER(NEXTOPER(scan));
8425             else {
8426                 next = scan + ARG(scan);
8427                 if (OP(next) == IFTHEN) /* Fake one. */
8428                     next = NEXTOPER(NEXTOPER(next));
8429             }
8430             break;
8431
8432         case LOGICAL:  /* modifier for EVAL and IFMATCH */
8433             logical = scan->flags;
8434             break;
8435
8436 /*******************************************************************
8437
8438 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8439 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8440 STAR/PLUS/CURLY/CURLYN are used instead.)
8441
8442 A*B is compiled as <CURLYX><A><WHILEM><B>
8443
8444 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8445 state, which contains the current count, initialised to -1. It also sets
8446 cur_curlyx to point to this state, with any previous value saved in the
8447 state block.
8448
8449 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8450 since the pattern may possibly match zero times (i.e. it's a while {} loop
8451 rather than a do {} while loop).
8452
8453 Each entry to WHILEM represents a successful match of A. The count in the
8454 CURLYX block is incremented, another WHILEM state is pushed, and execution
8455 passes to A or B depending on greediness and the current count.
8456
8457 For example, if matching against the string a1a2a3b (where the aN are
8458 substrings that match /A/), then the match progresses as follows: (the
8459 pushed states are interspersed with the bits of strings matched so far):
8460
8461     <CURLYX cnt=-1>
8462     <CURLYX cnt=0><WHILEM>
8463     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8464     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8465     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8466     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8467
8468 (Contrast this with something like CURLYM, which maintains only a single
8469 backtrack state:
8470
8471     <CURLYM cnt=0> a1
8472     a1 <CURLYM cnt=1> a2
8473     a1 a2 <CURLYM cnt=2> a3
8474     a1 a2 a3 <CURLYM cnt=3> b
8475 )
8476
8477 Each WHILEM state block marks a point to backtrack to upon partial failure
8478 of A or B, and also contains some minor state data related to that
8479 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
8480 overall state, such as the count, and pointers to the A and B ops.
8481
8482 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8483 must always point to the *current* CURLYX block, the rules are:
8484
8485 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8486 and set cur_curlyx to point the new block.
8487
8488 When popping the CURLYX block after a successful or unsuccessful match,
8489 restore the previous cur_curlyx.
8490
8491 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8492 to the outer one saved in the CURLYX block.
8493
8494 When popping the WHILEM block after a successful or unsuccessful B match,
8495 restore the previous cur_curlyx.
8496
8497 Here's an example for the pattern (AI* BI)*BO
8498 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8499
8500 cur_
8501 curlyx backtrack stack
8502 ------ ---------------
8503 NULL
8504 CO     <CO prev=NULL> <WO>
8505 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8506 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8507 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8508
8509 At this point the pattern succeeds, and we work back down the stack to
8510 clean up, restoring as we go:
8511
8512 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8513 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8514 CO     <CO prev=NULL> <WO>
8515 NULL
8516
8517 *******************************************************************/
8518
8519 #define ST st->u.curlyx
8520
8521         case CURLYX:    /* start of /A*B/  (for complex A) */
8522         {
8523             /* No need to save/restore up to this paren */
8524             I32 parenfloor = scan->flags;
8525
8526             assert(next); /* keep Coverity happy */
8527             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8528                 next += ARG(next);
8529
8530             /* XXXX Probably it is better to teach regpush to support
8531                parenfloor > maxopenparen ... */
8532             if (parenfloor > (I32)rex->lastparen)
8533                 parenfloor = rex->lastparen; /* Pessimization... */
8534
8535             ST.prev_curlyx= cur_curlyx;
8536             cur_curlyx = st;
8537             ST.cp = PL_savestack_ix;
8538
8539             /* these fields contain the state of the current curly.
8540              * they are accessed by subsequent WHILEMs */
8541             ST.parenfloor = parenfloor;
8542             ST.me = scan;
8543             ST.B = next;
8544             ST.minmod = minmod;
8545             minmod = 0;
8546             ST.count = -1;      /* this will be updated by WHILEM */
8547             ST.lastloc = NULL;  /* this will be updated by WHILEM */
8548
8549             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8550                                 script_run_begin);
8551             NOT_REACHED; /* NOTREACHED */
8552         }
8553
8554         case CURLYX_end: /* just finished matching all of A*B */
8555             cur_curlyx = ST.prev_curlyx;
8556             sayYES;
8557             NOT_REACHED; /* NOTREACHED */
8558
8559         case CURLYX_end_fail: /* just failed to match all of A*B */
8560             regcpblow(ST.cp);
8561             cur_curlyx = ST.prev_curlyx;
8562             sayNO;
8563             NOT_REACHED; /* NOTREACHED */
8564
8565
8566 #undef ST
8567 #define ST st->u.whilem
8568
8569         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
8570         {
8571             /* see the discussion above about CURLYX/WHILEM */
8572             I32 n;
8573             int min, max;
8574             regnode *A;
8575
8576             assert(cur_curlyx); /* keep Coverity happy */
8577
8578             min = ARG1(cur_curlyx->u.curlyx.me);
8579             max = ARG2(cur_curlyx->u.curlyx.me);
8580             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8581             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8582             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8583             ST.cache_offset = 0;
8584             ST.cache_mask = 0;
8585
8586
8587             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
8588                   depth, (long)n, min, max)
8589             );
8590
8591             /* First just match a string of min A's. */
8592
8593             if (n < min) {
8594                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8595                 cur_curlyx->u.curlyx.lastloc = locinput;
8596                 REGCP_SET(ST.lastcp);
8597
8598                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8599                                 script_run_begin);
8600                 NOT_REACHED; /* NOTREACHED */
8601             }
8602
8603             /* If degenerate A matches "", assume A done. */
8604
8605             if (locinput == cur_curlyx->u.curlyx.lastloc) {
8606                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
8607                    depth)
8608                 );
8609                 goto do_whilem_B_max;
8610             }
8611
8612             /* super-linear cache processing.
8613              *
8614              * The idea here is that for certain types of CURLYX/WHILEM -
8615              * principally those whose upper bound is infinity (and
8616              * excluding regexes that have things like \1 and other very
8617              * non-regular expresssiony things), then if a pattern like
8618              * /....A*.../ fails and we backtrack to the WHILEM, then we
8619              * make a note that this particular WHILEM op was at string
8620              * position 47 (say) when the rest of pattern failed. Then, if
8621              * we ever find ourselves back at that WHILEM, and at string
8622              * position 47 again, we can just fail immediately rather than
8623              * running the rest of the pattern again.
8624              *
8625              * This is very handy when patterns start to go
8626              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8627              * with a combinatorial explosion of backtracking.
8628              *
8629              * The cache is implemented as a bit array, with one bit per
8630              * string byte position per WHILEM op (up to 16) - so its
8631              * between 0.25 and 2x the string size.
8632              *
8633              * To avoid allocating a poscache buffer every time, we do an
8634              * initially countdown; only after we have  executed a WHILEM
8635              * op (string-length x #WHILEMs) times do we allocate the
8636              * cache.
8637              *
8638              * The top 4 bits of scan->flags byte say how many different
8639              * relevant CURLLYX/WHILEM op pairs there are, while the
8640              * bottom 4-bits is the identifying index number of this
8641              * WHILEM.
8642              */
8643
8644             if (scan->flags) {
8645
8646                 if (!reginfo->poscache_maxiter) {
8647                     /* start the countdown: Postpone detection until we
8648                      * know the match is not *that* much linear. */
8649                     reginfo->poscache_maxiter
8650                         =    (reginfo->strend - reginfo->strbeg + 1)
8651                            * (scan->flags>>4);
8652                     /* possible overflow for long strings and many CURLYX's */
8653                     if (reginfo->poscache_maxiter < 0)
8654                         reginfo->poscache_maxiter = I32_MAX;
8655                     reginfo->poscache_iter = reginfo->poscache_maxiter;
8656                 }
8657
8658                 if (reginfo->poscache_iter-- == 0) {
8659                     /* initialise cache */
8660                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8661                     regmatch_info_aux *const aux = reginfo->info_aux;
8662                     if (aux->poscache) {
8663                         if ((SSize_t)reginfo->poscache_size < size) {
8664                             Renew(aux->poscache, size, char);
8665                             reginfo->poscache_size = size;
8666                         }
8667                         Zero(aux->poscache, size, char);
8668                     }
8669                     else {
8670                         reginfo->poscache_size = size;
8671                         Newxz(aux->poscache, size, char);
8672                     }
8673                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8674       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8675                               PL_colors[4], PL_colors[5])
8676                     );
8677                 }
8678
8679                 if (reginfo->poscache_iter < 0) {
8680                     /* have we already failed at this position? */
8681                     SSize_t offset, mask;
8682
8683                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8684                     offset  = (scan->flags & 0xf) - 1
8685                                 +   (locinput - reginfo->strbeg)
8686                                   * (scan->flags>>4);
8687                     mask    = 1 << (offset % 8);
8688                     offset /= 8;
8689                     if (reginfo->info_aux->poscache[offset] & mask) {
8690                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8691                             depth)
8692                         );
8693                         cur_curlyx->u.curlyx.count--;
8694                         sayNO; /* cache records failure */
8695                     }
8696                     ST.cache_offset = offset;
8697                     ST.cache_mask   = mask;
8698                 }
8699             }
8700
8701             /* Prefer B over A for minimal matching. */
8702
8703             if (cur_curlyx->u.curlyx.minmod) {
8704                 ST.save_curlyx = cur_curlyx;
8705                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8706                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8707                                     locinput, loceol, script_run_begin);
8708                 NOT_REACHED; /* NOTREACHED */
8709             }
8710
8711             /* Prefer A over B for maximal matching. */
8712
8713             if (n < max) { /* More greed allowed? */
8714                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8715                             maxopenparen);
8716                 cur_curlyx->u.curlyx.lastloc = locinput;
8717                 REGCP_SET(ST.lastcp);
8718                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8719                                 script_run_begin);
8720                 NOT_REACHED; /* NOTREACHED */
8721             }
8722             goto do_whilem_B_max;
8723         }
8724         NOT_REACHED; /* NOTREACHED */
8725
8726         case WHILEM_B_min: /* just matched B in a minimal match */
8727         case WHILEM_B_max: /* just matched B in a maximal match */
8728             cur_curlyx = ST.save_curlyx;
8729             sayYES;
8730             NOT_REACHED; /* NOTREACHED */
8731
8732         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8733             cur_curlyx = ST.save_curlyx;
8734             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8735             cur_curlyx->u.curlyx.count--;
8736             CACHEsayNO;
8737             NOT_REACHED; /* NOTREACHED */
8738
8739         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8740             /* FALLTHROUGH */
8741         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8742             REGCP_UNWIND(ST.lastcp);
8743             regcppop(rex, &maxopenparen);
8744             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8745             cur_curlyx->u.curlyx.count--;
8746             CACHEsayNO;
8747             NOT_REACHED; /* NOTREACHED */
8748
8749         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8750             REGCP_UNWIND(ST.lastcp);
8751             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8752             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8753                 depth)
8754             );
8755           do_whilem_B_max:
8756             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8757                 && ckWARN(WARN_REGEXP)
8758                 && !reginfo->warned)
8759             {
8760                 reginfo->warned = TRUE;
8761                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8762                      "Complex regular subexpression recursion limit (%d) "
8763                      "exceeded",
8764                      REG_INFTY - 1);
8765             }
8766
8767             /* now try B */
8768             ST.save_curlyx = cur_curlyx;
8769             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8770             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8771                                 locinput, loceol, script_run_begin);
8772             NOT_REACHED; /* NOTREACHED */
8773
8774         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8775             cur_curlyx = ST.save_curlyx;
8776
8777             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8778                 /* Maximum greed exceeded */
8779                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8780                     && ckWARN(WARN_REGEXP)
8781                     && !reginfo->warned)
8782                 {
8783                     reginfo->warned     = TRUE;
8784                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8785                         "Complex regular subexpression recursion "
8786                         "limit (%d) exceeded",
8787                         REG_INFTY - 1);
8788                 }
8789                 cur_curlyx->u.curlyx.count--;
8790                 CACHEsayNO;
8791             }
8792
8793             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8794             );
8795             /* Try grabbing another A and see if it helps. */
8796             cur_curlyx->u.curlyx.lastloc = locinput;
8797             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8798                             maxopenparen);
8799             REGCP_SET(ST.lastcp);
8800             PUSH_STATE_GOTO(WHILEM_A_min,
8801                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8802                 locinput, loceol, script_run_begin);
8803             NOT_REACHED; /* NOTREACHED */
8804
8805 #undef  ST
8806 #define ST st->u.branch
8807
8808         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8809             next = scan + ARG(scan);
8810             if (next == scan)
8811                 next = NULL;
8812             scan = NEXTOPER(scan);
8813             /* FALLTHROUGH */
8814
8815         case BRANCH:        /*  /(...|A|...)/ */
8816             scan = NEXTOPER(scan); /* scan now points to inner node */
8817             ST.lastparen = rex->lastparen;
8818             ST.lastcloseparen = rex->lastcloseparen;
8819             ST.next_branch = next;
8820             REGCP_SET(ST.cp);
8821
8822             /* Now go into the branch */
8823             if (has_cutgroup) {
8824                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8825                                     script_run_begin);
8826             } else {
8827                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8828                                 script_run_begin);
8829             }
8830             NOT_REACHED; /* NOTREACHED */
8831
8832         case CUTGROUP:  /*  /(*THEN)/  */
8833             sv_yes_mark = st->u.mark.mark_name = scan->flags
8834                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8835                 : NULL;
8836             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8837                             script_run_begin);
8838             NOT_REACHED; /* NOTREACHED */
8839
8840         case CUTGROUP_next_fail:
8841             do_cutgroup = 1;
8842             no_final = 1;
8843             if (st->u.mark.mark_name)
8844                 sv_commit = st->u.mark.mark_name;
8845             sayNO;
8846             NOT_REACHED; /* NOTREACHED */
8847
8848         case BRANCH_next:
8849             sayYES;
8850             NOT_REACHED; /* NOTREACHED */
8851
8852         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8853             if (do_cutgroup) {
8854                 do_cutgroup = 0;
8855                 no_final = 0;
8856             }
8857             REGCP_UNWIND(ST.cp);
8858             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8859             scan = ST.next_branch;
8860             /* no more branches? */
8861             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8862                 DEBUG_EXECUTE_r({
8863                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8864                         depth,
8865                         PL_colors[4],
8866                         PL_colors[5] );
8867                 });
8868                 sayNO_SILENT;
8869             }
8870             continue; /* execute next BRANCH[J] op */
8871             /* NOTREACHED */
8872
8873         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8874             minmod = 1;
8875             break;
8876
8877 #undef  ST
8878 #define ST st->u.curlym
8879
8880         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8881
8882             /* This is an optimisation of CURLYX that enables us to push
8883              * only a single backtracking state, no matter how many matches
8884              * there are in {m,n}. It relies on the pattern being constant
8885              * length, with no parens to influence future backrefs
8886              */
8887
8888             ST.me = scan;
8889             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8890
8891             ST.lastparen      = rex->lastparen;
8892             ST.lastcloseparen = rex->lastcloseparen;
8893
8894             /* if paren positive, emulate an OPEN/CLOSE around A */
8895             if (ST.me->flags) {
8896                 U32 paren = ST.me->flags;
8897                 if (paren > maxopenparen)
8898                     maxopenparen = paren;
8899                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8900             }
8901             ST.A = scan;
8902             ST.B = next;
8903             ST.alen = 0;
8904             ST.count = 0;
8905             ST.minmod = minmod;
8906             minmod = 0;
8907             ST.Binfo.count = -1;
8908             REGCP_SET(ST.cp);
8909
8910             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8911                 goto curlym_do_B;
8912
8913           curlym_do_A: /* execute the A in /A{m,n}B/  */
8914             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8915                                 script_run_begin);
8916             NOT_REACHED; /* NOTREACHED */
8917
8918         case CURLYM_A: /* we've just matched an A */
8919             ST.count++;
8920             /* after first match, determine A's length: u.curlym.alen */
8921             if (ST.count == 1) {
8922                 if (reginfo->is_utf8_target) {
8923                     char *s = st->locinput;
8924                     while (s < locinput) {
8925                         ST.alen++;
8926                         s += UTF8SKIP(s);
8927                     }
8928                 }
8929                 else {
8930                     ST.alen = locinput - st->locinput;
8931                 }
8932                 if (ST.alen == 0)
8933                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8934             }
8935             DEBUG_EXECUTE_r(
8936                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8937                           depth, (IV) ST.count, (IV)ST.alen)
8938             );
8939
8940             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8941                 goto fake_end;
8942
8943             {
8944                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8945                 if ( max == REG_INFTY || ST.count < max )
8946                     goto curlym_do_A; /* try to match another A */
8947             }
8948             goto curlym_do_B; /* try to match B */
8949
8950         case CURLYM_A_fail: /* just failed to match an A */
8951             REGCP_UNWIND(ST.cp);
8952
8953
8954             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8955                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8956                 sayNO;
8957
8958           curlym_do_B: /* execute the B in /A{m,n}B/  */
8959             if (ST.Binfo.count < 0) {
8960                 /* calculate possible match of 1st char following curly */
8961                 assert(ST.B);
8962                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8963                     regnode *text_node = ST.B;
8964                     if (! HAS_TEXT(text_node))
8965                         FIND_NEXT_IMPT(text_node);
8966                     if (PL_regkind[OP(text_node)] == EXACT) {
8967                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
8968                                                         &ST.Binfo, reginfo))
8969                         {
8970                             sayNO;
8971                         }
8972                     }
8973                 }
8974             }
8975
8976             DEBUG_EXECUTE_r(
8977                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8978                     depth, (IV)ST.count)
8979             );
8980             if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
8981                 assert(ST.Binfo.count > 0);
8982
8983                 /* Do a quick test to hopefully rule out most non-matches */
8984                 if (     locinput + ST.Binfo.min_length > loceol
8985                     || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
8986                 {
8987                     DEBUG_OPTIMISE_r(
8988                         Perl_re_exec_indentf( aTHX_
8989                             "CURLYM Fast bail next target=0x%X anded==0x%X"
8990                                                                 " mask=0x%X\n",
8991                             depth,
8992                             (int) nextbyte, ST.Binfo.first_byte_anded,
8993                                             ST.Binfo.first_byte_mask)
8994                     );
8995                     state_num = CURLYM_B_fail;
8996                     goto reenter_switch;
8997                 }
8998             }
8999
9000             if (ST.me->flags) {
9001                 /* emulate CLOSE: mark current A as captured */
9002                 U32 paren = (U32)ST.me->flags;
9003                 if (ST.count) {
9004                     CLOSE_CAPTURE(paren,
9005                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
9006                         locinput - reginfo->strbeg);
9007                 }
9008                 else
9009                     rex->offs[paren].end = -1;
9010
9011                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9012                 {
9013                     if (ST.count)
9014                         goto fake_end;
9015                     else
9016                         sayNO;
9017                 }
9018             }
9019
9020             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
9021                             script_run_begin);
9022             NOT_REACHED; /* NOTREACHED */
9023
9024         case CURLYM_B_fail: /* just failed to match a B */
9025             REGCP_UNWIND(ST.cp);
9026             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9027             if (ST.minmod) {
9028                 I32 max = ARG2(ST.me);
9029                 if (max != REG_INFTY && ST.count == max)
9030                     sayNO;
9031                 goto curlym_do_A; /* try to match a further A */
9032             }
9033             /* backtrack one A */
9034             if (ST.count == ARG1(ST.me) /* min */)
9035                 sayNO;
9036             ST.count--;
9037             SET_locinput(HOPc(locinput, -ST.alen));
9038             goto curlym_do_B; /* try to match B */
9039
9040 #undef ST
9041 #define ST st->u.curly
9042
9043 #define CURLY_SETPAREN(paren, success) \
9044     if (paren) { \
9045         if (success) { \
9046             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
9047                                  locinput - reginfo->strbeg); \
9048         } \
9049         else { \
9050             rex->offs[paren].end = -1; \
9051             rex->lastparen      = ST.lastparen; \
9052             rex->lastcloseparen = ST.lastcloseparen; \
9053         } \
9054     }
9055
9056         case STAR:              /*  /A*B/ where A is width 1 char */
9057             ST.paren = 0;
9058             ST.min = 0;
9059             ST.max = REG_INFTY;
9060             scan = NEXTOPER(scan);
9061             goto repeat;
9062
9063         case PLUS:              /*  /A+B/ where A is width 1 char */
9064             ST.paren = 0;
9065             ST.min = 1;
9066             ST.max = REG_INFTY;
9067             scan = NEXTOPER(scan);
9068             goto repeat;
9069
9070         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
9071             ST.paren = scan->flags;     /* Which paren to set */
9072             ST.lastparen      = rex->lastparen;
9073             ST.lastcloseparen = rex->lastcloseparen;
9074             if (ST.paren > maxopenparen)
9075                 maxopenparen = ST.paren;
9076             ST.min = ARG1(scan);  /* min to match */
9077             ST.max = ARG2(scan);  /* max to match */
9078             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
9079
9080             /* handle the single-char capture called as a GOSUB etc */
9081             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9082             {
9083                 char *li = locinput;
9084                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9085                     sayNO;
9086                 SET_locinput(li);
9087                 goto fake_end;
9088             }
9089
9090             goto repeat;
9091
9092         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
9093             ST.paren = 0;
9094             ST.min = ARG1(scan);  /* min to match */
9095             ST.max = ARG2(scan);  /* max to match */
9096             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
9097           repeat:
9098             /*
9099             * Lookahead to avoid useless match attempts
9100             * when we know what character comes next.
9101             *
9102             * Used to only do .*x and .*?x, but now it allows
9103             * for )'s, ('s and (?{ ... })'s to be in the way
9104             * of the quantifier and the EXACT-like node.  -- japhy
9105             */
9106
9107             assert(ST.min <= ST.max);
9108             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9109                 ST.Binfo.count = 0;
9110             }
9111             else {
9112                 regnode *text_node = next;
9113
9114                 if (! HAS_TEXT(text_node))
9115                     FIND_NEXT_IMPT(text_node);
9116
9117                 if (! HAS_TEXT(text_node))
9118                     ST.Binfo.count = 0;
9119                 else {
9120                     if ( PL_regkind[OP(text_node)] != EXACT ) {
9121                         ST.Binfo.count = 0;
9122                     }
9123                     else {
9124                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9125                                                         &ST.Binfo, reginfo))
9126                         {
9127                             sayNO;
9128                         }
9129                     }
9130                 }
9131             }
9132
9133             ST.A = scan;
9134             ST.B = next;
9135             if (minmod) {
9136                 char *li = locinput;
9137                 minmod = 0;
9138                 if (ST.min &&
9139                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9140                             < ST.min)
9141                     sayNO;
9142                 SET_locinput(li);
9143                 ST.count = ST.min;
9144                 REGCP_SET(ST.cp);
9145
9146                 if (ST.Binfo.count <= 0)
9147                     goto curly_try_B_min;
9148
9149                 ST.oldloc = locinput;
9150
9151                 /* set ST.maxpos to the furthest point along the
9152                  * string that could possibly match, i.e., that a match could
9153                  * start at. */
9154                 if  (ST.max == REG_INFTY) {
9155                     ST.maxpos = loceol - 1;
9156                     if (utf8_target)
9157                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9158                             ST.maxpos--;
9159                 }
9160                 else if (utf8_target) {
9161                     int m = ST.max - ST.min;
9162                     for (ST.maxpos = locinput;
9163                          m >0 && ST.maxpos <  loceol; m--)
9164                         ST.maxpos += UTF8SKIP(ST.maxpos);
9165                 }
9166                 else {
9167                     ST.maxpos = locinput + ST.max - ST.min;
9168                     if (ST.maxpos >=  loceol)
9169                         ST.maxpos =  loceol - 1;
9170                 }
9171                 goto curly_try_B_min_known;
9172
9173             }
9174             else {
9175                 /* avoid taking address of locinput, so it can remain
9176                  * a register var */
9177                 char *li = locinput;
9178                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9179                 if (ST.count < ST.min)
9180                     sayNO;
9181                 SET_locinput(li);
9182                 if ((ST.count > ST.min)
9183                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
9184                 {
9185                     /* A{m,n} must come at the end of the string, there's
9186                      * no point in backing off ... */
9187                     ST.min = ST.count;
9188                     /* ...except that $ and \Z can match before *and* after
9189                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
9190                        We may back off by one in this case. */
9191                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9192                         ST.min--;
9193                 }
9194                 REGCP_SET(ST.cp);
9195                 goto curly_try_B_max;
9196             }
9197             NOT_REACHED; /* NOTREACHED */
9198
9199         case CURLY_B_min_fail:
9200             /* failed to find B in a non-greedy match. */
9201
9202             REGCP_UNWIND(ST.cp);
9203             if (ST.paren) {
9204                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9205             }
9206
9207             if (ST.Binfo.count == 0) {
9208                 /* failed -- move forward one */
9209                 char *li = locinput;
9210                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9211                     sayNO;
9212                 }
9213                 locinput = li;
9214                 ST.count++;
9215                 if (!(   ST.count <= ST.max
9216                         /* count overflow ? */
9217                      || (ST.max == REG_INFTY && ST.count > 0))
9218                 )
9219                     sayNO;
9220             }
9221             else {
9222                 int n;
9223                 /* Couldn't or didn't -- move forward. */
9224                 ST.oldloc = locinput;
9225                 if (utf8_target)
9226                     locinput += UTF8SKIP(locinput);
9227                 else
9228                     locinput++;
9229                 ST.count++;
9230
9231               curly_try_B_min_known:
9232                 /* find the next place where 'B' could work, then call B */
9233                 if (locinput + ST.Binfo.initial_exact < loceol) {
9234                     if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9235
9236                         /* Here, the mask is all 1's for the entire length of
9237                          * any possible match.  (That actually means that there
9238                          * is only one possible match.)  Look for the next
9239                          * occurrence */
9240                         locinput = ninstr(locinput, loceol,
9241                                         (char *) ST.Binfo.matches,
9242                                         (char *) ST.Binfo.matches
9243                                                     + ST.Binfo.initial_exact);
9244                         if (locinput == NULL) {
9245                             sayNO;
9246                         }
9247                     }
9248                     else do {
9249                         /* If the first byte(s) of the mask are all ones, it
9250                          * means those bytes must match identically, so can use
9251                          * ninstr() to find the next possible matchpoint */
9252                         if (ST.Binfo.initial_exact > 0) {
9253                             locinput = ninstr(locinput, loceol,
9254                                               (char *) ST.Binfo.matches,
9255                                               (char *) ST.Binfo.matches
9256                                                      + ST.Binfo.initial_exact);
9257                         }
9258                         else { /* Otherwise find the next byte that matches,
9259                                   masked */
9260                             locinput = (char *) find_next_masked(
9261                                                 (U8 *) locinput, (U8 *) loceol,
9262                                                 ST.Binfo.first_byte_anded,
9263                                                 ST.Binfo.first_byte_mask);
9264                             /* Advance to the end of a multi-byte character */
9265                             if (utf8_target) {
9266                                 while (   locinput < loceol
9267                                     && UTF8_IS_CONTINUATION(*locinput))
9268                                 {
9269                                     locinput++;
9270                                 }
9271                             }
9272                         }
9273                         if (   locinput == NULL
9274                             || locinput + ST.Binfo.min_length > loceol)
9275                         {
9276                             sayNO;
9277                         }
9278
9279                         /* Here, we have found a possible match point; if can't
9280                          * rule it out, quit the loop so can check fully */
9281                         if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9282                             break;
9283                         }
9284
9285                         locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9286
9287                     } while (locinput <= ST.maxpos);
9288                 }
9289
9290                 if (locinput > ST.maxpos)
9291                     sayNO;
9292
9293                 n = (utf8_target)
9294                     ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9295                     : (STRLEN) (locinput - ST.oldloc);
9296
9297
9298                 /* Here is at the beginning of a character that meets the mask
9299                  * criteria.  Need to make sure that some real possibility */
9300
9301                 if (n) {
9302                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9303                      * at what may be the beginning of b; check that everything
9304                      * between oldloc and locinput matches */
9305                     char *li = ST.oldloc;
9306                     ST.count += n;
9307                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9308                         sayNO;
9309                     assert(n == REG_INFTY || locinput == li);
9310                 }
9311             }
9312
9313           curly_try_B_min:
9314             CURLY_SETPAREN(ST.paren, ST.count);
9315             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9316                             script_run_begin);
9317             NOT_REACHED; /* NOTREACHED */
9318
9319
9320           curly_try_B_max:
9321             /* a successful greedy match: now try to match B */
9322             if (        ST.Binfo.count <= 0
9323                 || (    ST.Binfo.count > 0
9324                     &&  locinput + ST.Binfo.min_length <= loceol
9325                     &&  S_test_EXACTISH_ST(locinput, ST.Binfo)))
9326             {
9327                 CURLY_SETPAREN(ST.paren, ST.count);
9328                 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9329                                 script_run_begin);
9330                 NOT_REACHED; /* NOTREACHED */
9331             }
9332             /* FALLTHROUGH */
9333
9334         case CURLY_B_max_fail:
9335             /* failed to find B in a greedy match */
9336
9337             REGCP_UNWIND(ST.cp);
9338             if (ST.paren) {
9339                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9340             }
9341             /*  back up. */
9342             if (--ST.count < ST.min)
9343                 sayNO;
9344             locinput = HOPc(locinput, -1);
9345             goto curly_try_B_max;
9346
9347 #undef ST
9348
9349         case END: /*  last op of main pattern  */
9350           fake_end:
9351             if (cur_eval) {
9352                 /* we've just finished A in /(??{A})B/; now continue with B */
9353                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9354                 st->u.eval.prev_rex = rex_sv;           /* inner */
9355
9356                 /* Save *all* the positions. */
9357                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9358                 rex_sv = CUR_EVAL.prev_rex;
9359                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9360                 SET_reg_curpm(rex_sv);
9361                 rex = ReANY(rex_sv);
9362                 rexi = RXi_GET(rex);
9363
9364                 st->u.eval.prev_curlyx = cur_curlyx;
9365                 cur_curlyx = CUR_EVAL.prev_curlyx;
9366
9367                 REGCP_SET(st->u.eval.lastcp);
9368
9369                 /* Restore parens of the outer rex without popping the
9370                  * savestack */
9371                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9372
9373                 st->u.eval.prev_eval = cur_eval;
9374                 cur_eval = CUR_EVAL.prev_eval;
9375                 DEBUG_EXECUTE_r(
9376                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
9377                                       depth, cur_eval););
9378                 if ( nochange_depth )
9379                     nochange_depth--;
9380
9381                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9382
9383                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
9384                                     st->u.eval.prev_eval->u.eval.B,
9385                                     locinput, loceol, script_run_begin);
9386             }
9387
9388             if (locinput < reginfo->till) {
9389                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9390                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9391                                       PL_colors[4],
9392                                       (long)(locinput - startpos),
9393                                       (long)(reginfo->till - startpos),
9394                                       PL_colors[5]));
9395
9396                 sayNO_SILENT;           /* Cannot match: too short. */
9397             }
9398             sayYES;                     /* Success! */
9399
9400         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
9401             DEBUG_EXECUTE_r(
9402             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
9403                 depth, PL_colors[4], PL_colors[5]));
9404             sayYES;                     /* Success! */
9405
9406 #undef  ST
9407 #define ST st->u.ifmatch
9408
9409         case SUSPEND:   /* (?>A) */
9410             ST.wanted = 1;
9411             ST.start = locinput;
9412             ST.end = loceol;
9413             ST.count = 1;
9414             goto do_ifmatch;
9415
9416         case UNLESSM:   /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9417             ST.wanted = 0;
9418             goto ifmatch_trivial_fail_test;
9419
9420         case IFMATCH:   /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9421             ST.wanted = 1;
9422           ifmatch_trivial_fail_test:
9423             ST.count = scan->next_off + 1; /* next_off repurposed to be
9424                                               lookbehind count, requires
9425                                               non-zero flags */
9426             if (! scan->flags) {    /* 'flags' zero means lookahed */
9427
9428                 /* Lookahead starts here and ends at the normal place */
9429                 ST.start = locinput;
9430                 ST.end = loceol;
9431             }
9432             else {
9433                 PERL_UINT_FAST8_T back_count = scan->flags;
9434                 char * s;
9435
9436                 /* Lookbehind can look beyond the current position */
9437                 ST.end = loceol;
9438
9439                 /* ... and starts at the first place in the input that is in
9440                  * the range of the possible start positions */
9441                 for (; ST.count > 0; ST.count--, back_count--) {
9442                     s = HOPBACKc(locinput, back_count);
9443                     if (s) {
9444                         ST.start = s;
9445                         goto do_ifmatch;
9446                     }
9447                 }
9448
9449                 /* If the lookbehind doesn't start in the actual string, is a
9450                  * trivial match failure */
9451                 if (logical) {
9452                     logical = 0;
9453                     sw = 1 - cBOOL(ST.wanted);
9454                 }
9455                 else if (ST.wanted)
9456                     sayNO;
9457
9458                 /* Here, we didn't want it to match, so is actually success */
9459                 next = scan + ARG(scan);
9460                 if (next == scan)
9461                     next = NULL;
9462                 break;
9463             }
9464
9465           do_ifmatch:
9466             ST.me = scan;
9467             ST.logical = logical;
9468             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9469
9470             /* execute body of (?...A) */
9471             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9472                                 ST.end, script_run_begin);
9473             NOT_REACHED; /* NOTREACHED */
9474
9475         {
9476             bool matched;
9477
9478         case IFMATCH_A_fail: /* body of (?...A) failed */
9479             if (! ST.logical && ST.count > 1) {
9480
9481                 /* It isn't a real failure until we've tried all starting
9482                  * positions.  Move to the next starting position and retry */
9483                 ST.count--;
9484                 ST.start = HOPc(ST.start, 1);
9485                 scan = ST.me;
9486                 logical = ST.logical;
9487                 goto do_ifmatch;
9488             }
9489
9490             /* Here, all starting positions have been tried. */
9491             matched = FALSE;
9492             goto ifmatch_done;
9493
9494         case IFMATCH_A: /* body of (?...A) succeeded */
9495             matched = TRUE;
9496           ifmatch_done:
9497             sw = matched == ST.wanted;
9498             if (! ST.logical && !sw) {
9499                 sayNO;
9500             }
9501
9502             if (OP(ST.me) != SUSPEND) {
9503                 /* restore old position except for (?>...) */
9504                 locinput = st->locinput;
9505                 loceol = st->loceol;
9506                 script_run_begin = st->sr0;
9507             }
9508             scan = ST.me + ARG(ST.me);
9509             if (scan == ST.me)
9510                 scan = NULL;
9511             continue; /* execute B */
9512         }
9513
9514 #undef ST
9515
9516         case LONGJMP: /*  alternative with many branches compiles to
9517                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9518             next = scan + ARG(scan);
9519             if (next == scan)
9520                 next = NULL;
9521             break;
9522
9523         case COMMIT:  /*  (*COMMIT)  */
9524             reginfo->cutpoint = loceol;
9525             /* FALLTHROUGH */
9526
9527         case PRUNE:   /*  (*PRUNE)   */
9528             if (scan->flags)
9529                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9530             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9531                             script_run_begin);
9532             NOT_REACHED; /* NOTREACHED */
9533
9534         case COMMIT_next_fail:
9535             no_final = 1;
9536             /* FALLTHROUGH */
9537             sayNO;
9538             NOT_REACHED; /* NOTREACHED */
9539
9540         case OPFAIL:   /* (*FAIL)  */
9541             if (scan->flags)
9542                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9543             if (logical) {
9544                 /* deal with (?(?!)X|Y) properly,
9545                  * make sure we trigger the no branch
9546                  * of the trailing IFTHEN structure*/
9547                 sw= 0;
9548                 break;
9549             } else {
9550                 sayNO;
9551             }
9552             NOT_REACHED; /* NOTREACHED */
9553
9554 #define ST st->u.mark
9555         case MARKPOINT: /*  (*MARK:foo)  */
9556             ST.prev_mark = mark_state;
9557             ST.mark_name = sv_commit = sv_yes_mark
9558                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9559             mark_state = st;
9560             ST.mark_loc = locinput;
9561             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9562                                 script_run_begin);
9563             NOT_REACHED; /* NOTREACHED */
9564
9565         case MARKPOINT_next:
9566             mark_state = ST.prev_mark;
9567             sayYES;
9568             NOT_REACHED; /* NOTREACHED */
9569
9570         case MARKPOINT_next_fail:
9571             if (popmark && sv_eq(ST.mark_name,popmark))
9572             {
9573                 if (ST.mark_loc > startpoint)
9574                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9575                 popmark = NULL; /* we found our mark */
9576                 sv_commit = ST.mark_name;
9577
9578                 DEBUG_EXECUTE_r({
9579                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9580                             depth,
9581                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9582                 });
9583             }
9584             mark_state = ST.prev_mark;
9585             sv_yes_mark = mark_state ?
9586                 mark_state->u.mark.mark_name : NULL;
9587             sayNO;
9588             NOT_REACHED; /* NOTREACHED */
9589
9590         case SKIP:  /*  (*SKIP)  */
9591             if (!scan->flags) {
9592                 /* (*SKIP) : if we fail we cut here*/
9593                 ST.mark_name = NULL;
9594                 ST.mark_loc = locinput;
9595                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9596                                 script_run_begin);
9597             } else {
9598                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9599                    otherwise do nothing.  Meaning we need to scan
9600                  */
9601                 regmatch_state *cur = mark_state;
9602                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9603
9604                 while (cur) {
9605                     if ( sv_eq( cur->u.mark.mark_name,
9606                                 find ) )
9607                     {
9608                         ST.mark_name = find;
9609                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9610                                          script_run_begin);
9611                     }
9612                     cur = cur->u.mark.prev_mark;
9613                 }
9614             }
9615             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9616             break;
9617
9618         case SKIP_next_fail:
9619             if (ST.mark_name) {
9620                 /* (*CUT:NAME) - Set up to search for the name as we
9621                    collapse the stack*/
9622                 popmark = ST.mark_name;
9623             } else {
9624                 /* (*CUT) - No name, we cut here.*/
9625                 if (ST.mark_loc > startpoint)
9626                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9627                 /* but we set sv_commit to latest mark_name if there
9628                    is one so they can test to see how things lead to this
9629                    cut */
9630                 if (mark_state)
9631                     sv_commit=mark_state->u.mark.mark_name;
9632             }
9633             no_final = 1;
9634             sayNO;
9635             NOT_REACHED; /* NOTREACHED */
9636 #undef ST
9637
9638         case LNBREAK: /* \R */
9639             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9640                 locinput += n;
9641             } else
9642                 sayNO;
9643             break;
9644
9645         default:
9646             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9647                           PTR2UV(scan), OP(scan));
9648             Perl_croak(aTHX_ "regexp memory corruption");
9649
9650         /* this is a point to jump to in order to increment
9651          * locinput by one character */
9652           increment_locinput:
9653             assert(!NEXTCHR_IS_EOS);
9654             if (utf8_target) {
9655                 locinput += PL_utf8skip[nextbyte];
9656                 /* locinput is allowed to go 1 char off the end (signifying
9657                  * EOS), but not 2+ */
9658                 if (locinput >  loceol)
9659                     sayNO;
9660             }
9661             else
9662                 locinput++;
9663             break;
9664
9665         } /* end switch */
9666
9667         /* switch break jumps here */
9668         scan = next; /* prepare to execute the next op and ... */
9669         continue;    /* ... jump back to the top, reusing st */
9670         /* NOTREACHED */
9671
9672       push_yes_state:
9673         /* push a state that backtracks on success */
9674         st->u.yes.prev_yes_state = yes_state;
9675         yes_state = st;
9676         /* FALLTHROUGH */
9677       push_state:
9678         /* push a new regex state, then continue at scan  */
9679         {
9680             regmatch_state *newst;
9681             DECLARE_AND_GET_RE_DEBUG_FLAGS;
9682
9683             DEBUG_r( /* DEBUG_STACK_r */
9684               if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9685                 regmatch_state *cur = st;
9686                 regmatch_state *curyes = yes_state;
9687                 U32 i;
9688                 regmatch_slab *slab = PL_regmatch_slab;
9689                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9690                     if (cur < SLAB_FIRST(slab)) {
9691                         slab = slab->prev;
9692                         cur = SLAB_LAST(slab);
9693                     }
9694                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9695                         depth,
9696                         i ? "    " : "push",
9697                         depth - i, PL_reg_name[cur->resume_state],
9698                         (curyes == cur) ? "yes" : ""
9699                     );
9700                     if (curyes == cur)
9701                         curyes = cur->u.yes.prev_yes_state;
9702                 }
9703             } else {
9704                 DEBUG_STATE_pp("push")
9705             });
9706             depth++;
9707             st->locinput = locinput;
9708             st->loceol = loceol;
9709             st->sr0 = script_run_begin;
9710             newst = st+1;
9711             if (newst >  SLAB_LAST(PL_regmatch_slab))
9712                 newst = S_push_slab(aTHX);
9713             PL_regmatch_state = newst;
9714
9715             locinput = pushinput;
9716             loceol = pusheol;
9717             script_run_begin = pushsr0;
9718             st = newst;
9719             continue;
9720             /* NOTREACHED */
9721         }
9722     }
9723 #ifdef SOLARIS_BAD_OPTIMIZER
9724 #  undef PL_charclass
9725 #endif
9726
9727     /*
9728     * We get here only if there's trouble -- normally "case END" is
9729     * the terminating point.
9730     */
9731     Perl_croak(aTHX_ "corrupted regexp pointers");
9732     NOT_REACHED; /* NOTREACHED */
9733
9734   yes:
9735     if (yes_state) {
9736         /* we have successfully completed a subexpression, but we must now
9737          * pop to the state marked by yes_state and continue from there */
9738         assert(st != yes_state);
9739 #ifdef DEBUGGING
9740         while (st != yes_state) {
9741             st--;
9742             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9743                 PL_regmatch_slab = PL_regmatch_slab->prev;
9744                 st = SLAB_LAST(PL_regmatch_slab);
9745             }
9746             DEBUG_STATE_r({
9747                 if (no_final) {
9748                     DEBUG_STATE_pp("pop (no final)");
9749                 } else {
9750                     DEBUG_STATE_pp("pop (yes)");
9751                 }
9752             });
9753             depth--;
9754         }
9755 #else
9756         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9757             || yes_state > SLAB_LAST(PL_regmatch_slab))
9758         {
9759             /* not in this slab, pop slab */
9760             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9761             PL_regmatch_slab = PL_regmatch_slab->prev;
9762             st = SLAB_LAST(PL_regmatch_slab);
9763         }
9764         depth -= (st - yes_state);
9765 #endif
9766         st = yes_state;
9767         yes_state = st->u.yes.prev_yes_state;
9768         PL_regmatch_state = st;
9769
9770         if (no_final) {
9771             locinput= st->locinput;
9772             loceol= st->loceol;
9773             script_run_begin = st->sr0;
9774         }
9775         state_num = st->resume_state + no_final;
9776         goto reenter_switch;
9777     }
9778
9779     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9780                           PL_colors[4], PL_colors[5]));
9781
9782     if (reginfo->info_aux_eval) {
9783         /* each successfully executed (?{...}) block does the equivalent of
9784          *   local $^R = do {...}
9785          * When popping the save stack, all these locals would be undone;
9786          * bypass this by setting the outermost saved $^R to the latest
9787          * value */
9788         /* I dont know if this is needed or works properly now.
9789          * see code related to PL_replgv elsewhere in this file.
9790          * Yves
9791          */
9792         if (oreplsv != GvSV(PL_replgv)) {
9793             sv_setsv(oreplsv, GvSV(PL_replgv));
9794             SvSETMAGIC(oreplsv);
9795         }
9796     }
9797     result = 1;
9798     goto final_exit;
9799
9800   no:
9801     DEBUG_EXECUTE_r(
9802         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9803             depth,
9804             PL_colors[4], PL_colors[5])
9805         );
9806
9807   no_silent:
9808     if (no_final) {
9809         if (yes_state) {
9810             goto yes;
9811         } else {
9812             goto final_exit;
9813         }
9814     }
9815     if (depth) {
9816         /* there's a previous state to backtrack to */
9817         st--;
9818         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9819             PL_regmatch_slab = PL_regmatch_slab->prev;
9820             st = SLAB_LAST(PL_regmatch_slab);
9821         }
9822         PL_regmatch_state = st;
9823         locinput= st->locinput;
9824         loceol= st->loceol;
9825         script_run_begin = st->sr0;
9826
9827         DEBUG_STATE_pp("pop");
9828         depth--;
9829         if (yes_state == st)
9830             yes_state = st->u.yes.prev_yes_state;
9831
9832         state_num = st->resume_state + 1; /* failure = success + 1 */
9833         PERL_ASYNC_CHECK();
9834         goto reenter_switch;
9835     }
9836     result = 0;
9837
9838   final_exit:
9839     if (rex->intflags & PREGf_VERBARG_SEEN) {
9840         SV *sv_err = get_sv("REGERROR", 1);
9841         SV *sv_mrk = get_sv("REGMARK", 1);
9842         if (result) {
9843             sv_commit = &PL_sv_no;
9844             if (!sv_yes_mark)
9845                 sv_yes_mark = &PL_sv_yes;
9846         } else {
9847             if (!sv_commit)
9848                 sv_commit = &PL_sv_yes;
9849             sv_yes_mark = &PL_sv_no;
9850         }
9851         assert(sv_err);
9852         assert(sv_mrk);
9853         sv_setsv(sv_err, sv_commit);
9854         sv_setsv(sv_mrk, sv_yes_mark);
9855     }
9856
9857
9858     if (last_pushed_cv) {
9859         dSP;
9860         /* see "Some notes about MULTICALL" above */
9861         POP_MULTICALL;
9862         PERL_UNUSED_VAR(SP);
9863     }
9864     else
9865         LEAVE_SCOPE(orig_savestack_ix);
9866
9867     assert(!result ||  locinput - reginfo->strbeg >= 0);
9868     return result ?  locinput - reginfo->strbeg : -1;
9869 }
9870
9871 /*
9872  - regrepeat - repeatedly match something simple, report how many
9873  *
9874  * What 'simple' means is a node which can be the operand of a quantifier like
9875  * '+', or {1,3}
9876  *
9877  * startposp - pointer to a pointer to the start position.  This is updated
9878  *             to point to the byte following the highest successful
9879  *             match.
9880  * p         - the regnode to be repeatedly matched against.
9881  * loceol    - pointer to the end position beyond which we aren't supposed to
9882  *             look.
9883  * reginfo   - struct holding match state, such as utf8_target
9884  * max       - maximum number of things to match.
9885  * depth     - (for debugging) backtracking depth.
9886  */
9887 STATIC I32
9888 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9889             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9890 {
9891     char *scan;     /* Pointer to current position in target string */
9892     I32 c;
9893     char *this_eol = loceol;   /* potentially adjusted version. */
9894     I32 hardcount = 0;  /* How many matches so far */
9895     bool utf8_target = reginfo->is_utf8_target;
9896     unsigned int to_complement = 0;  /* Invert the result? */
9897     _char_class_number classnum;
9898
9899     PERL_ARGS_ASSERT_REGREPEAT;
9900
9901     /* This routine is structured so that we switch on the input OP.  Each OP
9902      * case: statement contains a loop to repeatedly apply the OP, advancing
9903      * the input until it fails, or reaches the end of the input, or until it
9904      * reaches the upper limit of matches. */
9905
9906     scan = *startposp;
9907     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9908                                max */
9909         max = I32_MAX;
9910     else if (! utf8_target && this_eol - scan > max)
9911         this_eol = scan + max;
9912
9913     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9914      * to the maximum of how far we should go in it (but leaving it set to the
9915      * real end if the maximum permissible would take us beyond that).  This
9916      * allows us to make the loop exit condition that we haven't gone past
9917      * <this_eol> to also mean that we haven't exceeded the max permissible
9918      * count, saving a test each time through the loop.  But it assumes that
9919      * the OP matches a single byte, which is true for most of the OPs below
9920      * when applied to a non-UTF-8 target.  Those relatively few OPs that don't
9921      * have this characteristic have to compensate.
9922      *
9923      * There is no such adjustment for UTF-8 targets, sinc the number of bytes
9924      * per character can vary.  OPs will have to test both that the count is
9925      * less than the max permissible (using <hardcount> to keep track), and
9926      * that we are still within the bounds of the string (using <this_eol>.  A
9927      * few OPs match a single byte no matter what the encoding.  They can omit
9928      * the max test if, for the UTF-8 case, they do the adjustment that was
9929      * skipped above.
9930      *
9931      * Thus, the code above sets things up for the common case; and exceptional
9932      * cases need extra work; the common case is to make sure <scan> doesn't
9933      * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9934      * count doesn't exceed the maximum permissible */
9935
9936     switch (OP(p)) {
9937     case REG_ANY:
9938         if (utf8_target) {
9939             while (scan < this_eol && hardcount < max && *scan != '\n') {
9940                 scan += UTF8SKIP(scan);
9941                 hardcount++;
9942             }
9943         } else {
9944             scan = (char *) memchr(scan, '\n', this_eol - scan);
9945             if (! scan) {
9946                 scan = this_eol;
9947             }
9948         }
9949         break;
9950     case SANY:
9951         if (utf8_target) {
9952             while (scan < this_eol && hardcount < max) {
9953                 scan += UTF8SKIP(scan);
9954                 hardcount++;
9955             }
9956         }
9957         else
9958             scan = this_eol;
9959         break;
9960
9961     case EXACTL:
9962         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9963             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9964         }
9965         /* FALLTHROUGH */
9966
9967     case EXACTFL:
9968     case EXACTFLU8:
9969         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9970         goto do_exact;
9971
9972     case EXACT_REQ8:
9973     case LEXACT_REQ8:
9974     case EXACTFU_REQ8:
9975         if (! utf8_target) {
9976             break;
9977         }
9978         /* FALLTHROUGH */
9979
9980     case LEXACT:
9981     case EXACT:
9982     case EXACTF:
9983     case EXACTFAA_NO_TRIE:
9984     case EXACTFAA:
9985     case EXACTFU:
9986     case EXACTFUP:
9987
9988       do_exact: {
9989         struct next_matchable_info Binfo;
9990         PERL_UINT_FAST8_T definitive_len;
9991
9992         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9993
9994         /* Set up termination info, and quit if we can rule out that we've
9995          * gotten a match of the termination criteria */
9996         if (   ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
9997             ||   scan + Binfo.min_length > this_eol
9998             || ! S_test_EXACTISH_ST(scan, Binfo))
9999         {
10000             break;
10001         }
10002
10003         definitive_len = Binfo.initial_definitive;
10004
10005         /* Here there are potential matches, and the first byte(s) matched our
10006          * filter
10007          *
10008          * If we got a definitive match of some initial bytes, there is no
10009          * possibility of false positives as far as it got */
10010         if (definitive_len > 0) {
10011
10012             /* If as far as it got is the maximum possible, there were no false
10013              * positives at all.  Since we have everything set up, see how many
10014              * repeats there are. */
10015             if (definitive_len >= Binfo.max_length) {
10016
10017                 /* We've already found one match */
10018                 scan += definitive_len;
10019                 hardcount++;
10020
10021                 /* If want more than the one match, and there is room for more,
10022                  * see if there are any */
10023                 if (hardcount < max && scan + definitive_len <= this_eol) {
10024
10025                     /* If the character is only a single byte long, just span
10026                      * all such bytes. */
10027                     if (definitive_len == 1) {
10028                         const char * orig_scan = scan;
10029
10030                         if (this_eol - (scan - hardcount) > max) {
10031                             this_eol = scan - hardcount + max;
10032                         }
10033
10034                         /* Use different routines depending on whether it's an
10035                          * exact match or matches with a mask */
10036                         if (Binfo.initial_exact == 1) {
10037                             scan = (char *) find_span_end((U8 *) scan,
10038                                                           (U8 *) this_eol,
10039                                                           Binfo.matches[0]);
10040                         }
10041                         else {
10042                             scan = (char *) find_span_end_mask(
10043                                                        (U8 *) scan,
10044                                                        (U8 *) this_eol,
10045                                                        Binfo.first_byte_anded,
10046                                                        Binfo.first_byte_mask);
10047                         }
10048
10049                         hardcount += scan - orig_scan;
10050                     }
10051                     else { /* Here, the full character definitive match is more
10052                               than one byte */
10053                         while (   hardcount < max
10054                                && scan + definitive_len <= this_eol
10055                                && S_test_EXACTISH_ST(scan, Binfo))
10056                         {
10057                                 scan += definitive_len;
10058                                 hardcount++;
10059                         }
10060                     }
10061                 }
10062
10063                 break;
10064             }   /* End of a full character is definitively matched */
10065
10066             /* Here, an initial portion of the character matched definitively,
10067              * and the rest matched as well, but could have false positives */
10068
10069             do {
10070                 PERL_INT_FAST8_T i;
10071                 U8 * matches = Binfo.matches;
10072
10073                 /* The first bytes were definitive.  Look at the remaining */
10074                 for (i = 0; i < Binfo.count; i++) {
10075                     if (memEQ(scan + definitive_len,
10076                               matches + definitive_len,
10077                               Binfo.lengths[i] - definitive_len))
10078                     {
10079                         goto found_a_completion;
10080                     }
10081
10082                     matches += Binfo.lengths[i];
10083                 }
10084
10085                 /* Didn't find anything to complete our initial match.  Stop
10086                  * here */
10087                 break;
10088
10089               found_a_completion:
10090
10091                 /* Here, matched a full character, Include it in the result,
10092                  * and then look to see if the next char matches */
10093                 hardcount++;
10094                 scan += Binfo.lengths[i];
10095
10096             } while (   hardcount < max
10097                      && scan + definitive_len < this_eol
10098                      && S_test_EXACTISH_ST(scan, Binfo));
10099
10100             /* Here, have advanced as far as possible */
10101             break;
10102         } /* End of found some initial bytes that definitively matched */
10103
10104         /* Here, we can't rule out that we have found the beginning of 'B', but
10105          * there were no initial bytes that could rule out anything
10106          * definitively. Use brute force to examine all the possibilities */
10107         while (scan < this_eol && hardcount < max) {
10108             PERL_INT_FAST8_T i;
10109             U8 * matches = Binfo.matches;
10110
10111             for (i = 0; i < Binfo.count; i++) {
10112                 if (memEQ(scan, matches, Binfo.lengths[i])) {
10113                     goto found1;
10114                 }
10115
10116                 matches += Binfo.lengths[i];
10117             }
10118
10119             break;
10120
10121           found1:
10122             hardcount++;
10123             scan += Binfo.lengths[i];
10124         }
10125
10126         break;
10127       }
10128     case ANYOFPOSIXL:
10129     case ANYOFL:
10130         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10131          CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10132
10133         /* FALLTHROUGH */
10134     case ANYOFD:
10135     case ANYOF:
10136         if (utf8_target) {
10137             while (hardcount < max
10138                    && scan < this_eol
10139                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
10140             {
10141                 scan += UTF8SKIP(scan);
10142                 hardcount++;
10143             }
10144         }
10145         else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
10146             while (scan < this_eol
10147                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10148                 scan++;
10149         }
10150         else {
10151             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10152                 scan++;
10153         }
10154         break;
10155
10156     case ANYOFM:
10157         if (utf8_target && this_eol - scan > max) {
10158
10159             /* We didn't adjust <this_eol> at the beginning of this routine
10160              * because is UTF-8, but it is actually ok to do so, since here, to
10161              * match, 1 char == 1 byte. */
10162             this_eol = scan + max;
10163         }
10164
10165         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
10166         break;
10167
10168     case NANYOFM:
10169         if (utf8_target) {
10170             while (     hardcount < max
10171                    &&   scan < this_eol
10172                    &&  (*scan & FLAGS(p)) != ARG(p))
10173             {
10174                 scan += UTF8SKIP(scan);
10175                 hardcount++;
10176             }
10177         }
10178         else {
10179             scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
10180         }
10181         break;
10182
10183     case ANYOFH:
10184         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
10185             while (  hardcount < max
10186                    && scan < this_eol
10187                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10188                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10189             {
10190                 scan += UTF8SKIP(scan);
10191                 hardcount++;
10192             }
10193         }
10194         break;
10195
10196     case ANYOFHb:
10197         if (utf8_target) {  /* ANYOFHb only can match UTF-8 targets */
10198
10199             /* we know the first byte must be the FLAGS field */
10200             while (   hardcount < max
10201                    && scan < this_eol
10202                    && (U8) *scan == ANYOF_FLAGS(p)
10203                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
10204                                                               TRUE))
10205             {
10206                 scan += UTF8SKIP(scan);
10207                 hardcount++;
10208             }
10209         }
10210         break;
10211
10212     case ANYOFHr:
10213         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
10214             while (  hardcount < max
10215                    && scan < this_eol
10216                    && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10217                               LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10218                               HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10219                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10220                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10221             {
10222                 scan += UTF8SKIP(scan);
10223                 hardcount++;
10224             }
10225         }
10226         break;
10227
10228     case ANYOFHs:
10229         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
10230             while (   hardcount < max
10231                    && scan + FLAGS(p) < this_eol
10232                    && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10233                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10234             {
10235                 scan += UTF8SKIP(scan);
10236                 hardcount++;
10237             }
10238         }
10239         break;
10240
10241     case ANYOFR:
10242         if (utf8_target) {
10243             while (   hardcount < max
10244                    && scan < this_eol
10245                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10246                    && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10247                                                 (U8 *) this_eol,
10248                                                 NULL),
10249                                   ANYOFRbase(p), ANYOFRdelta(p)))
10250             {
10251                 scan += UTF8SKIP(scan);
10252                 hardcount++;
10253             }
10254         }
10255         else {
10256             while (   hardcount < max
10257                    && scan < this_eol
10258                    && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10259             {
10260                 scan++;
10261                 hardcount++;
10262             }
10263         }
10264         break;
10265
10266     case ANYOFRb:
10267         if (utf8_target) {
10268             while (   hardcount < max
10269                    && scan < this_eol
10270                    && (U8) *scan == ANYOF_FLAGS(p)
10271                    && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10272                                                 (U8 *) this_eol,
10273                                                 NULL),
10274                                   ANYOFRbase(p), ANYOFRdelta(p)))
10275             {
10276                 scan += UTF8SKIP(scan);
10277                 hardcount++;
10278             }
10279         }
10280         else {
10281             while (   hardcount < max
10282                    && scan < this_eol
10283                    && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10284             {
10285                 scan++;
10286                 hardcount++;
10287             }
10288         }
10289         break;
10290
10291     /* The argument (FLAGS) to all the POSIX node types is the class number */
10292
10293     case NPOSIXL:
10294         to_complement = 1;
10295         /* FALLTHROUGH */
10296
10297     case POSIXL:
10298         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10299         if (! utf8_target) {
10300             while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
10301                                                                    *scan)))
10302             {
10303                 scan++;
10304             }
10305         } else {
10306             while (hardcount < max && scan < this_eol
10307                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10308                                                                   (U8 *) scan,
10309                                                                   (U8 *) this_eol)))
10310             {
10311                 scan += UTF8SKIP(scan);
10312                 hardcount++;
10313             }
10314         }
10315         break;
10316
10317     case POSIXD:
10318         if (utf8_target) {
10319             goto utf8_posix;
10320         }
10321         /* FALLTHROUGH */
10322
10323     case POSIXA:
10324         if (utf8_target && this_eol - scan > max) {
10325
10326             /* We didn't adjust <this_eol> at the beginning of this routine
10327              * because is UTF-8, but it is actually ok to do so, since here, to
10328              * match, 1 char == 1 byte. */
10329             this_eol = scan + max;
10330         }
10331         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
10332             scan++;
10333         }
10334         break;
10335
10336     case NPOSIXD:
10337         if (utf8_target) {
10338             to_complement = 1;
10339             goto utf8_posix;
10340         }
10341         /* FALLTHROUGH */
10342
10343     case NPOSIXA:
10344         if (! utf8_target) {
10345             while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
10346                 scan++;
10347             }
10348         }
10349         else {
10350
10351             /* The complement of something that matches only ASCII matches all
10352              * non-ASCII, plus everything in ASCII that isn't in the class. */
10353             while (hardcount < max && scan < this_eol
10354                    && (   ! isASCII_utf8_safe(scan, loceol)
10355                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
10356             {
10357                 scan += UTF8SKIP(scan);
10358                 hardcount++;
10359             }
10360         }
10361         break;
10362
10363     case NPOSIXU:
10364         to_complement = 1;
10365         /* FALLTHROUGH */
10366
10367     case POSIXU:
10368         if (! utf8_target) {
10369             while (scan < this_eol && to_complement
10370                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
10371             {
10372                 scan++;
10373             }
10374         }
10375         else {
10376           utf8_posix:
10377             classnum = (_char_class_number) FLAGS(p);
10378             switch (classnum) {
10379                 default:
10380                     while (   hardcount < max && scan < this_eol
10381                            && to_complement ^ cBOOL(_invlist_contains_cp(
10382                                               PL_XPosix_ptrs[classnum],
10383                                               utf8_to_uvchr_buf((U8 *) scan,
10384                                                                 (U8 *) this_eol,
10385                                                                 NULL))))
10386                     {
10387                         scan += UTF8SKIP(scan);
10388                         hardcount++;
10389                     }
10390                     break;
10391
10392                     /* For the classes below, the knowledge of how to handle
10393                      * every code point is compiled in to Perl via a macro.
10394                      * This code is written for making the loops as tight as
10395                      * possible.  It could be refactored to save space instead.
10396                      * */
10397
10398                 case _CC_ENUM_SPACE:
10399                     while (hardcount < max
10400                            && scan < this_eol
10401                            && (to_complement
10402                                ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10403                     {
10404                         scan += UTF8SKIP(scan);
10405                         hardcount++;
10406                     }
10407                     break;
10408                 case _CC_ENUM_BLANK:
10409                     while (hardcount < max
10410                            && scan < this_eol
10411                            && (to_complement
10412                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10413                     {
10414                         scan += UTF8SKIP(scan);
10415                         hardcount++;
10416                     }
10417                     break;
10418                 case _CC_ENUM_XDIGIT:
10419                     while (hardcount < max
10420                            && scan < this_eol
10421                            && (to_complement
10422                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10423                     {
10424                         scan += UTF8SKIP(scan);
10425                         hardcount++;
10426                     }
10427                     break;
10428                 case _CC_ENUM_VERTSPACE:
10429                     while (hardcount < max
10430                            && scan < this_eol
10431                            && (to_complement
10432                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10433                     {
10434                         scan += UTF8SKIP(scan);
10435                         hardcount++;
10436                     }
10437                     break;
10438                 case _CC_ENUM_CNTRL:
10439                     while (hardcount < max
10440                            && scan < this_eol
10441                            && (to_complement
10442                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10443                     {
10444                         scan += UTF8SKIP(scan);
10445                         hardcount++;
10446                     }
10447                     break;
10448             }
10449         }
10450         break;
10451
10452     case LNBREAK:
10453         if (utf8_target) {
10454             while (hardcount < max && scan < this_eol &&
10455                     (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
10456                 scan += c;
10457                 hardcount++;
10458             }
10459         } else {
10460             /* LNBREAK can match one or two latin chars, which is ok, but we
10461              * have to use hardcount in this situation, and throw away the
10462              * adjustment to <this_eol> done before the switch statement */
10463             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
10464                 scan+=c;
10465                 hardcount++;
10466             }
10467         }
10468         break;
10469
10470     default:
10471         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10472         NOT_REACHED; /* NOTREACHED */
10473
10474     }
10475
10476     if (hardcount)
10477         c = hardcount;
10478     else
10479         c = scan - *startposp;
10480     *startposp = scan;
10481
10482     DEBUG_r({
10483         DECLARE_AND_GET_RE_DEBUG_FLAGS;
10484         DEBUG_EXECUTE_r({
10485             SV * const prop = sv_newmortal();
10486             regprop(prog, prop, p, reginfo, NULL);
10487             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
10488                         depth, SvPVX_const(prop),(IV)c,(IV)max);
10489         });
10490     });
10491
10492     return(c);
10493 }
10494
10495 /*
10496  - reginclass - determine if a character falls into a character class
10497
10498   n is the ANYOF-type regnode
10499   p is the target string
10500   p_end points to one byte beyond the end of the target string
10501   utf8_target tells whether p is in UTF-8.
10502
10503   Returns true if matched; false otherwise.
10504
10505   Note that this can be a synthetic start class, a combination of various
10506   nodes, so things you think might be mutually exclusive, such as locale,
10507   aren't.  It can match both locale and non-locale
10508
10509  */
10510
10511 STATIC bool
10512 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10513 {
10514     const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10515                         ? 0
10516                         : ANYOF_FLAGS(n);
10517     bool match = FALSE;
10518     UV c = *p;
10519
10520     PERL_ARGS_ASSERT_REGINCLASS;
10521
10522     /* If c is not already the code point, get it.  Note that
10523      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10524     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10525         STRLEN c_len = 0;
10526         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10527         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10528         if (c_len == (STRLEN)-1) {
10529             _force_out_malformed_utf8_message(p, p_end,
10530                                               utf8n_flags,
10531                                               1 /* 1 means die */ );
10532             NOT_REACHED; /* NOTREACHED */
10533         }
10534         if (     c > 255
10535             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10536             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10537         {
10538             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10539         }
10540     }
10541
10542     /* If this character is potentially in the bitmap, check it */
10543     if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10544         if (ANYOF_BITMAP_TEST(n, c))
10545             match = TRUE;
10546         else if ((flags
10547                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10548                   && OP(n) == ANYOFD
10549                   && ! utf8_target
10550                   && ! isASCII(c))
10551         {
10552             match = TRUE;
10553         }
10554         else if (flags & ANYOF_LOCALE_FLAGS) {
10555             if (  (flags & ANYOFL_FOLD)
10556                 && c < 256
10557                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10558             {
10559                 match = TRUE;
10560             }
10561             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
10562                      && c <= U8_MAX  /* param to isFOO_lc() */
10563             ) {
10564
10565                 /* The data structure is arranged so bits 0, 2, 4, ... are set
10566                  * if the class includes the Posix character class given by
10567                  * bit/2; and 1, 3, 5, ... are set if the class includes the
10568                  * complemented Posix class given by int(bit/2).  So we loop
10569                  * through the bits, each time changing whether we complement
10570                  * the result or not.  Suppose for the sake of illustration
10571                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
10572                  * is set, it means there is a match for this ANYOF node if the
10573                  * character is in the class given by the expression (0 / 2 = 0
10574                  * = \w).  If it is in that class, isFOO_lc() will return 1,
10575                  * and since 'to_complement' is 0, the result will stay TRUE,
10576                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
10577                  * bit 1 is 1.  That means there is a match if the character
10578                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
10579                  * but will on bit 1.  On the second iteration 'to_complement'
10580                  * will be 1, so the exclusive or will reverse things, so we
10581                  * are testing for \W.  On the third iteration, 'to_complement'
10582                  * will be 0, and we would be testing for \s; the fourth
10583                  * iteration would test for \S, etc.
10584                  *
10585                  * Note that this code assumes that all the classes are closed
10586                  * under folding.  For example, if a character matches \w, then
10587                  * its fold does too; and vice versa.  This should be true for
10588                  * any well-behaved locale for all the currently defined Posix
10589                  * classes, except for :lower: and :upper:, which are handled
10590                  * by the pseudo-class :cased: which matches if either of the
10591                  * other two does.  To get rid of this assumption, an outer
10592                  * loop could be used below to iterate over both the source
10593                  * character, and its fold (if different) */
10594
10595                 int count = 0;
10596                 int to_complement = 0;
10597
10598                 while (count < ANYOF_MAX) {
10599                     if (ANYOF_POSIXL_TEST(n, count)
10600                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
10601                     {
10602                         match = TRUE;
10603                         break;
10604                     }
10605                     count++;
10606                     to_complement ^= 1;
10607                 }
10608             }
10609         }
10610     }
10611
10612
10613     /* If the bitmap didn't (or couldn't) match, and something outside the
10614      * bitmap could match, try that. */
10615     if (!match) {
10616         if (c >= NUM_ANYOF_CODE_POINTS
10617             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10618         {
10619             match = TRUE;       /* Everything above the bitmap matches */
10620         }
10621             /* Here doesn't match everything above the bitmap.  If there is
10622              * some information available beyond the bitmap, we may find a
10623              * match in it.  If so, this is most likely because the code point
10624              * is outside the bitmap range.  But rarely, it could be because of
10625              * some other reason.  If so, various flags are set to indicate
10626              * this possibility.  On ANYOFD nodes, there may be matches that
10627              * happen only when the target string is UTF-8; or for other node
10628              * types, because runtime lookup is needed, regardless of the
10629              * UTF-8ness of the target string.  Finally, under /il, there may
10630              * be some matches only possible if the locale is a UTF-8 one. */
10631         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
10632                  && (   c >= NUM_ANYOF_CODE_POINTS
10633                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10634                          && (   UNLIKELY(OP(n) != ANYOFD)
10635                              || (utf8_target && ! isASCII_uni(c)
10636 #                               if NUM_ANYOF_CODE_POINTS > 256
10637                                                                  && c < 256
10638 #                               endif
10639                                 )))
10640                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10641                          && IN_UTF8_CTYPE_LOCALE)))
10642         {
10643             SV* only_utf8_locale = NULL;
10644             SV * const definition =
10645 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10646                 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10647                                             &only_utf8_locale, NULL);
10648 #else
10649                 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10650                                              &only_utf8_locale, NULL);
10651 #endif
10652             if (definition) {
10653                 U8 utf8_buffer[2];
10654                 U8 * utf8_p;
10655                 if (utf8_target) {
10656                     utf8_p = (U8 *) p;
10657                 } else { /* Convert to utf8 */
10658                     utf8_p = utf8_buffer;
10659                     append_utf8_from_native_byte(*p, &utf8_p);
10660                     utf8_p = utf8_buffer;
10661                 }
10662
10663                 /* Turkish locales have these hard-coded rules overriding
10664                  * normal ones */
10665                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
10666                     && isALPHA_FOLD_EQ(*p, 'i'))
10667                 {
10668                     if (*p == 'i') {
10669                         if (_invlist_contains_cp(definition,
10670                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10671                         {
10672                             match = TRUE;
10673                         }
10674                     }
10675                     else if (*p == 'I') {
10676                         if (_invlist_contains_cp(definition,
10677                                                 LATIN_SMALL_LETTER_DOTLESS_I))
10678                         {
10679                             match = TRUE;
10680                         }
10681                     }
10682                 }
10683                 else if (_invlist_contains_cp(definition, c)) {
10684                     match = TRUE;
10685                 }
10686             }
10687             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10688                 match = _invlist_contains_cp(only_utf8_locale, c);
10689             }
10690         }
10691
10692         /* In a Turkic locale under folding, hard-code the I i case pair
10693          * matches */
10694         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10695             && ! match
10696             &&   (flags & ANYOFL_FOLD)
10697             &&   utf8_target)
10698         {
10699             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10700                 if (ANYOF_BITMAP_TEST(n, 'i')) {
10701                     match = TRUE;
10702                 }
10703             }
10704             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10705                 if (ANYOF_BITMAP_TEST(n, 'I')) {
10706                     match = TRUE;
10707                 }
10708             }
10709         }
10710
10711         if (UNICODE_IS_SUPER(c)
10712             && (flags
10713                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10714             && OP(n) != ANYOFD
10715             && ckWARN_d(WARN_NON_UNICODE))
10716         {
10717             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10718                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10719         }
10720     }
10721
10722 #if ANYOF_INVERT != 1
10723     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10724      * use it, don't */
10725 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10726 #endif
10727
10728     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10729     return (flags & ANYOF_INVERT) ^ match;
10730 }
10731
10732 STATIC U8 *
10733 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10734 {
10735     /* return the position 'off' UTF-8 characters away from 's', forward if
10736      * 'off' >= 0, backwards if negative.  But don't go outside of position
10737      * 'lim', which better be < s  if off < 0 */
10738
10739     PERL_ARGS_ASSERT_REGHOP3;
10740
10741     if (off >= 0) {
10742         while (off-- && s < lim) {
10743             /* XXX could check well-formedness here */
10744             U8 *new_s = s + UTF8SKIP(s);
10745             if (new_s > lim) /* lim may be in the middle of a long character */
10746                 return s;
10747             s = new_s;
10748         }
10749     }
10750     else {
10751         while (off++ && s > lim) {
10752             s--;
10753             if (UTF8_IS_CONTINUED(*s)) {
10754                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10755                     s--;
10756                 if (! UTF8_IS_START(*s)) {
10757                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10758                 }
10759             }
10760             /* XXX could check well-formedness here */
10761         }
10762     }
10763     return s;
10764 }
10765
10766 STATIC U8 *
10767 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10768 {
10769     PERL_ARGS_ASSERT_REGHOP4;
10770
10771     if (off >= 0) {
10772         while (off-- && s < rlim) {
10773             /* XXX could check well-formedness here */
10774             s += UTF8SKIP(s);
10775         }
10776     }
10777     else {
10778         while (off++ && s > llim) {
10779             s--;
10780             if (UTF8_IS_CONTINUED(*s)) {
10781                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10782                     s--;
10783                 if (! UTF8_IS_START(*s)) {
10784                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10785                 }
10786             }
10787             /* XXX could check well-formedness here */
10788         }
10789     }
10790     return s;
10791 }
10792
10793 /* like reghop3, but returns NULL on overrun, rather than returning last
10794  * char pos */
10795
10796 STATIC U8 *
10797 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10798 {
10799     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10800
10801     if (off >= 0) {
10802         while (off-- && s < lim) {
10803             /* XXX could check well-formedness here */
10804             s += UTF8SKIP(s);
10805         }
10806         if (off >= 0)
10807             return NULL;
10808     }
10809     else {
10810         while (off++ && s > lim) {
10811             s--;
10812             if (UTF8_IS_CONTINUED(*s)) {
10813                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10814                     s--;
10815                 if (! UTF8_IS_START(*s)) {
10816                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10817                 }
10818             }
10819             /* XXX could check well-formedness here */
10820         }
10821         if (off <= 0)
10822             return NULL;
10823     }
10824     return s;
10825 }
10826
10827
10828 /* when executing a regex that may have (?{}), extra stuff needs setting
10829    up that will be visible to the called code, even before the current
10830    match has finished. In particular:
10831
10832    * $_ is localised to the SV currently being matched;
10833    * pos($_) is created if necessary, ready to be updated on each call-out
10834      to code;
10835    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10836      isn't set until the current pattern is successfully finished), so that
10837      $1 etc of the match-so-far can be seen;
10838    * save the old values of subbeg etc of the current regex, and  set then
10839      to the current string (again, this is normally only done at the end
10840      of execution)
10841 */
10842
10843 static void
10844 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10845 {
10846     MAGIC *mg;
10847     regexp *const rex = ReANY(reginfo->prog);
10848     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10849
10850     eval_state->rex = rex;
10851     eval_state->sv  = reginfo->sv;
10852
10853     if (reginfo->sv) {
10854         /* Make $_ available to executed code. */
10855         if (reginfo->sv != DEFSV) {
10856             SAVE_DEFSV;
10857             DEFSV_set(reginfo->sv);
10858         }
10859         /* will be dec'd by S_cleanup_regmatch_info_aux */
10860         SvREFCNT_inc_NN(reginfo->sv);
10861
10862         if (!(mg = mg_find_mglob(reginfo->sv))) {
10863             /* prepare for quick setting of pos */
10864             mg = sv_magicext_mglob(reginfo->sv);
10865             mg->mg_len = -1;
10866         }
10867         eval_state->pos_magic = mg;
10868         eval_state->pos       = mg->mg_len;
10869         eval_state->pos_flags = mg->mg_flags;
10870     }
10871     else
10872         eval_state->pos_magic = NULL;
10873
10874     if (!PL_reg_curpm) {
10875         /* PL_reg_curpm is a fake PMOP that we can attach the current
10876          * regex to and point PL_curpm at, so that $1 et al are visible
10877          * within a /(?{})/. It's just allocated once per interpreter the
10878          * first time its needed */
10879         Newxz(PL_reg_curpm, 1, PMOP);
10880 #ifdef USE_ITHREADS
10881         {
10882             SV* const repointer = &PL_sv_undef;
10883             /* this regexp is also owned by the new PL_reg_curpm, which
10884                will try to free it.  */
10885             av_push(PL_regex_padav, repointer);
10886             PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
10887             PL_regex_pad = AvARRAY(PL_regex_padav);
10888         }
10889 #endif
10890     }
10891     SET_reg_curpm(reginfo->prog);
10892     eval_state->curpm = PL_curpm;
10893     PL_curpm_under = PL_curpm;
10894     PL_curpm = PL_reg_curpm;
10895     if (RXp_MATCH_COPIED(rex)) {
10896         /*  Here is a serious problem: we cannot rewrite subbeg,
10897             since it may be needed if this match fails.  Thus
10898             $` inside (?{}) could fail... */
10899         eval_state->subbeg     = rex->subbeg;
10900         eval_state->sublen     = rex->sublen;
10901         eval_state->suboffset  = rex->suboffset;
10902         eval_state->subcoffset = rex->subcoffset;
10903 #ifdef PERL_ANY_COW
10904         eval_state->saved_copy = rex->saved_copy;
10905 #endif
10906         RXp_MATCH_COPIED_off(rex);
10907     }
10908     else
10909         eval_state->subbeg = NULL;
10910     rex->subbeg = (char *)reginfo->strbeg;
10911     rex->suboffset = 0;
10912     rex->subcoffset = 0;
10913     rex->sublen = reginfo->strend - reginfo->strbeg;
10914 }
10915
10916
10917 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10918
10919 static void
10920 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10921 {
10922     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10923     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10924     regmatch_slab *s;
10925
10926     Safefree(aux->poscache);
10927
10928     if (eval_state) {
10929
10930         /* undo the effects of S_setup_eval_state() */
10931
10932         if (eval_state->subbeg) {
10933             regexp * const rex = eval_state->rex;
10934             rex->subbeg     = eval_state->subbeg;
10935             rex->sublen     = eval_state->sublen;
10936             rex->suboffset  = eval_state->suboffset;
10937             rex->subcoffset = eval_state->subcoffset;
10938 #ifdef PERL_ANY_COW
10939             rex->saved_copy = eval_state->saved_copy;
10940 #endif
10941             RXp_MATCH_COPIED_on(rex);
10942         }
10943         if (eval_state->pos_magic)
10944         {
10945             eval_state->pos_magic->mg_len = eval_state->pos;
10946             eval_state->pos_magic->mg_flags =
10947                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10948                | (eval_state->pos_flags & MGf_BYTES);
10949         }
10950
10951         PL_curpm = eval_state->curpm;
10952         SvREFCNT_dec(eval_state->sv);
10953     }
10954
10955     PL_regmatch_state = aux->old_regmatch_state;
10956     PL_regmatch_slab  = aux->old_regmatch_slab;
10957
10958     /* free all slabs above current one - this must be the last action
10959      * of this function, as aux and eval_state are allocated within
10960      * slabs and may be freed here */
10961
10962     s = PL_regmatch_slab->next;
10963     if (s) {
10964         PL_regmatch_slab->next = NULL;
10965         while (s) {
10966             regmatch_slab * const osl = s;
10967             s = s->next;
10968             Safefree(osl);
10969         }
10970     }
10971 }
10972
10973
10974 STATIC void
10975 S_to_utf8_substr(pTHX_ regexp *prog)
10976 {
10977     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10978      * on the converted value */
10979
10980     int i = 1;
10981
10982     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10983
10984     do {
10985         if (prog->substrs->data[i].substr
10986             && !prog->substrs->data[i].utf8_substr) {
10987             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10988             prog->substrs->data[i].utf8_substr = sv;
10989             sv_utf8_upgrade(sv);
10990             if (SvVALID(prog->substrs->data[i].substr)) {
10991                 if (SvTAIL(prog->substrs->data[i].substr)) {
10992                     /* Trim the trailing \n that fbm_compile added last
10993                        time.  */
10994                     SvCUR_set(sv, SvCUR(sv) - 1);
10995                     /* Whilst this makes the SV technically "invalid" (as its
10996                        buffer is no longer followed by "\0") when fbm_compile()
10997                        adds the "\n" back, a "\0" is restored.  */
10998                     fbm_compile(sv, FBMcf_TAIL);
10999                 } else
11000                     fbm_compile(sv, 0);
11001             }
11002             if (prog->substrs->data[i].substr == prog->check_substr)
11003                 prog->check_utf8 = sv;
11004         }
11005     } while (i--);
11006 }
11007
11008 STATIC bool
11009 S_to_byte_substr(pTHX_ regexp *prog)
11010 {
11011     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11012      * on the converted value; returns FALSE if can't be converted. */
11013
11014     int i = 1;
11015
11016     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11017
11018     do {
11019         if (prog->substrs->data[i].utf8_substr
11020             && !prog->substrs->data[i].substr) {
11021             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11022             if (! sv_utf8_downgrade(sv, TRUE)) {
11023                 SvREFCNT_dec_NN(sv);
11024                 return FALSE;
11025             }
11026             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11027                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11028                     /* Trim the trailing \n that fbm_compile added last
11029                         time.  */
11030                     SvCUR_set(sv, SvCUR(sv) - 1);
11031                     fbm_compile(sv, FBMcf_TAIL);
11032                 } else
11033                     fbm_compile(sv, 0);
11034             }
11035             prog->substrs->data[i].substr = sv;
11036             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11037                 prog->check_substr = sv;
11038         }
11039     } while (i--);
11040
11041     return TRUE;
11042 }
11043
11044 #ifndef PERL_IN_XSUB_RE
11045
11046 bool
11047 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11048 {
11049     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
11050      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
11051      * the larger string bounded by 'strbeg' and 'strend'.
11052      *
11053      * 'cp' needs to be assigned (if not, a future version of the Unicode
11054      * Standard could make it something that combines with adjacent characters,
11055      * so code using it would then break), and there has to be a GCB break
11056      * before and after the character. */
11057
11058
11059     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11060     const U8 * prev_cp_start;
11061
11062     PERL_ARGS_ASSERT_IS_GRAPHEME;
11063
11064     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
11065         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11066     {
11067         /* These are considered graphemes */
11068         return TRUE;
11069     }
11070
11071     /* Otherwise, unassigned code points are forbidden */
11072     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11073                                     _invlist_search(PL_Assigned_invlist, cp))))
11074     {
11075         return FALSE;
11076     }
11077
11078     cp_gcb_val = getGCB_VAL_CP(cp);
11079
11080     /* Find the GCB value of the previous code point in the input */
11081     prev_cp_start = utf8_hop_back(s, -1, strbeg);
11082     if (UNLIKELY(prev_cp_start == s)) {
11083         prev_cp_gcb_val = GCB_EDGE;
11084     }
11085     else {
11086         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11087     }
11088
11089     /* And check that is a grapheme boundary */
11090     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11091                 TRUE /* is UTF-8 encoded */ ))
11092     {
11093         return FALSE;
11094     }
11095
11096     /* Similarly verify there is a break between the current character and the
11097      * following one */
11098     s += UTF8SKIP(s);
11099     if (s >= strend) {
11100         next_cp_gcb_val = GCB_EDGE;
11101     }
11102     else {
11103         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11104     }
11105
11106     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11107 }
11108
11109 /*
11110 =for apidoc_section $unicode
11111
11112 =for apidoc isSCRIPT_RUN
11113
11114 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11115 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
11116 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
11117 two degenerate cases given below, this function returns TRUE iff all code
11118 points in it come from any combination of three "scripts" given by the Unicode
11119 "Script Extensions" property: Common, Inherited, and possibly one other.
11120 Additionally all decimal digits must come from the same consecutive sequence of
11121 10.
11122
11123 For example, if all the characters in the sequence are Greek, or Common, or
11124 Inherited, this function will return TRUE, provided any decimal digits in it
11125 are from the same block of digits in Common.  (These are the ASCII digits
11126 "0".."9" and additionally a block for full width forms of these, and several
11127 others used in mathematical notation.)   For scripts (unlike Greek) that have
11128 their own digits defined this will accept either digits from that set or from
11129 one of the Common digit sets, but not a combination of the two.  Some scripts,
11130 such as Arabic, have more than one set of digits.  All digits must come from
11131 the same set for this function to return TRUE.
11132
11133 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11134 contain the script found, using the C<SCX_enum> typedef.  Its value will be
11135 C<SCX_INVALID> if the function returns FALSE.
11136
11137 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11138 will be C<SCX_INVALID>.
11139
11140 If the sequence contains a single code point which is unassigned to a character
11141 in the version of Unicode being used, the function will return TRUE, and the
11142 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
11143 in the input sequence will result in the function treating the input as not
11144 being a script run.
11145
11146 The returned script will be C<SCX_Inherited> iff all the code points in it are
11147 from the Inherited script.
11148
11149 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11150 it are from the Inherited or Common scripts.
11151
11152 =cut
11153
11154 */
11155
11156 bool
11157 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11158 {
11159     /* Basically, it looks at each character in the sequence to see if the
11160      * above conditions are met; if not it fails.  It uses an inversion map to
11161      * find the enum corresponding to the script of each character.  But this
11162      * is complicated by the fact that a few code points can be in any of
11163      * several scripts.  The data has been constructed so that there are
11164      * additional enum values (all negative) for these situations.  The
11165      * absolute value of those is an index into another table which contains
11166      * pointers to auxiliary tables for each such situation.  Each aux array
11167      * lists all the scripts for the given situation.  There is another,
11168      * parallel, table that gives the number of entries in each aux table.
11169      * These are all defined in charclass_invlists.h */
11170
11171     /* XXX Here are the additional things UTS 39 says could be done:
11172      *
11173      * Forbid sequences of the same nonspacing mark
11174      *
11175      * Check to see that all the characters are in the sets of exemplar
11176      * characters for at least one language in the Unicode Common Locale Data
11177      * Repository [CLDR]. */
11178
11179
11180     /* Things that match /\d/u */
11181     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
11182     UV * decimals_array = invlist_array(decimals_invlist);
11183
11184     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11185      * not currently known) */
11186     UV zero_of_run = 0;
11187
11188     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
11189     SCX_enum script_of_char = SCX_INVALID;
11190
11191     /* If the script remains not fully determined from iteration to iteration,
11192      * this is the current intersection of the possiblities.  */
11193     SCX_enum * intersection = NULL;
11194     PERL_UINT_FAST8_T intersection_len = 0;
11195
11196     bool retval = TRUE;
11197     SCX_enum * ret_script = NULL;
11198
11199     assert(send >= s);
11200
11201     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11202
11203     /* All code points in 0..255 are either Common or Latin, so must be a
11204      * script run.  We can return immediately unless we need to know which
11205      * script it is. */
11206     if (! utf8_target && LIKELY(send > s)) {
11207         if (ret_script == NULL) {
11208             return TRUE;
11209         }
11210
11211         /* If any character is Latin, the run is Latin */
11212         while (s < send) {
11213             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11214                 *ret_script = SCX_Latin;
11215                 return TRUE;
11216             }
11217         }
11218
11219         /* Here, all are Common */
11220         *ret_script = SCX_Common;
11221         return TRUE;
11222     }
11223
11224     /* Look at each character in the sequence */
11225     while (s < send) {
11226         /* If the current character being examined is a digit, this is the code
11227          * point of the zero for its sequence of 10 */
11228         UV zero_of_char;
11229
11230         UV cp;
11231
11232         /* The code allows all scripts to use the ASCII digits.  This is
11233          * because they are in the Common script.  Hence any ASCII ones found
11234          * are ok, unless and until a digit from another set has already been
11235          * encountered.  digit ranges in Common are not similarly blessed) */
11236         if (UNLIKELY(isDIGIT(*s))) {
11237             if (UNLIKELY(script_of_run == SCX_Unknown)) {
11238                 retval = FALSE;
11239                 break;
11240             }
11241             if (zero_of_run) {
11242                 if (zero_of_run != '0') {
11243                     retval = FALSE;
11244                     break;
11245                 }
11246             }
11247             else {
11248                 zero_of_run = '0';
11249             }
11250             s++;
11251             continue;
11252         }
11253
11254         /* Here, isn't an ASCII digit.  Find the code point of the character */
11255         if (! UTF8_IS_INVARIANT(*s)) {
11256             Size_t len;
11257             cp = valid_utf8_to_uvchr((U8 *) s, &len);
11258             s += len;
11259         }
11260         else {
11261             cp = *(s++);
11262         }
11263
11264         /* If is within the range [+0 .. +9] of the script's zero, it also is a
11265          * digit in that script.  We can skip the rest of this code for this
11266          * character. */
11267         if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11268             continue;
11269         }
11270
11271         /* Find the character's script.  The correct values are hard-coded here
11272          * for small-enough code points. */
11273         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
11274                                unlikely to change */
11275             if (       cp > 255
11276                 || (   isALPHA_L1(cp)
11277                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
11278             {
11279                 script_of_char = SCX_Latin;
11280             }
11281             else {
11282                 script_of_char = SCX_Common;
11283             }
11284         }
11285         else {
11286             script_of_char = _Perl_SCX_invmap[
11287                                        _invlist_search(PL_SCX_invlist, cp)];
11288         }
11289
11290         /* We arbitrarily accept a single unassigned character, but not in
11291          * combination with anything else, and not a run of them. */
11292         if (   UNLIKELY(script_of_run == SCX_Unknown)
11293             || UNLIKELY(   script_of_run != SCX_INVALID
11294                         && script_of_char == SCX_Unknown))
11295         {
11296             retval = FALSE;
11297             break;
11298         }
11299
11300         /* For the first character, or the run is inherited, the run's script
11301          * is set to the char's */
11302         if (   UNLIKELY(script_of_run == SCX_INVALID)
11303             || UNLIKELY(script_of_run == SCX_Inherited))
11304         {
11305             script_of_run = script_of_char;
11306         }
11307
11308         /* For the character's script to be Unknown, it must be the first
11309          * character in the sequence (for otherwise a test above would have
11310          * prevented us from reaching here), and we have set the run's script
11311          * to it.  Nothing further to be done for this character */
11312         if (UNLIKELY(script_of_char == SCX_Unknown)) {
11313             continue;
11314         }
11315
11316         /* We accept 'inherited' script characters currently even at the
11317          * beginning.  (We know that no characters in Inherited are digits, or
11318          * we'd have to check for that) */
11319         if (UNLIKELY(script_of_char == SCX_Inherited)) {
11320             continue;
11321         }
11322
11323         /* If the run so far is Common, and the new character isn't, change the
11324          * run's script to that of this character */
11325         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11326             script_of_run = script_of_char;
11327         }
11328
11329         /* Now we can see if the script of the new character is the same as
11330          * that of the run */
11331         if (LIKELY(script_of_char == script_of_run)) {
11332             /* By far the most common case */
11333             goto scripts_match;
11334         }
11335
11336         /* Here, the script of the run isn't Common.  But characters in Common
11337          * match any script */
11338         if (script_of_char == SCX_Common) {
11339             goto scripts_match;
11340         }
11341
11342 #ifndef HAS_SCX_AUX_TABLES
11343
11344         /* Too early a Unicode version to have a code point belonging to more
11345          * than one script, so, if the scripts don't exactly match, fail */
11346         PERL_UNUSED_VAR(intersection_len);
11347         retval = FALSE;
11348         break;
11349
11350 #else
11351
11352         /* Here there is no exact match between the character's script and the
11353          * run's.  And we've handled the special cases of scripts Unknown,
11354          * Inherited, and Common.
11355          *
11356          * Negative script numbers signify that the value may be any of several
11357          * scripts, and we need to look at auxiliary information to make our
11358          * deterimination.  But if both are non-negative, we can fail now */
11359         if (LIKELY(script_of_char >= 0)) {
11360             const SCX_enum * search_in;
11361             PERL_UINT_FAST8_T search_in_len;
11362             PERL_UINT_FAST8_T i;
11363
11364             if (LIKELY(script_of_run >= 0)) {
11365                 retval = FALSE;
11366                 break;
11367             }
11368
11369             /* Use the previously constructed set of possible scripts, if any.
11370              * */
11371             if (intersection) {
11372                 search_in = intersection;
11373                 search_in_len = intersection_len;
11374             }
11375             else {
11376                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11377                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11378             }
11379
11380             for (i = 0; i < search_in_len; i++) {
11381                 if (search_in[i] == script_of_char) {
11382                     script_of_run = script_of_char;
11383                     goto scripts_match;
11384                 }
11385             }
11386
11387             retval = FALSE;
11388             break;
11389         }
11390         else if (LIKELY(script_of_run >= 0)) {
11391             /* script of character could be one of several, but run is a single
11392              * script */
11393             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11394             const PERL_UINT_FAST8_T search_in_len
11395                                      = SCX_AUX_TABLE_lengths[-script_of_char];
11396             PERL_UINT_FAST8_T i;
11397
11398             for (i = 0; i < search_in_len; i++) {
11399                 if (search_in[i] == script_of_run) {
11400                     script_of_char = script_of_run;
11401                     goto scripts_match;
11402                 }
11403             }
11404
11405             retval = FALSE;
11406             break;
11407         }
11408         else {
11409             /* Both run and char could be in one of several scripts.  If the
11410              * intersection is empty, then this character isn't in this script
11411              * run.  Otherwise, we need to calculate the intersection to use
11412              * for future iterations of the loop, unless we are already at the
11413              * final character */
11414             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11415             const PERL_UINT_FAST8_T char_len
11416                                       = SCX_AUX_TABLE_lengths[-script_of_char];
11417             const SCX_enum * search_run;
11418             PERL_UINT_FAST8_T run_len;
11419
11420             SCX_enum * new_overlap = NULL;
11421             PERL_UINT_FAST8_T i, j;
11422
11423             if (intersection) {
11424                 search_run = intersection;
11425                 run_len = intersection_len;
11426             }
11427             else {
11428                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11429                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11430             }
11431
11432             intersection_len = 0;
11433
11434             for (i = 0; i < run_len; i++) {
11435                 for (j = 0; j < char_len; j++) {
11436                     if (search_run[i] == search_char[j]) {
11437
11438                         /* Here, the script at i,j matches.  That means this
11439                          * character is in the run.  But continue on to find
11440                          * the complete intersection, for the next loop
11441                          * iteration, and for the digit check after it.
11442                          *
11443                          * On the first found common script, we malloc space
11444                          * for the intersection list for the worst case of the
11445                          * intersection, which is the minimum of the number of
11446                          * scripts remaining in each set. */
11447                         if (intersection_len == 0) {
11448                             Newx(new_overlap,
11449                                  MIN(run_len - i, char_len - j),
11450                                  SCX_enum);
11451                         }
11452                         new_overlap[intersection_len++] = search_run[i];
11453                     }
11454                 }
11455             }
11456
11457             /* Here we've looked through everything.  If they have no scripts
11458              * in common, not a run */
11459             if (intersection_len == 0) {
11460                 retval = FALSE;
11461                 break;
11462             }
11463
11464             /* If there is only a single script in common, set to that.
11465              * Otherwise, use the intersection going forward */
11466             Safefree(intersection);
11467             intersection = NULL;
11468             if (intersection_len == 1) {
11469                 script_of_run = script_of_char = new_overlap[0];
11470                 Safefree(new_overlap);
11471                 new_overlap = NULL;
11472             }
11473             else {
11474                 intersection = new_overlap;
11475             }
11476         }
11477
11478 #endif
11479
11480   scripts_match:
11481
11482         /* Here, the script of the character is compatible with that of the
11483          * run.  That means that in most cases, it continues the script run.
11484          * Either it and the run match exactly, or one or both can be in any of
11485          * several scripts, and the intersection is not empty.  However, if the
11486          * character is a decimal digit, it could still mean failure if it is
11487          * from the wrong sequence of 10.  So, we need to look at if it's a
11488          * digit.  We've already handled the 10 digits [0-9], and the next
11489          * lowest one is this one: */
11490         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11491             continue;   /* Not a digit; this character is part of the run */
11492         }
11493
11494         /* If we have a definitive '0' for the script of this character, we
11495          * know that for this to be a digit, it must be in the range of +0..+9
11496          * of that zero. */
11497         if (   script_of_char >= 0
11498             && (zero_of_char = script_zeros[script_of_char]))
11499         {
11500             if (! withinCOUNT(cp, zero_of_char, 9)) {
11501                 continue;   /* Not a digit; this character is part of the run
11502                              */
11503             }
11504
11505         }
11506         else {  /* Need to look up if this character is a digit or not */
11507             SSize_t index_of_zero_of_char;
11508             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11509             if (     UNLIKELY(index_of_zero_of_char < 0)
11510                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11511             {
11512                 continue;   /* Not a digit; this character is part of the run.
11513                              */
11514             }
11515
11516             zero_of_char = decimals_array[index_of_zero_of_char];
11517         }
11518
11519         /* Here, the character is a decimal digit, and the zero of its sequence
11520          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
11521          * they better be the same. */
11522         if (zero_of_run) {
11523             if (zero_of_run != zero_of_char) {
11524                 retval = FALSE;
11525                 break;
11526             }
11527         }
11528         else {  /* Otherwise we now have a zero for this run */
11529             zero_of_run = zero_of_char;
11530         }
11531     } /* end of looping through CLOSESR text */
11532
11533     Safefree(intersection);
11534
11535     if (ret_script != NULL) {
11536         if (retval) {
11537             *ret_script = script_of_run;
11538         }
11539         else {
11540             *ret_script = SCX_INVALID;
11541         }
11542     }
11543
11544     return retval;
11545 }
11546
11547 #endif /* ifndef PERL_IN_XSUB_RE */
11548
11549 /*
11550  * ex: set ts=8 sts=4 sw=4 et:
11551  */