regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 #ifndef MIN
 106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 107 #endif
 108
 109 /* this is a chain of data about sub patterns we are processing that
 110    need to be handled separately/specially in study_chunk. Its so
 111    we can simulate recursion without losing state.  */
 112 struct scan_frame;
 113 typedef struct scan_frame {
 114     regnode *last_regnode;      /* last node to process in this frame */
 115     regnode *next_regnode;      /* next node to process when last is reached */
 116     U32 prev_recursed_depth;
 117     I32 stopparen;              /* what stopparen do we use */
 118     U32 is_top_frame;           /* what flags do we use? */
 119
 120     struct scan_frame *this_prev_frame; /* this previous frame */
 121     struct scan_frame *prev_frame;      /* previous frame */
 122     struct scan_frame *next_frame;      /* next frame */
 123 } scan_frame;
 124
 125 struct RExC_state_t {
 126     U32         flags;                  /* RXf_* are we folding, multilining? */
 127     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 128     char        *precomp;               /* uncompiled string. */
 129     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 130     regexp      *rx;                    /* perl core regexp structure */
 131     regexp_internal     *rxi;           /* internal data for regexp object
 132                                            pprivate field */
 133     char        *start;                 /* Start of input for compile */
 134     char        *end;                   /* End of input for compile */
 135     char        *parse;                 /* Input-scan pointer. */
 136     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 137     regnode     *emit_start;            /* Start of emitted-code area */
 138     regnode     *emit_bound;            /* First regnode outside of the
 139                                            allocated space */
 140     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 141                                            implies compiling, so don't emit */
 142     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 143                                            large enough for the largest
 144                                            non-EXACTish node, so can use it as
 145                                            scratch in pass1 */
 146     I32         naughty;                /* How bad is this pattern? */
 147     I32         sawback;                /* Did we see \1, ...? */
 148     U32         seen;
 149     SSize_t     size;                   /* Code size. */
 150     I32                npar;            /* Capture buffer count, (OPEN) plus
 151                                            one. ("par" 0 is the whole
 152                                            pattern)*/
 153     I32         nestroot;               /* root parens we are in - used by
 154                                            accept */
 155     I32         extralen;
 156     I32         seen_zerolen;
 157     regnode     **open_parens;          /* pointers to open parens */
 158     regnode     **close_parens;         /* pointers to close parens */
 159     regnode     *opend;                 /* END node in program */
 160     I32         utf8;           /* whether the pattern is utf8 or not */
 161     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 162                                 /* XXX use this for future optimisation of case
 163                                  * where pattern must be upgraded to utf8. */
 164     I32         uni_semantics;  /* If a d charset modifier should use unicode
 165                                    rules, even if the pattern is not in
 166                                    utf8 */
 167     HV          *paren_names;           /* Paren names */
 168
 169     regnode     **recurse;              /* Recurse regops */
 170     I32         recurse_count;          /* Number of recurse regops */
 171     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 172                                            through */
 173     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 174     I32         in_lookbehind;
 175     I32         contains_locale;
 176     I32         contains_i;
 177     I32         override_recoding;
 178     I32         in_multi_char_class;
 179     struct reg_code_block *code_blocks; /* positions of literal (?{})
 180                                             within pattern */
 181     int         num_code_blocks;        /* size of code_blocks[] */
 182     int         code_index;             /* next code_blocks[] slot */
 183     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 184     scan_frame *frame_head;
 185     scan_frame *frame_last;
 186     U32         frame_count;
 187 #ifdef ADD_TO_REGEXEC
 188     char        *starttry;              /* -Dr: where regtry was called. */
 189 #define RExC_starttry   (pRExC_state->starttry)
 190 #endif
 191     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 192 #ifdef DEBUGGING
 193     const char  *lastparse;
 194     I32         lastnum;
 195     AV          *paren_name_list;       /* idx -> name */
 196     U32         study_chunk_recursed_count;
 197     SV          *mysv1;
 198     SV          *mysv2;
 199 #define RExC_lastparse  (pRExC_state->lastparse)
 200 #define RExC_lastnum    (pRExC_state->lastnum)
 201 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 202 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 203 #define RExC_mysv       (pRExC_state->mysv1)
 204 #define RExC_mysv1      (pRExC_state->mysv1)
 205 #define RExC_mysv2      (pRExC_state->mysv2)
 206
 207 #endif
 208 };
 209
 210 #define RExC_flags      (pRExC_state->flags)
 211 #define RExC_pm_flags   (pRExC_state->pm_flags)
 212 #define RExC_precomp    (pRExC_state->precomp)
 213 #define RExC_rx_sv      (pRExC_state->rx_sv)
 214 #define RExC_rx         (pRExC_state->rx)
 215 #define RExC_rxi        (pRExC_state->rxi)
 216 #define RExC_start      (pRExC_state->start)
 217 #define RExC_end        (pRExC_state->end)
 218 #define RExC_parse      (pRExC_state->parse)
 219 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 220 #ifdef RE_TRACK_PATTERN_OFFSETS
 221 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 222                                                          others */
 223 #endif
 224 #define RExC_emit       (pRExC_state->emit)
 225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 226 #define RExC_emit_start (pRExC_state->emit_start)
 227 #define RExC_emit_bound (pRExC_state->emit_bound)
 228 #define RExC_naughty    (pRExC_state->naughty)
 229 #define RExC_sawback    (pRExC_state->sawback)
 230 #define RExC_seen       (pRExC_state->seen)
 231 #define RExC_size       (pRExC_state->size)
 232 #define RExC_maxlen        (pRExC_state->maxlen)
 233 #define RExC_npar       (pRExC_state->npar)
 234 #define RExC_nestroot   (pRExC_state->nestroot)
 235 #define RExC_extralen   (pRExC_state->extralen)
 236 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 237 #define RExC_utf8       (pRExC_state->utf8)
 238 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 239 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 240 #define RExC_open_parens        (pRExC_state->open_parens)
 241 #define RExC_close_parens       (pRExC_state->close_parens)
 242 #define RExC_opend      (pRExC_state->opend)
 243 #define RExC_paren_names        (pRExC_state->paren_names)
 244 #define RExC_recurse    (pRExC_state->recurse)
 245 #define RExC_recurse_count      (pRExC_state->recurse_count)
 246 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 247 #define RExC_study_chunk_recursed_bytes  \
 248                                    (pRExC_state->study_chunk_recursed_bytes)
 249 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 250 #define RExC_contains_locale    (pRExC_state->contains_locale)
 251 #define RExC_contains_i (pRExC_state->contains_i)
 252 #define RExC_override_recoding (pRExC_state->override_recoding)
 253 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 254 #define RExC_frame_head (pRExC_state->frame_head)
 255 #define RExC_frame_last (pRExC_state->frame_last)
 256 #define RExC_frame_count (pRExC_state->frame_count)
 257
 258
 259 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 260 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 261         ((*s) == '{' && regcurly(s)))
 262
 263 /*
 264  * Flags to be passed up and down.
 265  */
 266 #define WORST           0       /* Worst case. */
 267 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 268
 269 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 270  * character.  (There needs to be a case: in the switch statement in regexec.c
 271  * for any node marked SIMPLE.)  Note that this is not the same thing as
 272  * REGNODE_SIMPLE */
 273 #define SIMPLE          0x02
 274 #define SPSTART         0x04    /* Starts with * or + */
 275 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 276 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 277 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 278
 279 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 280
 281 /* whether trie related optimizations are enabled */
 282 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 283 #define TRIE_STUDY_OPT
 284 #define FULL_TRIE_STUDY
 285 #define TRIE_STCLASS
 286 #endif
 287
 288
 289
 290 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 291 #define PBITVAL(paren) (1 << ((paren) & 7))
 292 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 293 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 294 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 295
 296 #define REQUIRE_UTF8    STMT_START {                                       \
 297                                      if (!UTF) {                           \
 298                                          *flagp = RESTART_UTF8;            \
 299                                          return NULL;                      \
 300                                      }                                     \
 301                         } STMT_END
 302
 303 /* This converts the named class defined in regcomp.h to its equivalent class
 304  * number defined in handy.h. */
 305 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 306 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 307
 308 #define _invlist_union_complement_2nd(a, b, output) \
 309                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 310 #define _invlist_intersection_complement_2nd(a, b, output) \
 311                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 312
 313 /* About scan_data_t.
 314
 315   During optimisation we recurse through the regexp program performing
 316   various inplace (keyhole style) optimisations. In addition study_chunk
 317   and scan_commit populate this data structure with information about
 318   what strings MUST appear in the pattern. We look for the longest
 319   string that must appear at a fixed location, and we look for the
 320   longest string that may appear at a floating location. So for instance
 321   in the pattern:
 322
 323     /FOO[xX]A.*B[xX]BAR/
 324
 325   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 326   strings (because they follow a .* construct). study_chunk will identify
 327   both FOO and BAR as being the longest fixed and floating strings respectively.
 328
 329   The strings can be composites, for instance
 330
 331      /(f)(o)(o)/
 332
 333   will result in a composite fixed substring 'foo'.
 334
 335   For each string some basic information is maintained:
 336
 337   - offset or min_offset
 338     This is the position the string must appear at, or not before.
 339     It also implicitly (when combined with minlenp) tells us how many
 340     characters must match before the string we are searching for.
 341     Likewise when combined with minlenp and the length of the string it
 342     tells us how many characters must appear after the string we have
 343     found.
 344
 345   - max_offset
 346     Only used for floating strings. This is the rightmost point that
 347     the string can appear at. If set to SSize_t_MAX it indicates that the
 348     string can occur infinitely far to the right.
 349
 350   - minlenp
 351     A pointer to the minimum number of characters of the pattern that the
 352     string was found inside. This is important as in the case of positive
 353     lookahead or positive lookbehind we can have multiple patterns
 354     involved. Consider
 355
 356     /(?=FOO).*F/
 357
 358     The minimum length of the pattern overall is 3, the minimum length
 359     of the lookahead part is 3, but the minimum length of the part that
 360     will actually match is 1. So 'FOO's minimum length is 3, but the
 361     minimum length for the F is 1. This is important as the minimum length
 362     is used to determine offsets in front of and behind the string being
 363     looked for.  Since strings can be composites this is the length of the
 364     pattern at the time it was committed with a scan_commit. Note that
 365     the length is calculated by study_chunk, so that the minimum lengths
 366     are not known until the full pattern has been compiled, thus the
 367     pointer to the value.
 368
 369   - lookbehind
 370
 371     In the case of lookbehind the string being searched for can be
 372     offset past the start point of the final matching string.
 373     If this value was just blithely removed from the min_offset it would
 374     invalidate some of the calculations for how many chars must match
 375     before or after (as they are derived from min_offset and minlen and
 376     the length of the string being searched for).
 377     When the final pattern is compiled and the data is moved from the
 378     scan_data_t structure into the regexp structure the information
 379     about lookbehind is factored in, with the information that would
 380     have been lost precalculated in the end_shift field for the
 381     associated string.
 382
 383   The fields pos_min and pos_delta are used to store the minimum offset
 384   and the delta to the maximum offset at the current point in the pattern.
 385
 386 */
 387
 388 typedef struct scan_data_t {
 389     /*I32 len_min;      unused */
 390     /*I32 len_delta;    unused */
 391     SSize_t pos_min;
 392     SSize_t pos_delta;
 393     SV *last_found;
 394     SSize_t last_end;       /* min value, <0 unless valid. */
 395     SSize_t last_start_min;
 396     SSize_t last_start_max;
 397     SV **longest;           /* Either &l_fixed, or &l_float. */
 398     SV *longest_fixed;      /* longest fixed string found in pattern */
 399     SSize_t offset_fixed;   /* offset where it starts */
 400     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 401     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 402     SV *longest_float;      /* longest floating string found in pattern */
 403     SSize_t offset_float_min; /* earliest point in string it can appear */
 404     SSize_t offset_float_max; /* latest point in string it can appear */
 405     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 406     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 407     I32 flags;
 408     I32 whilem_c;
 409     SSize_t *last_closep;
 410     regnode_ssc *start_class;
 411 } scan_data_t;
 412
 413 /*
 414  * Forward declarations for pregcomp()'s friends.
 415  */
 416
 417 static const scan_data_t zero_scan_data =
 418   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 419
 420 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 421 #define SF_BEFORE_SEOL          0x0001
 422 #define SF_BEFORE_MEOL          0x0002
 423 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 424 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 425
 426 #define SF_FIX_SHIFT_EOL        (+2)
 427 #define SF_FL_SHIFT_EOL         (+4)
 428
 429 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 430 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 431
 432 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 433 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 434 #define SF_IS_INF               0x0040
 435 #define SF_HAS_PAR              0x0080
 436 #define SF_IN_PAR               0x0100
 437 #define SF_HAS_EVAL             0x0200
 438 #define SCF_DO_SUBSTR           0x0400
 439 #define SCF_DO_STCLASS_AND      0x0800
 440 #define SCF_DO_STCLASS_OR       0x1000
 441 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 442 #define SCF_WHILEM_VISITED_POS  0x2000
 443
 444 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 445 #define SCF_SEEN_ACCEPT         0x8000
 446 #define SCF_TRIE_DOING_RESTUDY 0x10000
 447 #define SCF_IN_DEFINE          0x20000
 448
 449
 450
 451
 452 #define UTF cBOOL(RExC_utf8)
 453
 454 /* The enums for all these are ordered so things work out correctly */
 455 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 456 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 457                                                      == REGEX_DEPENDS_CHARSET)
 458 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 459 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 460                                                      >= REGEX_UNICODE_CHARSET)
 461 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 462                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 463 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 464                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 465 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 466                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 467
 468 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 469
 470 /* For programs that want to be strictly Unicode compatible by dying if any
 471  * attempt is made to match a non-Unicode code point against a Unicode
 472  * property.  */
 473 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 474
 475 #define OOB_NAMEDCLASS          -1
 476
 477 /* There is no code point that is out-of-bounds, so this is problematic.  But
 478  * its only current use is to initialize a variable that is always set before
 479  * looked at. */
 480 #define OOB_UNICODE             0xDEADBEEF
 481
 482 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 483 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 484
 485
 486 /* length of regex to show in messages that don't mark a position within */
 487 #define RegexLengthToShowInErrorMessages 127
 488
 489 /*
 490  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 491  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 492  * op/pragma/warn/regcomp.
 493  */
 494 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 495 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 496
 497 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 498                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 499
 500 #define REPORT_LOCATION_ARGS(offset)            \
 501                 UTF8fARG(UTF, offset, RExC_precomp), \
 502                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 503
 504 /*
 505  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 506  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 507  * "...".
 508  */
 509 #define _FAIL(code) STMT_START {                                        \
 510     const char *ellipses = "";                                          \
 511     IV len = RExC_end - RExC_precomp;                                   \
 512                                                                         \
 513     if (!SIZE_ONLY)                                                     \
 514         SAVEFREESV(RExC_rx_sv);                                         \
 515     if (len > RegexLengthToShowInErrorMessages) {                       \
 516         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 517         len = RegexLengthToShowInErrorMessages - 10;                    \
 518         ellipses = "...";                                               \
 519     }                                                                   \
 520     code;                                                               \
 521 } STMT_END
 522
 523 #define FAIL(msg) _FAIL(                            \
 524     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 525             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 526
 527 #define FAIL2(msg,arg) _FAIL(                       \
 528     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 529             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 530
 531 /*
 532  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 533  */
 534 #define Simple_vFAIL(m) STMT_START {                                    \
 535     const IV offset =                                                   \
 536         (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
 537     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 538             m, REPORT_LOCATION_ARGS(offset));   \
 539 } STMT_END
 540
 541 /*
 542  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 543  */
 544 #define vFAIL(m) STMT_START {                           \
 545     if (!SIZE_ONLY)                                     \
 546         SAVEFREESV(RExC_rx_sv);                         \
 547     Simple_vFAIL(m);                                    \
 548 } STMT_END
 549
 550 /*
 551  * Like Simple_vFAIL(), but accepts two arguments.
 552  */
 553 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 554     const IV offset = RExC_parse - RExC_precomp;                        \
 555     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 556                       REPORT_LOCATION_ARGS(offset));    \
 557 } STMT_END
 558
 559 /*
 560  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 561  */
 562 #define vFAIL2(m,a1) STMT_START {                       \
 563     if (!SIZE_ONLY)                                     \
 564         SAVEFREESV(RExC_rx_sv);                         \
 565     Simple_vFAIL2(m, a1);                               \
 566 } STMT_END
 567
 568
 569 /*
 570  * Like Simple_vFAIL(), but accepts three arguments.
 571  */
 572 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 573     const IV offset = RExC_parse - RExC_precomp;                \
 574     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 575             REPORT_LOCATION_ARGS(offset));      \
 576 } STMT_END
 577
 578 /*
 579  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 580  */
 581 #define vFAIL3(m,a1,a2) STMT_START {                    \
 582     if (!SIZE_ONLY)                                     \
 583         SAVEFREESV(RExC_rx_sv);                         \
 584     Simple_vFAIL3(m, a1, a2);                           \
 585 } STMT_END
 586
 587 /*
 588  * Like Simple_vFAIL(), but accepts four arguments.
 589  */
 590 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 591     const IV offset = RExC_parse - RExC_precomp;                \
 592     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 593             REPORT_LOCATION_ARGS(offset));      \
 594 } STMT_END
 595
 596 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 597     if (!SIZE_ONLY)                                     \
 598         SAVEFREESV(RExC_rx_sv);                         \
 599     Simple_vFAIL4(m, a1, a2, a3);                       \
 600 } STMT_END
 601
 602 /* A specialized version of vFAIL2 that works with UTF8f */
 603 #define vFAIL2utf8f(m, a1) STMT_START { \
 604     const IV offset = RExC_parse - RExC_precomp;   \
 605     if (!SIZE_ONLY)                                \
 606         SAVEFREESV(RExC_rx_sv);                    \
 607     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 608             REPORT_LOCATION_ARGS(offset));         \
 609 } STMT_END
 610
 611 /* These have asserts in them because of [perl #122671] Many warnings in
 612  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 613  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 614  * would get output again.  So they should be output in pass2, and these
 615  * asserts make sure new warnings follow that paradigm. */
 616
 617 /* m is not necessarily a "literal string", in this macro */
 618 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 619     const IV offset = loc - RExC_precomp;                               \
 620     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 621             m, REPORT_LOCATION_ARGS(offset));       \
 622 } STMT_END
 623
 624 #define ckWARNreg(loc,m) STMT_START {                                   \
 625     const IV offset = loc - RExC_precomp;                               \
 626     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 627             REPORT_LOCATION_ARGS(offset));              \
 628 } STMT_END
 629
 630 #define vWARN_dep(loc, m) STMT_START {                                  \
 631     const IV offset = loc - RExC_precomp;                               \
 632     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,    \
 633             REPORT_LOCATION_ARGS(offset));              \
 634 } STMT_END
 635
 636 #define ckWARNdep(loc,m) STMT_START {                                   \
 637     const IV offset = loc - RExC_precomp;                               \
 638     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                  \
 639             m REPORT_LOCATION,                                          \
 640             REPORT_LOCATION_ARGS(offset));              \
 641 } STMT_END
 642
 643 #define ckWARNregdep(loc,m) STMT_START {                                \
 644     const IV offset = loc - RExC_precomp;                               \
 645     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
 646             m REPORT_LOCATION,                                          \
 647             REPORT_LOCATION_ARGS(offset));              \
 648 } STMT_END
 649
 650 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 651     const IV offset = loc - RExC_precomp;                               \
 652     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                      \
 653             m REPORT_LOCATION,                                          \
 654             a1, REPORT_LOCATION_ARGS(offset));  \
 655 } STMT_END
 656
 657 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 658     const IV offset = loc - RExC_precomp;                               \
 659     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 660             a1, REPORT_LOCATION_ARGS(offset));  \
 661 } STMT_END
 662
 663 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 664     const IV offset = loc - RExC_precomp;                               \
 665     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 666             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 667 } STMT_END
 668
 669 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 670     const IV offset = loc - RExC_precomp;                               \
 671     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 672             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 673 } STMT_END
 674
 675 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 676     const IV offset = loc - RExC_precomp;                               \
 677     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 678             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 679 } STMT_END
 680
 681 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 682     const IV offset = loc - RExC_precomp;                               \
 683     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 684             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 685 } STMT_END
 686
 687 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 688     const IV offset = loc - RExC_precomp;                               \
 689     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 690             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 691 } STMT_END
 692
 693
 694 /* Allow for side effects in s */
 695 #define REGC(c,s) STMT_START {                  \
 696     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 697 } STMT_END
 698
 699 /* Macros for recording node offsets.   20001227 mjd@plover.com
 700  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 701  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 702  * Element 0 holds the number n.
 703  * Position is 1 indexed.
 704  */
 705 #ifndef RE_TRACK_PATTERN_OFFSETS
 706 #define Set_Node_Offset_To_R(node,byte)
 707 #define Set_Node_Offset(node,byte)
 708 #define Set_Cur_Node_Offset
 709 #define Set_Node_Length_To_R(node,len)
 710 #define Set_Node_Length(node,len)
 711 #define Set_Node_Cur_Length(node,start)
 712 #define Node_Offset(n)
 713 #define Node_Length(n)
 714 #define Set_Node_Offset_Length(node,offset,len)
 715 #define ProgLen(ri) ri->u.proglen
 716 #define SetProgLen(ri,x) ri->u.proglen = x
 717 #else
 718 #define ProgLen(ri) ri->u.offsets[0]
 719 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 720 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 721     if (! SIZE_ONLY) {                                                  \
 722         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 723                     __LINE__, (int)(node), (int)(byte)));               \
 724         if((node) < 0) {                                                \
 725             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 726                                          (int)(node));                  \
 727         } else {                                                        \
 728             RExC_offsets[2*(node)-1] = (byte);                          \
 729         }                                                               \
 730     }                                                                   \
 731 } STMT_END
 732
 733 #define Set_Node_Offset(node,byte) \
 734     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 735 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 736
 737 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 738     if (! SIZE_ONLY) {                                                  \
 739         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 740                 __LINE__, (int)(node), (int)(len)));                    \
 741         if((node) < 0) {                                                \
 742             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 743                                          (int)(node));                  \
 744         } else {                                                        \
 745             RExC_offsets[2*(node)] = (len);                             \
 746         }                                                               \
 747     }                                                                   \
 748 } STMT_END
 749
 750 #define Set_Node_Length(node,len) \
 751     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 752 #define Set_Node_Cur_Length(node, start)                \
 753     Set_Node_Length(node, RExC_parse - start)
 754
 755 /* Get offsets and lengths */
 756 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 757 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 758
 759 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 760     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 761     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 762 } STMT_END
 763 #endif
 764
 765 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 766 #define EXPERIMENTAL_INPLACESCAN
 767 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 768
 769 #define DEBUG_RExC_seen() \
 770         DEBUG_OPTIMISE_MORE_r({                                             \
 771             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 772                                                                             \
 773             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 774                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 775                                                                             \
 776             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 777                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 778                                                                             \
 779             if (RExC_seen & REG_GPOS_SEEN)                                  \
 780                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 781                                                                             \
 782             if (RExC_seen & REG_CANY_SEEN)                                  \
 783                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 784                                                                             \
 785             if (RExC_seen & REG_RECURSE_SEEN)                               \
 786                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 787                                                                             \
 788             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 789                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 790                                                                             \
 791             if (RExC_seen & REG_VERBARG_SEEN)                               \
 792                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 793                                                                             \
 794             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 795                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 796                                                                             \
 797             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 798                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 799                                                                             \
 800             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 801                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 802                                                                             \
 803             if (RExC_seen & REG_GOSTART_SEEN)                               \
 804                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 805                                                                             \
 806             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 807                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 808                                                                             \
 809             PerlIO_printf(Perl_debug_log,"\n");                             \
 810         });
 811
 812 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 813   if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
 814
 815 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 816     if ( ( flags ) ) {                                                      \
 817         PerlIO_printf(Perl_debug_log, "%s", open_str);                      \
 818         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 819         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 820         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 821         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 822         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 823         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 824         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 825         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 826         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 827         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 828         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 829         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 830         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 831         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 832         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 833         PerlIO_printf(Perl_debug_log, "%s", close_str);                     \
 834     }
 835
 836
 837 #define DEBUG_STUDYDATA(str,data,depth)                              \
 838 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 839     PerlIO_printf(Perl_debug_log,                                    \
 840         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 841         " Flags: 0x%"UVXf,                                           \
 842         (int)(depth)*2, "",                                          \
 843         (IV)((data)->pos_min),                                       \
 844         (IV)((data)->pos_delta),                                     \
 845         (UV)((data)->flags)                                          \
 846     );                                                               \
 847     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
 848     PerlIO_printf(Perl_debug_log,                                    \
 849         " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
 850         (IV)((data)->whilem_c),                                      \
 851         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 852         is_inf ? "INF " : ""                                         \
 853     );                                                               \
 854     if ((data)->last_found)                                          \
 855         PerlIO_printf(Perl_debug_log,                                \
 856             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 857             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 858             SvPVX_const((data)->last_found),                         \
 859             (IV)((data)->last_end),                                  \
 860             (IV)((data)->last_start_min),                            \
 861             (IV)((data)->last_start_max),                            \
 862             ((data)->longest &&                                      \
 863              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 864             SvPVX_const((data)->longest_fixed),                      \
 865             (IV)((data)->offset_fixed),                              \
 866             ((data)->longest &&                                      \
 867              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 868             SvPVX_const((data)->longest_float),                      \
 869             (IV)((data)->offset_float_min),                          \
 870             (IV)((data)->offset_float_max)                           \
 871         );                                                           \
 872     PerlIO_printf(Perl_debug_log,"\n");                              \
 873 });
 874
 875 #ifdef DEBUGGING
 876
 877 /* is c a control character for which we have a mnemonic? */
 878 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 879
 880 STATIC const char *
 881 S_cntrl_to_mnemonic(const U8 c)
 882 {
 883     /* Returns the mnemonic string that represents character 'c', if one
 884      * exists; NULL otherwise.  The only ones that exist for the purposes of
 885      * this routine are a few control characters */
 886
 887     switch (c) {
 888         case '\a':       return "\\a";
 889         case '\b':       return "\\b";
 890         case ESC_NATIVE: return "\\e";
 891         case '\f':       return "\\f";
 892         case '\n':       return "\\n";
 893         case '\r':       return "\\r";
 894         case '\t':       return "\\t";
 895     }
 896
 897     return NULL;
 898 }
 899
 900 #endif
 901
 902 /* Mark that we cannot extend a found fixed substring at this point.
 903    Update the longest found anchored substring and the longest found
 904    floating substrings if needed. */
 905
 906 STATIC void
 907 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 908                     SSize_t *minlenp, int is_inf)
 909 {
 910     const STRLEN l = CHR_SVLEN(data->last_found);
 911     const STRLEN old_l = CHR_SVLEN(*data->longest);
 912     GET_RE_DEBUG_FLAGS_DECL;
 913
 914     PERL_ARGS_ASSERT_SCAN_COMMIT;
 915
 916     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 917         SvSetMagicSV(*data->longest, data->last_found);
 918         if (*data->longest == data->longest_fixed) {
 919             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 920             if (data->flags & SF_BEFORE_EOL)
 921                 data->flags
 922                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 923             else
 924                 data->flags &= ~SF_FIX_BEFORE_EOL;
 925             data->minlen_fixed=minlenp;
 926             data->lookbehind_fixed=0;
 927         }
 928         else { /* *data->longest == data->longest_float */
 929             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 930             data->offset_float_max = (l
 931                                       ? data->last_start_max
 932                                       : (data->pos_delta == SSize_t_MAX
 933                                          ? SSize_t_MAX
 934                                          : data->pos_min + data->pos_delta));
 935             if (is_inf
 936                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 937                 data->offset_float_max = SSize_t_MAX;
 938             if (data->flags & SF_BEFORE_EOL)
 939                 data->flags
 940                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 941             else
 942                 data->flags &= ~SF_FL_BEFORE_EOL;
 943             data->minlen_float=minlenp;
 944             data->lookbehind_float=0;
 945         }
 946     }
 947     SvCUR_set(data->last_found, 0);
 948     {
 949         SV * const sv = data->last_found;
 950         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 951             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 952             if (mg)
 953                 mg->mg_len = 0;
 954         }
 955     }
 956     data->last_end = -1;
 957     data->flags &= ~SF_BEFORE_EOL;
 958     DEBUG_STUDYDATA("commit: ",data,0);
 959 }
 960
 961 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 962  * list that describes which code points it matches */
 963
 964 STATIC void
 965 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 966 {
 967     /* Set the SSC 'ssc' to match an empty string or any code point */
 968
 969     PERL_ARGS_ASSERT_SSC_ANYTHING;
 970
 971     assert(is_ANYOF_SYNTHETIC(ssc));
 972
 973     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 974     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 975     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 976 }
 977
 978 STATIC int
 979 S_ssc_is_anything(const regnode_ssc *ssc)
 980 {
 981     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 982      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 983      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 984      * in any way, so there's no point in using it */
 985
 986     UV start, end;
 987     bool ret;
 988
 989     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 990
 991     assert(is_ANYOF_SYNTHETIC(ssc));
 992
 993     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
 994         return FALSE;
 995     }
 996
 997     /* See if the list consists solely of the range 0 - Infinity */
 998     invlist_iterinit(ssc->invlist);
 999     ret = invlist_iternext(ssc->invlist, &start, &end)
1000           && start == 0
1001           && end == UV_MAX;
1002
1003     invlist_iterfinish(ssc->invlist);
1004
1005     if (ret) {
1006         return TRUE;
1007     }
1008
1009     /* If e.g., both \w and \W are set, matches everything */
1010     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1011         int i;
1012         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1013             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1014                 return TRUE;
1015             }
1016         }
1017     }
1018
1019     return FALSE;
1020 }
1021
1022 STATIC void
1023 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1024 {
1025     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1026      * string, any code point, or any posix class under locale */
1027
1028     PERL_ARGS_ASSERT_SSC_INIT;
1029
1030     Zero(ssc, 1, regnode_ssc);
1031     set_ANYOF_SYNTHETIC(ssc);
1032     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1033     ssc_anything(ssc);
1034
1035     /* If any portion of the regex is to operate under locale rules,
1036      * initialization includes it.  The reason this isn't done for all regexes
1037      * is that the optimizer was written under the assumption that locale was
1038      * all-or-nothing.  Given the complexity and lack of documentation in the
1039      * optimizer, and that there are inadequate test cases for locale, many
1040      * parts of it may not work properly, it is safest to avoid locale unless
1041      * necessary. */
1042     if (RExC_contains_locale) {
1043         ANYOF_POSIXL_SETALL(ssc);
1044     }
1045     else {
1046         ANYOF_POSIXL_ZERO(ssc);
1047     }
1048 }
1049
1050 STATIC int
1051 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1052                         const regnode_ssc *ssc)
1053 {
1054     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1055      * to the list of code points matched, and locale posix classes; hence does
1056      * not check its flags) */
1057
1058     UV start, end;
1059     bool ret;
1060
1061     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1062
1063     assert(is_ANYOF_SYNTHETIC(ssc));
1064
1065     invlist_iterinit(ssc->invlist);
1066     ret = invlist_iternext(ssc->invlist, &start, &end)
1067           && start == 0
1068           && end == UV_MAX;
1069
1070     invlist_iterfinish(ssc->invlist);
1071
1072     if (! ret) {
1073         return FALSE;
1074     }
1075
1076     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1077         return FALSE;
1078     }
1079
1080     return TRUE;
1081 }
1082
1083 STATIC SV*
1084 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1085                                const regnode_charclass* const node)
1086 {
1087     /* Returns a mortal inversion list defining which code points are matched
1088      * by 'node', which is of type ANYOF.  Handles complementing the result if
1089      * appropriate.  If some code points aren't knowable at this time, the
1090      * returned list must, and will, contain every code point that is a
1091      * possibility. */
1092
1093     SV* invlist = sv_2mortal(_new_invlist(0));
1094     SV* only_utf8_locale_invlist = NULL;
1095     unsigned int i;
1096     const U32 n = ARG(node);
1097     bool new_node_has_latin1 = FALSE;
1098
1099     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1100
1101     /* Look at the data structure created by S_set_ANYOF_arg() */
1102     if (n != ANYOF_ONLY_HAS_BITMAP) {
1103         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1104         AV * const av = MUTABLE_AV(SvRV(rv));
1105         SV **const ary = AvARRAY(av);
1106         assert(RExC_rxi->data->what[n] == 's');
1107
1108         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1109             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1110         }
1111         else if (ary[0] && ary[0] != &PL_sv_undef) {
1112
1113             /* Here, no compile-time swash, and there are things that won't be
1114              * known until runtime -- we have to assume it could be anything */
1115             return _add_range_to_invlist(invlist, 0, UV_MAX);
1116         }
1117         else if (ary[3] && ary[3] != &PL_sv_undef) {
1118
1119             /* Here no compile-time swash, and no run-time only data.  Use the
1120              * node's inversion list */
1121             invlist = sv_2mortal(invlist_clone(ary[3]));
1122         }
1123
1124         /* Get the code points valid only under UTF-8 locales */
1125         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1126             && ary[2] && ary[2] != &PL_sv_undef)
1127         {
1128             only_utf8_locale_invlist = ary[2];
1129         }
1130     }
1131
1132     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1133      * code points, and an inversion list for the others, but if there are code
1134      * points that should match only conditionally on the target string being
1135      * UTF-8, those are placed in the inversion list, and not the bitmap.
1136      * Since there are circumstances under which they could match, they are
1137      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1138      * to exclude them here, so that when we invert below, the end result
1139      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1140      * have to do this here before we add the unconditionally matched code
1141      * points */
1142     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1143         _invlist_intersection_complement_2nd(invlist,
1144                                              PL_UpperLatin1,
1145                                              &invlist);
1146     }
1147
1148     /* Add in the points from the bit map */
1149     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1150         if (ANYOF_BITMAP_TEST(node, i)) {
1151             invlist = add_cp_to_invlist(invlist, i);
1152             new_node_has_latin1 = TRUE;
1153         }
1154     }
1155
1156     /* If this can match all upper Latin1 code points, have to add them
1157      * as well */
1158     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1159         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1160     }
1161
1162     /* Similarly for these */
1163     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1164         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1165     }
1166
1167     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168         _invlist_invert(invlist);
1169     }
1170     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1171
1172         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1173          * locale.  We can skip this if there are no 0-255 at all. */
1174         _invlist_union(invlist, PL_Latin1, &invlist);
1175     }
1176
1177     /* Similarly add the UTF-8 locale possible matches.  These have to be
1178      * deferred until after the non-UTF-8 locale ones are taken care of just
1179      * above, or it leads to wrong results under ANYOF_INVERT */
1180     if (only_utf8_locale_invlist) {
1181         _invlist_union_maybe_complement_2nd(invlist,
1182                                             only_utf8_locale_invlist,
1183                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1184                                             &invlist);
1185     }
1186
1187     return invlist;
1188 }
1189
1190 /* These two functions currently do the exact same thing */
1191 #define ssc_init_zero           ssc_init
1192
1193 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1194 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1195
1196 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1197  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1198  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1199
1200 STATIC void
1201 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1202                 const regnode_charclass *and_with)
1203 {
1204     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1205      * another SSC or a regular ANYOF class.  Can create false positives. */
1206
1207     SV* anded_cp_list;
1208     U8  anded_flags;
1209
1210     PERL_ARGS_ASSERT_SSC_AND;
1211
1212     assert(is_ANYOF_SYNTHETIC(ssc));
1213
1214     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1215      * the code point inversion list and just the relevant flags */
1216     if (is_ANYOF_SYNTHETIC(and_with)) {
1217         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1218         anded_flags = ANYOF_FLAGS(and_with);
1219
1220         /* XXX This is a kludge around what appears to be deficiencies in the
1221          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1222          * there are paths through the optimizer where it doesn't get weeded
1223          * out when it should.  And if we don't make some extra provision for
1224          * it like the code just below, it doesn't get added when it should.
1225          * This solution is to add it only when AND'ing, which is here, and
1226          * only when what is being AND'ed is the pristine, original node
1227          * matching anything.  Thus it is like adding it to ssc_anything() but
1228          * only when the result is to be AND'ed.  Probably the same solution
1229          * could be adopted for the same problem we have with /l matching,
1230          * which is solved differently in S_ssc_init(), and that would lead to
1231          * fewer false positives than that solution has.  But if this solution
1232          * creates bugs, the consequences are only that a warning isn't raised
1233          * that should be; while the consequences for having /l bugs is
1234          * incorrect matches */
1235         if (ssc_is_anything((regnode_ssc *)and_with)) {
1236             anded_flags |= ANYOF_WARN_SUPER;
1237         }
1238     }
1239     else {
1240         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1241         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1242     }
1243
1244     ANYOF_FLAGS(ssc) &= anded_flags;
1245
1246     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1247      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1248      * 'and_with' may be inverted.  When not inverted, we have the situation of
1249      * computing:
1250      *  (C1 | P1) & (C2 | P2)
1251      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1252      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1253      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1254      *                    <=  ((C1 & C2) | P1 | P2)
1255      * Alternatively, the last few steps could be:
1256      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1257      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1258      *                    <=  (C1 | C2 | (P1 & P2))
1259      * We favor the second approach if either P1 or P2 is non-empty.  This is
1260      * because these components are a barrier to doing optimizations, as what
1261      * they match cannot be known until the moment of matching as they are
1262      * dependent on the current locale, 'AND"ing them likely will reduce or
1263      * eliminate them.
1264      * But we can do better if we know that C1,P1 are in their initial state (a
1265      * frequent occurrence), each matching everything:
1266      *  (<everything>) & (C2 | P2) =  C2 | P2
1267      * Similarly, if C2,P2 are in their initial state (again a frequent
1268      * occurrence), the result is a no-op
1269      *  (C1 | P1) & (<everything>) =  C1 | P1
1270      *
1271      * Inverted, we have
1272      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1273      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1274      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1275      * */
1276
1277     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1278         && ! is_ANYOF_SYNTHETIC(and_with))
1279     {
1280         unsigned int i;
1281
1282         ssc_intersection(ssc,
1283                          anded_cp_list,
1284                          FALSE /* Has already been inverted */
1285                          );
1286
1287         /* If either P1 or P2 is empty, the intersection will be also; can skip
1288          * the loop */
1289         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1290             ANYOF_POSIXL_ZERO(ssc);
1291         }
1292         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1293
1294             /* Note that the Posix class component P from 'and_with' actually
1295              * looks like:
1296              *      P = Pa | Pb | ... | Pn
1297              * where each component is one posix class, such as in [\w\s].
1298              * Thus
1299              *      ~P = ~(Pa | Pb | ... | Pn)
1300              *         = ~Pa & ~Pb & ... & ~Pn
1301              *        <= ~Pa | ~Pb | ... | ~Pn
1302              * The last is something we can easily calculate, but unfortunately
1303              * is likely to have many false positives.  We could do better
1304              * in some (but certainly not all) instances if two classes in
1305              * P have known relationships.  For example
1306              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1307              * So
1308              *      :lower: & :print: = :lower:
1309              * And similarly for classes that must be disjoint.  For example,
1310              * since \s and \w can have no elements in common based on rules in
1311              * the POSIX standard,
1312              *      \w & ^\S = nothing
1313              * Unfortunately, some vendor locales do not meet the Posix
1314              * standard, in particular almost everything by Microsoft.
1315              * The loop below just changes e.g., \w into \W and vice versa */
1316
1317             regnode_charclass_posixl temp;
1318             int add = 1;    /* To calculate the index of the complement */
1319
1320             ANYOF_POSIXL_ZERO(&temp);
1321             for (i = 0; i < ANYOF_MAX; i++) {
1322                 assert(i % 2 != 0
1323                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1324                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1325
1326                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1327                     ANYOF_POSIXL_SET(&temp, i + add);
1328                 }
1329                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1330             }
1331             ANYOF_POSIXL_AND(&temp, ssc);
1332
1333         } /* else ssc already has no posixes */
1334     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1335          in its initial state */
1336     else if (! is_ANYOF_SYNTHETIC(and_with)
1337              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1338     {
1339         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1340          * copy it over 'ssc' */
1341         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1342             if (is_ANYOF_SYNTHETIC(and_with)) {
1343                 StructCopy(and_with, ssc, regnode_ssc);
1344             }
1345             else {
1346                 ssc->invlist = anded_cp_list;
1347                 ANYOF_POSIXL_ZERO(ssc);
1348                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1349                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1350                 }
1351             }
1352         }
1353         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1354                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1355         {
1356             /* One or the other of P1, P2 is non-empty. */
1357             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1358                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1359             }
1360             ssc_union(ssc, anded_cp_list, FALSE);
1361         }
1362         else { /* P1 = P2 = empty */
1363             ssc_intersection(ssc, anded_cp_list, FALSE);
1364         }
1365     }
1366 }
1367
1368 STATIC void
1369 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1370                const regnode_charclass *or_with)
1371 {
1372     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1373      * another SSC or a regular ANYOF class.  Can create false positives if
1374      * 'or_with' is to be inverted. */
1375
1376     SV* ored_cp_list;
1377     U8 ored_flags;
1378
1379     PERL_ARGS_ASSERT_SSC_OR;
1380
1381     assert(is_ANYOF_SYNTHETIC(ssc));
1382
1383     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1384      * the code point inversion list and just the relevant flags */
1385     if (is_ANYOF_SYNTHETIC(or_with)) {
1386         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1387         ored_flags = ANYOF_FLAGS(or_with);
1388     }
1389     else {
1390         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1391         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1392     }
1393
1394     ANYOF_FLAGS(ssc) |= ored_flags;
1395
1396     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1397      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1398      * 'or_with' may be inverted.  When not inverted, we have the simple
1399      * situation of computing:
1400      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1401      * If P1|P2 yields a situation with both a class and its complement are
1402      * set, like having both \w and \W, this matches all code points, and we
1403      * can delete these from the P component of the ssc going forward.  XXX We
1404      * might be able to delete all the P components, but I (khw) am not certain
1405      * about this, and it is better to be safe.
1406      *
1407      * Inverted, we have
1408      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1409      *                         <=  (C1 | P1) | ~C2
1410      *                         <=  (C1 | ~C2) | P1
1411      * (which results in actually simpler code than the non-inverted case)
1412      * */
1413
1414     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1415         && ! is_ANYOF_SYNTHETIC(or_with))
1416     {
1417         /* We ignore P2, leaving P1 going forward */
1418     }   /* else  Not inverted */
1419     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1420         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1421         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1422             unsigned int i;
1423             for (i = 0; i < ANYOF_MAX; i += 2) {
1424                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1425                 {
1426                     ssc_match_all_cp(ssc);
1427                     ANYOF_POSIXL_CLEAR(ssc, i);
1428                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1429                 }
1430             }
1431         }
1432     }
1433
1434     ssc_union(ssc,
1435               ored_cp_list,
1436               FALSE /* Already has been inverted */
1437               );
1438 }
1439
1440 PERL_STATIC_INLINE void
1441 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1442 {
1443     PERL_ARGS_ASSERT_SSC_UNION;
1444
1445     assert(is_ANYOF_SYNTHETIC(ssc));
1446
1447     _invlist_union_maybe_complement_2nd(ssc->invlist,
1448                                         invlist,
1449                                         invert2nd,
1450                                         &ssc->invlist);
1451 }
1452
1453 PERL_STATIC_INLINE void
1454 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1455                          SV* const invlist,
1456                          const bool invert2nd)
1457 {
1458     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1459
1460     assert(is_ANYOF_SYNTHETIC(ssc));
1461
1462     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1463                                                invlist,
1464                                                invert2nd,
1465                                                &ssc->invlist);
1466 }
1467
1468 PERL_STATIC_INLINE void
1469 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1470 {
1471     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1472
1473     assert(is_ANYOF_SYNTHETIC(ssc));
1474
1475     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1476 }
1477
1478 PERL_STATIC_INLINE void
1479 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1480 {
1481     /* AND just the single code point 'cp' into the SSC 'ssc' */
1482
1483     SV* cp_list = _new_invlist(2);
1484
1485     PERL_ARGS_ASSERT_SSC_CP_AND;
1486
1487     assert(is_ANYOF_SYNTHETIC(ssc));
1488
1489     cp_list = add_cp_to_invlist(cp_list, cp);
1490     ssc_intersection(ssc, cp_list,
1491                      FALSE /* Not inverted */
1492                      );
1493     SvREFCNT_dec_NN(cp_list);
1494 }
1495
1496 PERL_STATIC_INLINE void
1497 S_ssc_clear_locale(regnode_ssc *ssc)
1498 {
1499     /* Set the SSC 'ssc' to not match any locale things */
1500     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1501
1502     assert(is_ANYOF_SYNTHETIC(ssc));
1503
1504     ANYOF_POSIXL_ZERO(ssc);
1505     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1506 }
1507
1508 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1509
1510 STATIC bool
1511 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1512 {
1513     /* The synthetic start class is used to hopefully quickly winnow down
1514      * places where a pattern could start a match in the target string.  If it
1515      * doesn't really narrow things down that much, there isn't much point to
1516      * having the overhead of using it.  This function uses some very crude
1517      * heuristics to decide if to use the ssc or not.
1518      *
1519      * It returns TRUE if 'ssc' rules out more than half what it considers to
1520      * be the "likely" possible matches, but of course it doesn't know what the
1521      * actual things being matched are going to be; these are only guesses
1522      *
1523      * For /l matches, it assumes that the only likely matches are going to be
1524      *      in the 0-255 range, uniformly distributed, so half of that is 127
1525      * For /a and /d matches, it assumes that the likely matches will be just
1526      *      the ASCII range, so half of that is 63
1527      * For /u and there isn't anything matching above the Latin1 range, it
1528      *      assumes that that is the only range likely to be matched, and uses
1529      *      half that as the cut-off: 127.  If anything matches above Latin1,
1530      *      it assumes that all of Unicode could match (uniformly), except for
1531      *      non-Unicode code points and things in the General Category "Other"
1532      *      (unassigned, private use, surrogates, controls and formats).  This
1533      *      is a much large number. */
1534
1535     const U32 max_match = (LOC)
1536                           ? 127
1537                           : (! UNI_SEMANTICS)
1538                             ? 63
1539                             : (invlist_highest(ssc->invlist) < 256)
1540                               ? 127
1541                               : ((NON_OTHER_COUNT + 1) / 2) - 1;
1542     U32 count = 0;      /* Running total of number of code points matched by
1543                            'ssc' */
1544     UV start, end;      /* Start and end points of current range in inversion
1545                            list */
1546
1547     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1548
1549     invlist_iterinit(ssc->invlist);
1550     while (invlist_iternext(ssc->invlist, &start, &end)) {
1551
1552         /* /u is the only thing that we expect to match above 255; so if not /u
1553          * and even if there are matches above 255, ignore them.  This catches
1554          * things like \d under /d which does match the digits above 255, but
1555          * since the pattern is /d, it is not likely to be expecting them */
1556         if (! UNI_SEMANTICS) {
1557             if (start > 255) {
1558                 break;
1559             }
1560             end = MIN(end, 255);
1561         }
1562         count += end - start + 1;
1563         if (count > max_match) {
1564             invlist_iterfinish(ssc->invlist);
1565             return FALSE;
1566         }
1567     }
1568
1569     return TRUE;
1570 }
1571
1572
1573 STATIC void
1574 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1575 {
1576     /* The inversion list in the SSC is marked mortal; now we need a more
1577      * permanent copy, which is stored the same way that is done in a regular
1578      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1579      * map */
1580
1581     SV* invlist = invlist_clone(ssc->invlist);
1582
1583     PERL_ARGS_ASSERT_SSC_FINALIZE;
1584
1585     assert(is_ANYOF_SYNTHETIC(ssc));
1586
1587     /* The code in this file assumes that all but these flags aren't relevant
1588      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1589      * by the time we reach here */
1590     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1591
1592     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1593
1594     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1595                                 NULL, NULL, NULL, FALSE);
1596
1597     /* Make sure is clone-safe */
1598     ssc->invlist = NULL;
1599
1600     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1601         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1602     }
1603
1604     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1605 }
1606
1607 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1608 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1609 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1610 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1611                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1612                                : 0 )
1613
1614
1615 #ifdef DEBUGGING
1616 /*
1617    dump_trie(trie,widecharmap,revcharmap)
1618    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1619    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1620
1621    These routines dump out a trie in a somewhat readable format.
1622    The _interim_ variants are used for debugging the interim
1623    tables that are used to generate the final compressed
1624    representation which is what dump_trie expects.
1625
1626    Part of the reason for their existence is to provide a form
1627    of documentation as to how the different representations function.
1628
1629 */
1630
1631 /*
1632   Dumps the final compressed table form of the trie to Perl_debug_log.
1633   Used for debugging make_trie().
1634 */
1635
1636 STATIC void
1637 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1638             AV *revcharmap, U32 depth)
1639 {
1640     U32 state;
1641     SV *sv=sv_newmortal();
1642     int colwidth= widecharmap ? 6 : 4;
1643     U16 word;
1644     GET_RE_DEBUG_FLAGS_DECL;
1645
1646     PERL_ARGS_ASSERT_DUMP_TRIE;
1647
1648     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1649         (int)depth * 2 + 2,"",
1650         "Match","Base","Ofs" );
1651
1652     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1653         SV ** const tmp = av_fetch( revcharmap, state, 0);
1654         if ( tmp ) {
1655             PerlIO_printf( Perl_debug_log, "%*s",
1656                 colwidth,
1657                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1658                             PL_colors[0], PL_colors[1],
1659                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1660                             PERL_PV_ESCAPE_FIRSTCHAR
1661                 )
1662             );
1663         }
1664     }
1665     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1666         (int)depth * 2 + 2,"");
1667
1668     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1669         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1670     PerlIO_printf( Perl_debug_log, "\n");
1671
1672     for( state = 1 ; state < trie->statecount ; state++ ) {
1673         const U32 base = trie->states[ state ].trans.base;
1674
1675         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1676                                        (int)depth * 2 + 2,"", (UV)state);
1677
1678         if ( trie->states[ state ].wordnum ) {
1679             PerlIO_printf( Perl_debug_log, " W%4X",
1680                                            trie->states[ state ].wordnum );
1681         } else {
1682             PerlIO_printf( Perl_debug_log, "%6s", "" );
1683         }
1684
1685         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1686
1687         if ( base ) {
1688             U32 ofs = 0;
1689
1690             while( ( base + ofs  < trie->uniquecharcount ) ||
1691                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1692                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1693                                                                     != state))
1694                     ofs++;
1695
1696             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1697
1698             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1699                 if ( ( base + ofs >= trie->uniquecharcount )
1700                         && ( base + ofs - trie->uniquecharcount
1701                                                         < trie->lasttrans )
1702                         && trie->trans[ base + ofs
1703                                     - trie->uniquecharcount ].check == state )
1704                 {
1705                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1706                     colwidth,
1707                     (UV)trie->trans[ base + ofs
1708                                              - trie->uniquecharcount ].next );
1709                 } else {
1710                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1711                 }
1712             }
1713
1714             PerlIO_printf( Perl_debug_log, "]");
1715
1716         }
1717         PerlIO_printf( Perl_debug_log, "\n" );
1718     }
1719     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1720                                 (int)depth*2, "");
1721     for (word=1; word <= trie->wordcount; word++) {
1722         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1723             (int)word, (int)(trie->wordinfo[word].prev),
1724             (int)(trie->wordinfo[word].len));
1725     }
1726     PerlIO_printf(Perl_debug_log, "\n" );
1727 }
1728 /*
1729   Dumps a fully constructed but uncompressed trie in list form.
1730   List tries normally only are used for construction when the number of
1731   possible chars (trie->uniquecharcount) is very high.
1732   Used for debugging make_trie().
1733 */
1734 STATIC void
1735 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1736                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1737                          U32 depth)
1738 {
1739     U32 state;
1740     SV *sv=sv_newmortal();
1741     int colwidth= widecharmap ? 6 : 4;
1742     GET_RE_DEBUG_FLAGS_DECL;
1743
1744     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1745
1746     /* print out the table precompression.  */
1747     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1748         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1749         "------:-----+-----------------\n" );
1750
1751     for( state=1 ; state < next_alloc ; state ++ ) {
1752         U16 charid;
1753
1754         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1755             (int)depth * 2 + 2,"", (UV)state  );
1756         if ( ! trie->states[ state ].wordnum ) {
1757             PerlIO_printf( Perl_debug_log, "%5s| ","");
1758         } else {
1759             PerlIO_printf( Perl_debug_log, "W%4x| ",
1760                 trie->states[ state ].wordnum
1761             );
1762         }
1763         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1764             SV ** const tmp = av_fetch( revcharmap,
1765                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1766             if ( tmp ) {
1767                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1768                     colwidth,
1769                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1770                               colwidth,
1771                               PL_colors[0], PL_colors[1],
1772                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1773                               | PERL_PV_ESCAPE_FIRSTCHAR
1774                     ) ,
1775                     TRIE_LIST_ITEM(state,charid).forid,
1776                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1777                 );
1778                 if (!(charid % 10))
1779                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1780                         (int)((depth * 2) + 14), "");
1781             }
1782         }
1783         PerlIO_printf( Perl_debug_log, "\n");
1784     }
1785 }
1786
1787 /*
1788   Dumps a fully constructed but uncompressed trie in table form.
1789   This is the normal DFA style state transition table, with a few
1790   twists to facilitate compression later.
1791   Used for debugging make_trie().
1792 */
1793 STATIC void
1794 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1795                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1796                           U32 depth)
1797 {
1798     U32 state;
1799     U16 charid;
1800     SV *sv=sv_newmortal();
1801     int colwidth= widecharmap ? 6 : 4;
1802     GET_RE_DEBUG_FLAGS_DECL;
1803
1804     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1805
1806     /*
1807        print out the table precompression so that we can do a visual check
1808        that they are identical.
1809      */
1810
1811     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1812
1813     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1814         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1815         if ( tmp ) {
1816             PerlIO_printf( Perl_debug_log, "%*s",
1817                 colwidth,
1818                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1819                             PL_colors[0], PL_colors[1],
1820                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1821                             PERL_PV_ESCAPE_FIRSTCHAR
1822                 )
1823             );
1824         }
1825     }
1826
1827     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1828
1829     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1830         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1831     }
1832
1833     PerlIO_printf( Perl_debug_log, "\n" );
1834
1835     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1836
1837         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1838             (int)depth * 2 + 2,"",
1839             (UV)TRIE_NODENUM( state ) );
1840
1841         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1842             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1843             if (v)
1844                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1845             else
1846                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1847         }
1848         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1849             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1850                                             (UV)trie->trans[ state ].check );
1851         } else {
1852             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1853                                             (UV)trie->trans[ state ].check,
1854             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1855         }
1856     }
1857 }
1858
1859 #endif
1860
1861
1862 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1863   startbranch: the first branch in the whole branch sequence
1864   first      : start branch of sequence of branch-exact nodes.
1865                May be the same as startbranch
1866   last       : Thing following the last branch.
1867                May be the same as tail.
1868   tail       : item following the branch sequence
1869   count      : words in the sequence
1870   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1871   depth      : indent depth
1872
1873 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1874
1875 A trie is an N'ary tree where the branches are determined by digital
1876 decomposition of the key. IE, at the root node you look up the 1st character and
1877 follow that branch repeat until you find the end of the branches. Nodes can be
1878 marked as "accepting" meaning they represent a complete word. Eg:
1879
1880   /he|she|his|hers/
1881
1882 would convert into the following structure. Numbers represent states, letters
1883 following numbers represent valid transitions on the letter from that state, if
1884 the number is in square brackets it represents an accepting state, otherwise it
1885 will be in parenthesis.
1886
1887       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1888       |    |
1889       |   (2)
1890       |    |
1891      (1)   +-i->(6)-+-s->[7]
1892       |
1893       +-s->(3)-+-h->(4)-+-e->[5]
1894
1895       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1896
1897 This shows that when matching against the string 'hers' we will begin at state 1
1898 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1899 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1900 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1901 single traverse. We store a mapping from accepting to state to which word was
1902 matched, and then when we have multiple possibilities we try to complete the
1903 rest of the regex in the order in which they occured in the alternation.
1904
1905 The only prior NFA like behaviour that would be changed by the TRIE support is
1906 the silent ignoring of duplicate alternations which are of the form:
1907
1908  / (DUPE|DUPE) X? (?{ ... }) Y /x
1909
1910 Thus EVAL blocks following a trie may be called a different number of times with
1911 and without the optimisation. With the optimisations dupes will be silently
1912 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1913 the following demonstrates:
1914
1915  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1916
1917 which prints out 'word' three times, but
1918
1919  'words'=~/(word|word|word)(?{ print $1 })S/
1920
1921 which doesnt print it out at all. This is due to other optimisations kicking in.
1922
1923 Example of what happens on a structural level:
1924
1925 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1926
1927    1: CURLYM[1] {1,32767}(18)
1928    5:   BRANCH(8)
1929    6:     EXACT <ac>(16)
1930    8:   BRANCH(11)
1931    9:     EXACT <ad>(16)
1932   11:   BRANCH(14)
1933   12:     EXACT <ab>(16)
1934   16:   SUCCEED(0)
1935   17:   NOTHING(18)
1936   18: END(0)
1937
1938 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1939 and should turn into:
1940
1941    1: CURLYM[1] {1,32767}(18)
1942    5:   TRIE(16)
1943         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1944           <ac>
1945           <ad>
1946           <ab>
1947   16:   SUCCEED(0)
1948   17:   NOTHING(18)
1949   18: END(0)
1950
1951 Cases where tail != last would be like /(?foo|bar)baz/:
1952
1953    1: BRANCH(4)
1954    2:   EXACT <foo>(8)
1955    4: BRANCH(7)
1956    5:   EXACT <bar>(8)
1957    7: TAIL(8)
1958    8: EXACT <baz>(10)
1959   10: END(0)
1960
1961 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1962 and would end up looking like:
1963
1964     1: TRIE(8)
1965       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1966         <foo>
1967         <bar>
1968    7: TAIL(8)
1969    8: EXACT <baz>(10)
1970   10: END(0)
1971
1972     d = uvchr_to_utf8_flags(d, uv, 0);
1973
1974 is the recommended Unicode-aware way of saying
1975
1976     *(d++) = uv;
1977 */
1978
1979 #define TRIE_STORE_REVCHAR(val)                                            \
1980     STMT_START {                                                           \
1981         if (UTF) {                                                         \
1982             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1983             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1984             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1985             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1986             SvPOK_on(zlopp);                                               \
1987             SvUTF8_on(zlopp);                                              \
1988             av_push(revcharmap, zlopp);                                    \
1989         } else {                                                           \
1990             char ooooff = (char)val;                                           \
1991             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1992         }                                                                  \
1993         } STMT_END
1994
1995 /* This gets the next character from the input, folding it if not already
1996  * folded. */
1997 #define TRIE_READ_CHAR STMT_START {                                           \
1998     wordlen++;                                                                \
1999     if ( UTF ) {                                                              \
2000         /* if it is UTF then it is either already folded, or does not need    \
2001          * folding */                                                         \
2002         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2003     }                                                                         \
2004     else if (folder == PL_fold_latin1) {                                      \
2005         /* This folder implies Unicode rules, which in the range expressible  \
2006          *  by not UTF is the lower case, with the two exceptions, one of     \
2007          *  which should have been taken care of before calling this */       \
2008         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2009         uvc = toLOWER_L1(*uc);                                                \
2010         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2011         len = 1;                                                              \
2012     } else {                                                                  \
2013         /* raw data, will be folded later if needed */                        \
2014         uvc = (U32)*uc;                                                       \
2015         len = 1;                                                              \
2016     }                                                                         \
2017 } STMT_END
2018
2019
2020
2021 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2022     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2023         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2024         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2025     }                                                           \
2026     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2027     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2028     TRIE_LIST_CUR( state )++;                                   \
2029 } STMT_END
2030
2031 #define TRIE_LIST_NEW(state) STMT_START {                       \
2032     Newxz( trie->states[ state ].trans.list,               \
2033         4, reg_trie_trans_le );                                 \
2034      TRIE_LIST_CUR( state ) = 1;                                \
2035      TRIE_LIST_LEN( state ) = 4;                                \
2036 } STMT_END
2037
2038 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2039     U16 dupe= trie->states[ state ].wordnum;                    \
2040     regnode * const noper_next = regnext( noper );              \
2041                                                                 \
2042     DEBUG_r({                                                   \
2043         /* store the word for dumping */                        \
2044         SV* tmp;                                                \
2045         if (OP(noper) != NOTHING)                               \
2046             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2047         else                                                    \
2048             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2049         av_push( trie_words, tmp );                             \
2050     });                                                         \
2051                                                                 \
2052     curword++;                                                  \
2053     trie->wordinfo[curword].prev   = 0;                         \
2054     trie->wordinfo[curword].len    = wordlen;                   \
2055     trie->wordinfo[curword].accept = state;                     \
2056                                                                 \
2057     if ( noper_next < tail ) {                                  \
2058         if (!trie->jump)                                        \
2059             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2060                                                  sizeof(U16) ); \
2061         trie->jump[curword] = (U16)(noper_next - convert);      \
2062         if (!jumper)                                            \
2063             jumper = noper_next;                                \
2064         if (!nextbranch)                                        \
2065             nextbranch= regnext(cur);                           \
2066     }                                                           \
2067                                                                 \
2068     if ( dupe ) {                                               \
2069         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2070         /* chain, so that when the bits of chain are later    */\
2071         /* linked together, the dups appear in the chain      */\
2072         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2073         trie->wordinfo[dupe].prev = curword;                    \
2074     } else {                                                    \
2075         /* we haven't inserted this word yet.                */ \
2076         trie->states[ state ].wordnum = curword;                \
2077     }                                                           \
2078 } STMT_END
2079
2080
2081 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2082      ( ( base + charid >=  ucharcount                                   \
2083          && base + charid < ubound                                      \
2084          && state == trie->trans[ base - ucharcount + charid ].check    \
2085          && trie->trans[ base - ucharcount + charid ].next )            \
2086            ? trie->trans[ base - ucharcount + charid ].next             \
2087            : ( state==1 ? special : 0 )                                 \
2088       )
2089
2090 #define MADE_TRIE       1
2091 #define MADE_JUMP_TRIE  2
2092 #define MADE_EXACT_TRIE 4
2093
2094 STATIC I32
2095 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2096                   regnode *first, regnode *last, regnode *tail,
2097                   U32 word_count, U32 flags, U32 depth)
2098 {
2099     /* first pass, loop through and scan words */
2100     reg_trie_data *trie;
2101     HV *widecharmap = NULL;
2102     AV *revcharmap = newAV();
2103     regnode *cur;
2104     STRLEN len = 0;
2105     UV uvc = 0;
2106     U16 curword = 0;
2107     U32 next_alloc = 0;
2108     regnode *jumper = NULL;
2109     regnode *nextbranch = NULL;
2110     regnode *convert = NULL;
2111     U32 *prev_states; /* temp array mapping each state to previous one */
2112     /* we just use folder as a flag in utf8 */
2113     const U8 * folder = NULL;
2114
2115 #ifdef DEBUGGING
2116     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2117     AV *trie_words = NULL;
2118     /* along with revcharmap, this only used during construction but both are
2119      * useful during debugging so we store them in the struct when debugging.
2120      */
2121 #else
2122     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2123     STRLEN trie_charcount=0;
2124 #endif
2125     SV *re_trie_maxbuff;
2126     GET_RE_DEBUG_FLAGS_DECL;
2127
2128     PERL_ARGS_ASSERT_MAKE_TRIE;
2129 #ifndef DEBUGGING
2130     PERL_UNUSED_ARG(depth);
2131 #endif
2132
2133     switch (flags) {
2134         case EXACT: break;
2135         case EXACTFA:
2136         case EXACTFU_SS:
2137         case EXACTFU: folder = PL_fold_latin1; break;
2138         case EXACTF:  folder = PL_fold; break;
2139         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2140     }
2141
2142     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2143     trie->refcount = 1;
2144     trie->startstate = 1;
2145     trie->wordcount = word_count;
2146     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2147     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2148     if (flags == EXACT)
2149         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2150     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2151                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2152
2153     DEBUG_r({
2154         trie_words = newAV();
2155     });
2156
2157     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2158     assert(re_trie_maxbuff);
2159     if (!SvIOK(re_trie_maxbuff)) {
2160         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2161     }
2162     DEBUG_TRIE_COMPILE_r({
2163         PerlIO_printf( Perl_debug_log,
2164           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2165           (int)depth * 2 + 2, "",
2166           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2167           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2168     });
2169
2170    /* Find the node we are going to overwrite */
2171     if ( first == startbranch && OP( last ) != BRANCH ) {
2172         /* whole branch chain */
2173         convert = first;
2174     } else {
2175         /* branch sub-chain */
2176         convert = NEXTOPER( first );
2177     }
2178
2179     /*  -- First loop and Setup --
2180
2181        We first traverse the branches and scan each word to determine if it
2182        contains widechars, and how many unique chars there are, this is
2183        important as we have to build a table with at least as many columns as we
2184        have unique chars.
2185
2186        We use an array of integers to represent the character codes 0..255
2187        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2188        the native representation of the character value as the key and IV's for
2189        the coded index.
2190
2191        *TODO* If we keep track of how many times each character is used we can
2192        remap the columns so that the table compression later on is more
2193        efficient in terms of memory by ensuring the most common value is in the
2194        middle and the least common are on the outside.  IMO this would be better
2195        than a most to least common mapping as theres a decent chance the most
2196        common letter will share a node with the least common, meaning the node
2197        will not be compressible. With a middle is most common approach the worst
2198        case is when we have the least common nodes twice.
2199
2200      */
2201
2202     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2203         regnode *noper = NEXTOPER( cur );
2204         const U8 *uc = (U8*)STRING( noper );
2205         const U8 *e  = uc + STR_LEN( noper );
2206         int foldlen = 0;
2207         U32 wordlen      = 0;         /* required init */
2208         STRLEN minchars = 0;
2209         STRLEN maxchars = 0;
2210         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2211                                                bitmap?*/
2212
2213         if (OP(noper) == NOTHING) {
2214             regnode *noper_next= regnext(noper);
2215             if (noper_next != tail && OP(noper_next) == flags) {
2216                 noper = noper_next;
2217                 uc= (U8*)STRING(noper);
2218                 e= uc + STR_LEN(noper);
2219                 trie->minlen= STR_LEN(noper);
2220             } else {
2221                 trie->minlen= 0;
2222                 continue;
2223             }
2224         }
2225
2226         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2227             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2228                                           regardless of encoding */
2229             if (OP( noper ) == EXACTFU_SS) {
2230                 /* false positives are ok, so just set this */
2231                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2232             }
2233         }
2234         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2235                                            branch */
2236             TRIE_CHARCOUNT(trie)++;
2237             TRIE_READ_CHAR;
2238
2239             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2240              * is in effect.  Under /i, this character can match itself, or
2241              * anything that folds to it.  If not under /i, it can match just
2242              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2243              * all fold to k, and all are single characters.   But some folds
2244              * expand to more than one character, so for example LATIN SMALL
2245              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2246              * the string beginning at 'uc' is 'ffi', it could be matched by
2247              * three characters, or just by the one ligature character. (It
2248              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2249              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2250              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2251              * match.)  The trie needs to know the minimum and maximum number
2252              * of characters that could match so that it can use size alone to
2253              * quickly reject many match attempts.  The max is simple: it is
2254              * the number of folded characters in this branch (since a fold is
2255              * never shorter than what folds to it. */
2256
2257             maxchars++;
2258
2259             /* And the min is equal to the max if not under /i (indicated by
2260              * 'folder' being NULL), or there are no multi-character folds.  If
2261              * there is a multi-character fold, the min is incremented just
2262              * once, for the character that folds to the sequence.  Each
2263              * character in the sequence needs to be added to the list below of
2264              * characters in the trie, but we count only the first towards the
2265              * min number of characters needed.  This is done through the
2266              * variable 'foldlen', which is returned by the macros that look
2267              * for these sequences as the number of bytes the sequence
2268              * occupies.  Each time through the loop, we decrement 'foldlen' by
2269              * how many bytes the current char occupies.  Only when it reaches
2270              * 0 do we increment 'minchars' or look for another multi-character
2271              * sequence. */
2272             if (folder == NULL) {
2273                 minchars++;
2274             }
2275             else if (foldlen > 0) {
2276                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2277             }
2278             else {
2279                 minchars++;
2280
2281                 /* See if *uc is the beginning of a multi-character fold.  If
2282                  * so, we decrement the length remaining to look at, to account
2283                  * for the current character this iteration.  (We can use 'uc'
2284                  * instead of the fold returned by TRIE_READ_CHAR because for
2285                  * non-UTF, the latin1_safe macro is smart enough to account
2286                  * for all the unfolded characters, and because for UTF, the
2287                  * string will already have been folded earlier in the
2288                  * compilation process */
2289                 if (UTF) {
2290                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2291                         foldlen -= UTF8SKIP(uc);
2292                     }
2293                 }
2294                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2295                     foldlen--;
2296                 }
2297             }
2298
2299             /* The current character (and any potential folds) should be added
2300              * to the possible matching characters for this position in this
2301              * branch */
2302             if ( uvc < 256 ) {
2303                 if ( folder ) {
2304                     U8 folded= folder[ (U8) uvc ];
2305                     if ( !trie->charmap[ folded ] ) {
2306                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2307                         TRIE_STORE_REVCHAR( folded );
2308                     }
2309                 }
2310                 if ( !trie->charmap[ uvc ] ) {
2311                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2312                     TRIE_STORE_REVCHAR( uvc );
2313                 }
2314                 if ( set_bit ) {
2315                     /* store the codepoint in the bitmap, and its folded
2316                      * equivalent. */
2317                     TRIE_BITMAP_SET(trie, uvc);
2318
2319                     /* store the folded codepoint */
2320                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2321
2322                     if ( !UTF ) {
2323                         /* store first byte of utf8 representation of
2324                            variant codepoints */
2325                         if (! UVCHR_IS_INVARIANT(uvc)) {
2326                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2327                         }
2328                     }
2329                     set_bit = 0; /* We've done our bit :-) */
2330                 }
2331             } else {
2332
2333                 /* XXX We could come up with the list of code points that fold
2334                  * to this using PL_utf8_foldclosures, except not for
2335                  * multi-char folds, as there may be multiple combinations
2336                  * there that could work, which needs to wait until runtime to
2337                  * resolve (The comment about LIGATURE FFI above is such an
2338                  * example */
2339
2340                 SV** svpp;
2341                 if ( !widecharmap )
2342                     widecharmap = newHV();
2343
2344                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2345
2346                 if ( !svpp )
2347                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2348
2349                 if ( !SvTRUE( *svpp ) ) {
2350                     sv_setiv( *svpp, ++trie->uniquecharcount );
2351                     TRIE_STORE_REVCHAR(uvc);
2352                 }
2353             }
2354         } /* end loop through characters in this branch of the trie */
2355
2356         /* We take the min and max for this branch and combine to find the min
2357          * and max for all branches processed so far */
2358         if( cur == first ) {
2359             trie->minlen = minchars;
2360             trie->maxlen = maxchars;
2361         } else if (minchars < trie->minlen) {
2362             trie->minlen = minchars;
2363         } else if (maxchars > trie->maxlen) {
2364             trie->maxlen = maxchars;
2365         }
2366     } /* end first pass */
2367     DEBUG_TRIE_COMPILE_r(
2368         PerlIO_printf( Perl_debug_log,
2369                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2370                 (int)depth * 2 + 2,"",
2371                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2372                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2373                 (int)trie->minlen, (int)trie->maxlen )
2374     );
2375
2376     /*
2377         We now know what we are dealing with in terms of unique chars and
2378         string sizes so we can calculate how much memory a naive
2379         representation using a flat table  will take. If it's over a reasonable
2380         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2381         conservative but potentially much slower representation using an array
2382         of lists.
2383
2384         At the end we convert both representations into the same compressed
2385         form that will be used in regexec.c for matching with. The latter
2386         is a form that cannot be used to construct with but has memory
2387         properties similar to the list form and access properties similar
2388         to the table form making it both suitable for fast searches and
2389         small enough that its feasable to store for the duration of a program.
2390
2391         See the comment in the code where the compressed table is produced
2392         inplace from the flat tabe representation for an explanation of how
2393         the compression works.
2394
2395     */
2396
2397
2398     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2399     prev_states[1] = 0;
2400
2401     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2402                                                     > SvIV(re_trie_maxbuff) )
2403     {
2404         /*
2405             Second Pass -- Array Of Lists Representation
2406
2407             Each state will be represented by a list of charid:state records
2408             (reg_trie_trans_le) the first such element holds the CUR and LEN
2409             points of the allocated array. (See defines above).
2410
2411             We build the initial structure using the lists, and then convert
2412             it into the compressed table form which allows faster lookups
2413             (but cant be modified once converted).
2414         */
2415
2416         STRLEN transcount = 1;
2417
2418         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2419             "%*sCompiling trie using list compiler\n",
2420             (int)depth * 2 + 2, ""));
2421
2422         trie->states = (reg_trie_state *)
2423             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2424                                   sizeof(reg_trie_state) );
2425         TRIE_LIST_NEW(1);
2426         next_alloc = 2;
2427
2428         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2429
2430             regnode *noper   = NEXTOPER( cur );
2431             U8 *uc           = (U8*)STRING( noper );
2432             const U8 *e      = uc + STR_LEN( noper );
2433             U32 state        = 1;         /* required init */
2434             U16 charid       = 0;         /* sanity init */
2435             U32 wordlen      = 0;         /* required init */
2436
2437             if (OP(noper) == NOTHING) {
2438                 regnode *noper_next= regnext(noper);
2439                 if (noper_next != tail && OP(noper_next) == flags) {
2440                     noper = noper_next;
2441                     uc= (U8*)STRING(noper);
2442                     e= uc + STR_LEN(noper);
2443                 }
2444             }
2445
2446             if (OP(noper) != NOTHING) {
2447                 for ( ; uc < e ; uc += len ) {
2448
2449                     TRIE_READ_CHAR;
2450
2451                     if ( uvc < 256 ) {
2452                         charid = trie->charmap[ uvc ];
2453                     } else {
2454                         SV** const svpp = hv_fetch( widecharmap,
2455                                                     (char*)&uvc,
2456                                                     sizeof( UV ),
2457                                                     0);
2458                         if ( !svpp ) {
2459                             charid = 0;
2460                         } else {
2461                             charid=(U16)SvIV( *svpp );
2462                         }
2463                     }
2464                     /* charid is now 0 if we dont know the char read, or
2465                      * nonzero if we do */
2466                     if ( charid ) {
2467
2468                         U16 check;
2469                         U32 newstate = 0;
2470
2471                         charid--;
2472                         if ( !trie->states[ state ].trans.list ) {
2473                             TRIE_LIST_NEW( state );
2474                         }
2475                         for ( check = 1;
2476                               check <= TRIE_LIST_USED( state );
2477                               check++ )
2478                         {
2479                             if ( TRIE_LIST_ITEM( state, check ).forid
2480                                                                     == charid )
2481                             {
2482                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2483                                 break;
2484                             }
2485                         }
2486                         if ( ! newstate ) {
2487                             newstate = next_alloc++;
2488                             prev_states[newstate] = state;
2489                             TRIE_LIST_PUSH( state, charid, newstate );
2490                             transcount++;
2491                         }
2492                         state = newstate;
2493                     } else {
2494                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2495                     }
2496                 }
2497             }
2498             TRIE_HANDLE_WORD(state);
2499
2500         } /* end second pass */
2501
2502         /* next alloc is the NEXT state to be allocated */
2503         trie->statecount = next_alloc;
2504         trie->states = (reg_trie_state *)
2505             PerlMemShared_realloc( trie->states,
2506                                    next_alloc
2507                                    * sizeof(reg_trie_state) );
2508
2509         /* and now dump it out before we compress it */
2510         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2511                                                          revcharmap, next_alloc,
2512                                                          depth+1)
2513         );
2514
2515         trie->trans = (reg_trie_trans *)
2516             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2517         {
2518             U32 state;
2519             U32 tp = 0;
2520             U32 zp = 0;
2521
2522
2523             for( state=1 ; state < next_alloc ; state ++ ) {
2524                 U32 base=0;
2525
2526                 /*
2527                 DEBUG_TRIE_COMPILE_MORE_r(
2528                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2529                 );
2530                 */
2531
2532                 if (trie->states[state].trans.list) {
2533                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2534                     U16 maxid=minid;
2535                     U16 idx;
2536
2537                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2538                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2539                         if ( forid < minid ) {
2540                             minid=forid;
2541                         } else if ( forid > maxid ) {
2542                             maxid=forid;
2543                         }
2544                     }
2545                     if ( transcount < tp + maxid - minid + 1) {
2546                         transcount *= 2;
2547                         trie->trans = (reg_trie_trans *)
2548                             PerlMemShared_realloc( trie->trans,
2549                                                      transcount
2550                                                      * sizeof(reg_trie_trans) );
2551                         Zero( trie->trans + (transcount / 2),
2552                               transcount / 2,
2553                               reg_trie_trans );
2554                     }
2555                     base = trie->uniquecharcount + tp - minid;
2556                     if ( maxid == minid ) {
2557                         U32 set = 0;
2558                         for ( ; zp < tp ; zp++ ) {
2559                             if ( ! trie->trans[ zp ].next ) {
2560                                 base = trie->uniquecharcount + zp - minid;
2561                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2562                                                                    1).newstate;
2563                                 trie->trans[ zp ].check = state;
2564                                 set = 1;
2565                                 break;
2566                             }
2567                         }
2568                         if ( !set ) {
2569                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2570                                                                    1).newstate;
2571                             trie->trans[ tp ].check = state;
2572                             tp++;
2573                             zp = tp;
2574                         }
2575                     } else {
2576                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2577                             const U32 tid = base
2578                                            - trie->uniquecharcount
2579                                            + TRIE_LIST_ITEM( state, idx ).forid;
2580                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2581                                                                 idx ).newstate;
2582                             trie->trans[ tid ].check = state;
2583                         }
2584                         tp += ( maxid - minid + 1 );
2585                     }
2586                     Safefree(trie->states[ state ].trans.list);
2587                 }
2588                 /*
2589                 DEBUG_TRIE_COMPILE_MORE_r(
2590                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2591                 );
2592                 */
2593                 trie->states[ state ].trans.base=base;
2594             }
2595             trie->lasttrans = tp + 1;
2596         }
2597     } else {
2598         /*
2599            Second Pass -- Flat Table Representation.
2600
2601            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2602            each.  We know that we will need Charcount+1 trans at most to store
2603            the data (one row per char at worst case) So we preallocate both
2604            structures assuming worst case.
2605
2606            We then construct the trie using only the .next slots of the entry
2607            structs.
2608
2609            We use the .check field of the first entry of the node temporarily
2610            to make compression both faster and easier by keeping track of how
2611            many non zero fields are in the node.
2612
2613            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2614            transition.
2615
2616            There are two terms at use here: state as a TRIE_NODEIDX() which is
2617            a number representing the first entry of the node, and state as a
2618            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2619            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2620            if there are 2 entrys per node. eg:
2621
2622              A B       A B
2623           1. 2 4    1. 3 7
2624           2. 0 3    3. 0 5
2625           3. 0 0    5. 0 0
2626           4. 0 0    7. 0 0
2627
2628            The table is internally in the right hand, idx form. However as we
2629            also have to deal with the states array which is indexed by nodenum
2630            we have to use TRIE_NODENUM() to convert.
2631
2632         */
2633         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2634             "%*sCompiling trie using table compiler\n",
2635             (int)depth * 2 + 2, ""));
2636
2637         trie->trans = (reg_trie_trans *)
2638             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2639                                   * trie->uniquecharcount + 1,
2640                                   sizeof(reg_trie_trans) );
2641         trie->states = (reg_trie_state *)
2642             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2643                                   sizeof(reg_trie_state) );
2644         next_alloc = trie->uniquecharcount + 1;
2645
2646
2647         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2648
2649             regnode *noper   = NEXTOPER( cur );
2650             const U8 *uc     = (U8*)STRING( noper );
2651             const U8 *e      = uc + STR_LEN( noper );
2652
2653             U32 state        = 1;         /* required init */
2654
2655             U16 charid       = 0;         /* sanity init */
2656             U32 accept_state = 0;         /* sanity init */
2657
2658             U32 wordlen      = 0;         /* required init */
2659
2660             if (OP(noper) == NOTHING) {
2661                 regnode *noper_next= regnext(noper);
2662                 if (noper_next != tail && OP(noper_next) == flags) {
2663                     noper = noper_next;
2664                     uc= (U8*)STRING(noper);
2665                     e= uc + STR_LEN(noper);
2666                 }
2667             }
2668
2669             if ( OP(noper) != NOTHING ) {
2670                 for ( ; uc < e ; uc += len ) {
2671
2672                     TRIE_READ_CHAR;
2673
2674                     if ( uvc < 256 ) {
2675                         charid = trie->charmap[ uvc ];
2676                     } else {
2677                         SV* const * const svpp = hv_fetch( widecharmap,
2678                                                            (char*)&uvc,
2679                                                            sizeof( UV ),
2680                                                            0);
2681                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2682                     }
2683                     if ( charid ) {
2684                         charid--;
2685                         if ( !trie->trans[ state + charid ].next ) {
2686                             trie->trans[ state + charid ].next = next_alloc;
2687                             trie->trans[ state ].check++;
2688                             prev_states[TRIE_NODENUM(next_alloc)]
2689                                     = TRIE_NODENUM(state);
2690                             next_alloc += trie->uniquecharcount;
2691                         }
2692                         state = trie->trans[ state + charid ].next;
2693                     } else {
2694                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2695                     }
2696                     /* charid is now 0 if we dont know the char read, or
2697                      * nonzero if we do */
2698                 }
2699             }
2700             accept_state = TRIE_NODENUM( state );
2701             TRIE_HANDLE_WORD(accept_state);
2702
2703         } /* end second pass */
2704
2705         /* and now dump it out before we compress it */
2706         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2707                                                           revcharmap,
2708                                                           next_alloc, depth+1));
2709
2710         {
2711         /*
2712            * Inplace compress the table.*
2713
2714            For sparse data sets the table constructed by the trie algorithm will
2715            be mostly 0/FAIL transitions or to put it another way mostly empty.
2716            (Note that leaf nodes will not contain any transitions.)
2717
2718            This algorithm compresses the tables by eliminating most such
2719            transitions, at the cost of a modest bit of extra work during lookup:
2720
2721            - Each states[] entry contains a .base field which indicates the
2722            index in the state[] array wheres its transition data is stored.
2723
2724            - If .base is 0 there are no valid transitions from that node.
2725
2726            - If .base is nonzero then charid is added to it to find an entry in
2727            the trans array.
2728
2729            -If trans[states[state].base+charid].check!=state then the
2730            transition is taken to be a 0/Fail transition. Thus if there are fail
2731            transitions at the front of the node then the .base offset will point
2732            somewhere inside the previous nodes data (or maybe even into a node
2733            even earlier), but the .check field determines if the transition is
2734            valid.
2735
2736            XXX - wrong maybe?
2737            The following process inplace converts the table to the compressed
2738            table: We first do not compress the root node 1,and mark all its
2739            .check pointers as 1 and set its .base pointer as 1 as well. This
2740            allows us to do a DFA construction from the compressed table later,
2741            and ensures that any .base pointers we calculate later are greater
2742            than 0.
2743
2744            - We set 'pos' to indicate the first entry of the second node.
2745
2746            - We then iterate over the columns of the node, finding the first and
2747            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2748            and set the .check pointers accordingly, and advance pos
2749            appropriately and repreat for the next node. Note that when we copy
2750            the next pointers we have to convert them from the original
2751            NODEIDX form to NODENUM form as the former is not valid post
2752            compression.
2753
2754            - If a node has no transitions used we mark its base as 0 and do not
2755            advance the pos pointer.
2756
2757            - If a node only has one transition we use a second pointer into the
2758            structure to fill in allocated fail transitions from other states.
2759            This pointer is independent of the main pointer and scans forward
2760            looking for null transitions that are allocated to a state. When it
2761            finds one it writes the single transition into the "hole".  If the
2762            pointer doesnt find one the single transition is appended as normal.
2763
2764            - Once compressed we can Renew/realloc the structures to release the
2765            excess space.
2766
2767            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2768            specifically Fig 3.47 and the associated pseudocode.
2769
2770            demq
2771         */
2772         const U32 laststate = TRIE_NODENUM( next_alloc );
2773         U32 state, charid;
2774         U32 pos = 0, zp=0;
2775         trie->statecount = laststate;
2776
2777         for ( state = 1 ; state < laststate ; state++ ) {
2778             U8 flag = 0;
2779             const U32 stateidx = TRIE_NODEIDX( state );
2780             const U32 o_used = trie->trans[ stateidx ].check;
2781             U32 used = trie->trans[ stateidx ].check;
2782             trie->trans[ stateidx ].check = 0;
2783
2784             for ( charid = 0;
2785                   used && charid < trie->uniquecharcount;
2786                   charid++ )
2787             {
2788                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2789                     if ( trie->trans[ stateidx + charid ].next ) {
2790                         if (o_used == 1) {
2791                             for ( ; zp < pos ; zp++ ) {
2792                                 if ( ! trie->trans[ zp ].next ) {
2793                                     break;
2794                                 }
2795                             }
2796                             trie->states[ state ].trans.base
2797                                                     = zp
2798                                                       + trie->uniquecharcount
2799                                                       - charid ;
2800                             trie->trans[ zp ].next
2801                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2802                                                              + charid ].next );
2803                             trie->trans[ zp ].check = state;
2804                             if ( ++zp > pos ) pos = zp;
2805                             break;
2806                         }
2807                         used--;
2808                     }
2809                     if ( !flag ) {
2810                         flag = 1;
2811                         trie->states[ state ].trans.base
2812                                        = pos + trie->uniquecharcount - charid ;
2813                     }
2814                     trie->trans[ pos ].next
2815                         = SAFE_TRIE_NODENUM(
2816                                        trie->trans[ stateidx + charid ].next );
2817                     trie->trans[ pos ].check = state;
2818                     pos++;
2819                 }
2820             }
2821         }
2822         trie->lasttrans = pos + 1;
2823         trie->states = (reg_trie_state *)
2824             PerlMemShared_realloc( trie->states, laststate
2825                                    * sizeof(reg_trie_state) );
2826         DEBUG_TRIE_COMPILE_MORE_r(
2827             PerlIO_printf( Perl_debug_log,
2828                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2829                 (int)depth * 2 + 2,"",
2830                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2831                        + 1 ),
2832                 (IV)next_alloc,
2833                 (IV)pos,
2834                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2835             );
2836
2837         } /* end table compress */
2838     }
2839     DEBUG_TRIE_COMPILE_MORE_r(
2840             PerlIO_printf(Perl_debug_log,
2841                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2842                 (int)depth * 2 + 2, "",
2843                 (UV)trie->statecount,
2844                 (UV)trie->lasttrans)
2845     );
2846     /* resize the trans array to remove unused space */
2847     trie->trans = (reg_trie_trans *)
2848         PerlMemShared_realloc( trie->trans, trie->lasttrans
2849                                * sizeof(reg_trie_trans) );
2850
2851     {   /* Modify the program and insert the new TRIE node */
2852         U8 nodetype =(U8)(flags & 0xFF);
2853         char *str=NULL;
2854
2855 #ifdef DEBUGGING
2856         regnode *optimize = NULL;
2857 #ifdef RE_TRACK_PATTERN_OFFSETS
2858
2859         U32 mjd_offset = 0;
2860         U32 mjd_nodelen = 0;
2861 #endif /* RE_TRACK_PATTERN_OFFSETS */
2862 #endif /* DEBUGGING */
2863         /*
2864            This means we convert either the first branch or the first Exact,
2865            depending on whether the thing following (in 'last') is a branch
2866            or not and whther first is the startbranch (ie is it a sub part of
2867            the alternation or is it the whole thing.)
2868            Assuming its a sub part we convert the EXACT otherwise we convert
2869            the whole branch sequence, including the first.
2870          */
2871         /* Find the node we are going to overwrite */
2872         if ( first != startbranch || OP( last ) == BRANCH ) {
2873             /* branch sub-chain */
2874             NEXT_OFF( first ) = (U16)(last - first);
2875 #ifdef RE_TRACK_PATTERN_OFFSETS
2876             DEBUG_r({
2877                 mjd_offset= Node_Offset((convert));
2878                 mjd_nodelen= Node_Length((convert));
2879             });
2880 #endif
2881             /* whole branch chain */
2882         }
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2884         else {
2885             DEBUG_r({
2886                 const  regnode *nop = NEXTOPER( convert );
2887                 mjd_offset= Node_Offset((nop));
2888                 mjd_nodelen= Node_Length((nop));
2889             });
2890         }
2891         DEBUG_OPTIMISE_r(
2892             PerlIO_printf(Perl_debug_log,
2893                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2894                 (int)depth * 2 + 2, "",
2895                 (UV)mjd_offset, (UV)mjd_nodelen)
2896         );
2897 #endif
2898         /* But first we check to see if there is a common prefix we can
2899            split out as an EXACT and put in front of the TRIE node.  */
2900         trie->startstate= 1;
2901         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2902             U32 state;
2903             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2904                 U32 ofs = 0;
2905                 I32 idx = -1;
2906                 U32 count = 0;
2907                 const U32 base = trie->states[ state ].trans.base;
2908
2909                 if ( trie->states[state].wordnum )
2910                         count = 1;
2911
2912                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2913                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2914                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2915                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2916                     {
2917                         if ( ++count > 1 ) {
2918                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2919                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2920                             if ( state == 1 ) break;
2921                             if ( count == 2 ) {
2922                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2923                                 DEBUG_OPTIMISE_r(
2924                                     PerlIO_printf(Perl_debug_log,
2925                                         "%*sNew Start State=%"UVuf" Class: [",
2926                                         (int)depth * 2 + 2, "",
2927                                         (UV)state));
2928                                 if (idx >= 0) {
2929                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2930                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2931
2932                                     TRIE_BITMAP_SET(trie,*ch);
2933                                     if ( folder )
2934                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2935                                     DEBUG_OPTIMISE_r(
2936                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2937                                     );
2938                                 }
2939                             }
2940                             TRIE_BITMAP_SET(trie,*ch);
2941                             if ( folder )
2942                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2943                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2944                         }
2945                         idx = ofs;
2946                     }
2947                 }
2948                 if ( count == 1 ) {
2949                     SV **tmp = av_fetch( revcharmap, idx, 0);
2950                     STRLEN len;
2951                     char *ch = SvPV( *tmp, len );
2952                     DEBUG_OPTIMISE_r({
2953                         SV *sv=sv_newmortal();
2954                         PerlIO_printf( Perl_debug_log,
2955                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2956                             (int)depth * 2 + 2, "",
2957                             (UV)state, (UV)idx,
2958                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2959                                 PL_colors[0], PL_colors[1],
2960                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2961                                 PERL_PV_ESCAPE_FIRSTCHAR
2962                             )
2963                         );
2964                     });
2965                     if ( state==1 ) {
2966                         OP( convert ) = nodetype;
2967                         str=STRING(convert);
2968                         STR_LEN(convert)=0;
2969                     }
2970                     STR_LEN(convert) += len;
2971                     while (len--)
2972                         *str++ = *ch++;
2973                 } else {
2974 #ifdef DEBUGGING
2975                     if (state>1)
2976                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2977 #endif
2978                     break;
2979                 }
2980             }
2981             trie->prefixlen = (state-1);
2982             if (str) {
2983                 regnode *n = convert+NODE_SZ_STR(convert);
2984                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2985                 trie->startstate = state;
2986                 trie->minlen -= (state - 1);
2987                 trie->maxlen -= (state - 1);
2988 #ifdef DEBUGGING
2989                /* At least the UNICOS C compiler choked on this
2990                 * being argument to DEBUG_r(), so let's just have
2991                 * it right here. */
2992                if (
2993 #ifdef PERL_EXT_RE_BUILD
2994                    1
2995 #else
2996                    DEBUG_r_TEST
2997 #endif
2998                    ) {
2999                    regnode *fix = convert;
3000                    U32 word = trie->wordcount;
3001                    mjd_nodelen++;
3002                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3003                    while( ++fix < n ) {
3004                        Set_Node_Offset_Length(fix, 0, 0);
3005                    }
3006                    while (word--) {
3007                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3008                        if (tmp) {
3009                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3010                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3011                            else
3012                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3013                        }
3014                    }
3015                }
3016 #endif
3017                 if (trie->maxlen) {
3018                     convert = n;
3019                 } else {
3020                     NEXT_OFF(convert) = (U16)(tail - convert);
3021                     DEBUG_r(optimize= n);
3022                 }
3023             }
3024         }
3025         if (!jumper)
3026             jumper = last;
3027         if ( trie->maxlen ) {
3028             NEXT_OFF( convert ) = (U16)(tail - convert);
3029             ARG_SET( convert, data_slot );
3030             /* Store the offset to the first unabsorbed branch in
3031                jump[0], which is otherwise unused by the jump logic.
3032                We use this when dumping a trie and during optimisation. */
3033             if (trie->jump)
3034                 trie->jump[0] = (U16)(nextbranch - convert);
3035
3036             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3037              *   and there is a bitmap
3038              *   and the first "jump target" node we found leaves enough room
3039              * then convert the TRIE node into a TRIEC node, with the bitmap
3040              * embedded inline in the opcode - this is hypothetically faster.
3041              */
3042             if ( !trie->states[trie->startstate].wordnum
3043                  && trie->bitmap
3044                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3045             {
3046                 OP( convert ) = TRIEC;
3047                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3048                 PerlMemShared_free(trie->bitmap);
3049                 trie->bitmap= NULL;
3050             } else
3051                 OP( convert ) = TRIE;
3052
3053             /* store the type in the flags */
3054             convert->flags = nodetype;
3055             DEBUG_r({
3056             optimize = convert
3057                       + NODE_STEP_REGNODE
3058                       + regarglen[ OP( convert ) ];
3059             });
3060             /* XXX We really should free up the resource in trie now,
3061                    as we won't use them - (which resources?) dmq */
3062         }
3063         /* needed for dumping*/
3064         DEBUG_r(if (optimize) {
3065             regnode *opt = convert;
3066
3067             while ( ++opt < optimize) {
3068                 Set_Node_Offset_Length(opt,0,0);
3069             }
3070             /*
3071                 Try to clean up some of the debris left after the
3072                 optimisation.
3073              */
3074             while( optimize < jumper ) {
3075                 mjd_nodelen += Node_Length((optimize));
3076                 OP( optimize ) = OPTIMIZED;
3077                 Set_Node_Offset_Length(optimize,0,0);
3078                 optimize++;
3079             }
3080             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3081         });
3082     } /* end node insert */
3083
3084     /*  Finish populating the prev field of the wordinfo array.  Walk back
3085      *  from each accept state until we find another accept state, and if
3086      *  so, point the first word's .prev field at the second word. If the
3087      *  second already has a .prev field set, stop now. This will be the
3088      *  case either if we've already processed that word's accept state,
3089      *  or that state had multiple words, and the overspill words were
3090      *  already linked up earlier.
3091      */
3092     {
3093         U16 word;
3094         U32 state;
3095         U16 prev;
3096
3097         for (word=1; word <= trie->wordcount; word++) {
3098             prev = 0;
3099             if (trie->wordinfo[word].prev)
3100                 continue;
3101             state = trie->wordinfo[word].accept;
3102             while (state) {
3103                 state = prev_states[state];
3104                 if (!state)
3105                     break;
3106                 prev = trie->states[state].wordnum;
3107                 if (prev)
3108                     break;
3109             }
3110             trie->wordinfo[word].prev = prev;
3111         }
3112         Safefree(prev_states);
3113     }
3114
3115
3116     /* and now dump out the compressed format */
3117     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3118
3119     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3120 #ifdef DEBUGGING
3121     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3122     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3123 #else
3124     SvREFCNT_dec_NN(revcharmap);
3125 #endif
3126     return trie->jump
3127            ? MADE_JUMP_TRIE
3128            : trie->startstate>1
3129              ? MADE_EXACT_TRIE
3130              : MADE_TRIE;
3131 }
3132
3133 STATIC regnode *
3134 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3135 {
3136 /* The Trie is constructed and compressed now so we can build a fail array if
3137  * it's needed
3138
3139    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3140    3.32 in the
3141    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3142    Ullman 1985/88
3143    ISBN 0-201-10088-6
3144
3145    We find the fail state for each state in the trie, this state is the longest
3146    proper suffix of the current state's 'word' that is also a proper prefix of
3147    another word in our trie. State 1 represents the word '' and is thus the
3148    default fail state. This allows the DFA not to have to restart after its
3149    tried and failed a word at a given point, it simply continues as though it
3150    had been matching the other word in the first place.
3151    Consider
3152       'abcdgu'=~/abcdefg|cdgu/
3153    When we get to 'd' we are still matching the first word, we would encounter
3154    'g' which would fail, which would bring us to the state representing 'd' in
3155    the second word where we would try 'g' and succeed, proceeding to match
3156    'cdgu'.
3157  */
3158  /* add a fail transition */
3159     const U32 trie_offset = ARG(source);
3160     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3161     U32 *q;
3162     const U32 ucharcount = trie->uniquecharcount;
3163     const U32 numstates = trie->statecount;
3164     const U32 ubound = trie->lasttrans + ucharcount;
3165     U32 q_read = 0;
3166     U32 q_write = 0;
3167     U32 charid;
3168     U32 base = trie->states[ 1 ].trans.base;
3169     U32 *fail;
3170     reg_ac_data *aho;
3171     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3172     regnode *stclass;
3173     GET_RE_DEBUG_FLAGS_DECL;
3174
3175     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3176     PERL_UNUSED_CONTEXT;
3177 #ifndef DEBUGGING
3178     PERL_UNUSED_ARG(depth);
3179 #endif
3180
3181     if ( OP(source) == TRIE ) {
3182         struct regnode_1 *op = (struct regnode_1 *)
3183             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3184         StructCopy(source,op,struct regnode_1);
3185         stclass = (regnode *)op;
3186     } else {
3187         struct regnode_charclass *op = (struct regnode_charclass *)
3188             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3189         StructCopy(source,op,struct regnode_charclass);
3190         stclass = (regnode *)op;
3191     }
3192     OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3193
3194     ARG_SET( stclass, data_slot );
3195     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3196     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3197     aho->trie=trie_offset;
3198     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3199     Copy( trie->states, aho->states, numstates, reg_trie_state );
3200     Newxz( q, numstates, U32);
3201     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3202     aho->refcount = 1;
3203     fail = aho->fail;
3204     /* initialize fail[0..1] to be 1 so that we always have
3205        a valid final fail state */
3206     fail[ 0 ] = fail[ 1 ] = 1;
3207
3208     for ( charid = 0; charid < ucharcount ; charid++ ) {
3209         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3210         if ( newstate ) {
3211             q[ q_write ] = newstate;
3212             /* set to point at the root */
3213             fail[ q[ q_write++ ] ]=1;
3214         }
3215     }
3216     while ( q_read < q_write) {
3217         const U32 cur = q[ q_read++ % numstates ];
3218         base = trie->states[ cur ].trans.base;
3219
3220         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3221             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3222             if (ch_state) {
3223                 U32 fail_state = cur;
3224                 U32 fail_base;
3225                 do {
3226                     fail_state = fail[ fail_state ];
3227                     fail_base = aho->states[ fail_state ].trans.base;
3228                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3229
3230                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3231                 fail[ ch_state ] = fail_state;
3232                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3233                 {
3234                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3235                 }
3236                 q[ q_write++ % numstates] = ch_state;
3237             }
3238         }
3239     }
3240     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3241        when we fail in state 1, this allows us to use the
3242        charclass scan to find a valid start char. This is based on the principle
3243        that theres a good chance the string being searched contains lots of stuff
3244        that cant be a start char.
3245      */
3246     fail[ 0 ] = fail[ 1 ] = 0;
3247     DEBUG_TRIE_COMPILE_r({
3248         PerlIO_printf(Perl_debug_log,
3249                       "%*sStclass Failtable (%"UVuf" states): 0",
3250                       (int)(depth * 2), "", (UV)numstates
3251         );
3252         for( q_read=1; q_read<numstates; q_read++ ) {
3253             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3254         }
3255         PerlIO_printf(Perl_debug_log, "\n");
3256     });
3257     Safefree(q);
3258     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3259     return stclass;
3260 }
3261
3262
3263 #define DEBUG_PEEP(str,scan,depth) \
3264     DEBUG_OPTIMISE_r({if (scan){ \
3265        regnode *Next = regnext(scan); \
3266        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3267        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3268            (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3269            Next ? (REG_NODE_NUM(Next)) : 0 ); \
3270        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3271        PerlIO_printf(Perl_debug_log, "\n"); \
3272    }});
3273
3274 /* The below joins as many adjacent EXACTish nodes as possible into a single
3275  * one.  The regop may be changed if the node(s) contain certain sequences that
3276  * require special handling.  The joining is only done if:
3277  * 1) there is room in the current conglomerated node to entirely contain the
3278  *    next one.
3279  * 2) they are the exact same node type
3280  *
3281  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3282  * these get optimized out
3283  *
3284  * If a node is to match under /i (folded), the number of characters it matches
3285  * can be different than its character length if it contains a multi-character
3286  * fold.  *min_subtract is set to the total delta number of characters of the
3287  * input nodes.
3288  *
3289  * And *unfolded_multi_char is set to indicate whether or not the node contains
3290  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3291  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3292  * SMALL LETTER SHARP S, as only if the target string being matched against
3293  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3294  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3295  * whose components are all above the Latin1 range are not run-time locale
3296  * dependent, and have already been folded by the time this function is
3297  * called.)
3298  *
3299  * This is as good a place as any to discuss the design of handling these
3300  * multi-character fold sequences.  It's been wrong in Perl for a very long
3301  * time.  There are three code points in Unicode whose multi-character folds
3302  * were long ago discovered to mess things up.  The previous designs for
3303  * dealing with these involved assigning a special node for them.  This
3304  * approach doesn't always work, as evidenced by this example:
3305  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3306  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3307  * would match just the \xDF, it won't be able to handle the case where a
3308  * successful match would have to cross the node's boundary.  The new approach
3309  * that hopefully generally solves the problem generates an EXACTFU_SS node
3310  * that is "sss" in this case.
3311  *
3312  * It turns out that there are problems with all multi-character folds, and not
3313  * just these three.  Now the code is general, for all such cases.  The
3314  * approach taken is:
3315  * 1)   This routine examines each EXACTFish node that could contain multi-
3316  *      character folded sequences.  Since a single character can fold into
3317  *      such a sequence, the minimum match length for this node is less than
3318  *      the number of characters in the node.  This routine returns in
3319  *      *min_subtract how many characters to subtract from the the actual
3320  *      length of the string to get a real minimum match length; it is 0 if
3321  *      there are no multi-char foldeds.  This delta is used by the caller to
3322  *      adjust the min length of the match, and the delta between min and max,
3323  *      so that the optimizer doesn't reject these possibilities based on size
3324  *      constraints.
3325  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3326  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3327  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3328  *      there is a possible fold length change.  That means that a regular
3329  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3330  *      with length changes, and so can be processed faster.  regexec.c takes
3331  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3332  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3333  *      known until runtime).  This saves effort in regex matching.  However,
3334  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3335  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3336  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3337  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3338  *      possibilities for the non-UTF8 patterns are quite simple, except for
3339  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3340  *      members of a fold-pair, and arrays are set up for all of them so that
3341  *      the other member of the pair can be found quickly.  Code elsewhere in
3342  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3343  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3344  *      described in the next item.
3345  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3346  *      validity of the fold won't be known until runtime, and so must remain
3347  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3348  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3349  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3350  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3351  *      The reason this is a problem is that the optimizer part of regexec.c
3352  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3353  *      that a character in the pattern corresponds to at most a single
3354  *      character in the target string.  (And I do mean character, and not byte
3355  *      here, unlike other parts of the documentation that have never been
3356  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3357  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3358  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3359  *      nodes, violate the assumption, and they are the only instances where it
3360  *      is violated.  I'm reluctant to try to change the assumption, as the
3361  *      code involved is impenetrable to me (khw), so instead the code here
3362  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3363  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3364  *      boolean indicating whether or not the node contains such a fold.  When
3365  *      it is true, the caller sets a flag that later causes the optimizer in
3366  *      this file to not set values for the floating and fixed string lengths,
3367  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3368  *      assumption.  Thus, there is no optimization based on string lengths for
3369  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3370  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3371  *      assumption is wrong only in these cases is that all other non-UTF-8
3372  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3373  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3374  *      EXACTF nodes because we don't know at compile time if it actually
3375  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3376  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3377  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3378  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3379  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3380  *      string would require the pattern to be forced into UTF-8, the overhead
3381  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3382  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3383  *      locale.)
3384  *
3385  *      Similarly, the code that generates tries doesn't currently handle
3386  *      not-already-folded multi-char folds, and it looks like a pain to change
3387  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3388  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3389  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3390  *      using /iaa matching will be doing so almost entirely with ASCII
3391  *      strings, so this should rarely be encountered in practice */
3392
3393 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3394     if (PL_regkind[OP(scan)] == EXACT) \
3395         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3396
3397 STATIC U32
3398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3399                    UV *min_subtract, bool *unfolded_multi_char,
3400                    U32 flags,regnode *val, U32 depth)
3401 {
3402     /* Merge several consecutive EXACTish nodes into one. */
3403     regnode *n = regnext(scan);
3404     U32 stringok = 1;
3405     regnode *next = scan + NODE_SZ_STR(scan);
3406     U32 merged = 0;
3407     U32 stopnow = 0;
3408 #ifdef DEBUGGING
3409     regnode *stop = scan;
3410     GET_RE_DEBUG_FLAGS_DECL;
3411 #else
3412     PERL_UNUSED_ARG(depth);
3413 #endif
3414
3415     PERL_ARGS_ASSERT_JOIN_EXACT;
3416 #ifndef EXPERIMENTAL_INPLACESCAN
3417     PERL_UNUSED_ARG(flags);
3418     PERL_UNUSED_ARG(val);
3419 #endif
3420     DEBUG_PEEP("join",scan,depth);
3421
3422     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3423      * EXACT ones that are mergeable to the current one. */
3424     while (n
3425            && (PL_regkind[OP(n)] == NOTHING
3426                || (stringok && OP(n) == OP(scan)))
3427            && NEXT_OFF(n)
3428            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3429     {
3430
3431         if (OP(n) == TAIL || n > next)
3432             stringok = 0;
3433         if (PL_regkind[OP(n)] == NOTHING) {
3434             DEBUG_PEEP("skip:",n,depth);
3435             NEXT_OFF(scan) += NEXT_OFF(n);
3436             next = n + NODE_STEP_REGNODE;
3437 #ifdef DEBUGGING
3438             if (stringok)
3439                 stop = n;
3440 #endif
3441             n = regnext(n);
3442         }
3443         else if (stringok) {
3444             const unsigned int oldl = STR_LEN(scan);
3445             regnode * const nnext = regnext(n);
3446
3447             /* XXX I (khw) kind of doubt that this works on platforms (should
3448              * Perl ever run on one) where U8_MAX is above 255 because of lots
3449              * of other assumptions */
3450             /* Don't join if the sum can't fit into a single node */
3451             if (oldl + STR_LEN(n) > U8_MAX)
3452                 break;
3453
3454             DEBUG_PEEP("merg",n,depth);
3455             merged++;
3456
3457             NEXT_OFF(scan) += NEXT_OFF(n);
3458             STR_LEN(scan) += STR_LEN(n);
3459             next = n + NODE_SZ_STR(n);
3460             /* Now we can overwrite *n : */
3461             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3462 #ifdef DEBUGGING
3463             stop = next - 1;
3464 #endif
3465             n = nnext;
3466             if (stopnow) break;
3467         }
3468
3469 #ifdef EXPERIMENTAL_INPLACESCAN
3470         if (flags && !NEXT_OFF(n)) {
3471             DEBUG_PEEP("atch", val, depth);
3472             if (reg_off_by_arg[OP(n)]) {
3473                 ARG_SET(n, val - n);
3474             }
3475             else {
3476                 NEXT_OFF(n) = val - n;
3477             }
3478             stopnow = 1;
3479         }
3480 #endif
3481     }
3482
3483     *min_subtract = 0;
3484     *unfolded_multi_char = FALSE;
3485
3486     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3487      * can now analyze for sequences of problematic code points.  (Prior to
3488      * this final joining, sequences could have been split over boundaries, and
3489      * hence missed).  The sequences only happen in folding, hence for any
3490      * non-EXACT EXACTish node */
3491     if (OP(scan) != EXACT) {
3492         U8* s0 = (U8*) STRING(scan);
3493         U8* s = s0;
3494         U8* s_end = s0 + STR_LEN(scan);
3495
3496         int total_count_delta = 0;  /* Total delta number of characters that
3497                                        multi-char folds expand to */
3498
3499         /* One pass is made over the node's string looking for all the
3500          * possibilities.  To avoid some tests in the loop, there are two main
3501          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3502          * non-UTF-8 */
3503         if (UTF) {
3504             U8* folded = NULL;
3505
3506             if (OP(scan) == EXACTFL) {
3507                 U8 *d;
3508
3509                 /* An EXACTFL node would already have been changed to another
3510                  * node type unless there is at least one character in it that
3511                  * is problematic; likely a character whose fold definition
3512                  * won't be known until runtime, and so has yet to be folded.
3513                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3514                  * to handle the UTF-8 case, we need to create a temporary
3515                  * folded copy using UTF-8 locale rules in order to analyze it.
3516                  * This is because our macros that look to see if a sequence is
3517                  * a multi-char fold assume everything is folded (otherwise the
3518                  * tests in those macros would be too complicated and slow).
3519                  * Note that here, the non-problematic folds will have already
3520                  * been done, so we can just copy such characters.  We actually
3521                  * don't completely fold the EXACTFL string.  We skip the
3522                  * unfolded multi-char folds, as that would just create work
3523                  * below to figure out the size they already are */
3524
3525                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3526                 d = folded;
3527                 while (s < s_end) {
3528                     STRLEN s_len = UTF8SKIP(s);
3529                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3530                         Copy(s, d, s_len, U8);
3531                         d += s_len;
3532                     }
3533                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3534                         *unfolded_multi_char = TRUE;
3535                         Copy(s, d, s_len, U8);
3536                         d += s_len;
3537                     }
3538                     else if (isASCII(*s)) {
3539                         *(d++) = toFOLD(*s);
3540                     }
3541                     else {
3542                         STRLEN len;
3543                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3544                         d += len;
3545                     }
3546                     s += s_len;
3547                 }
3548
3549                 /* Point the remainder of the routine to look at our temporary
3550                  * folded copy */
3551                 s = folded;
3552                 s_end = d;
3553             } /* End of creating folded copy of EXACTFL string */
3554
3555             /* Examine the string for a multi-character fold sequence.  UTF-8
3556              * patterns have all characters pre-folded by the time this code is
3557              * executed */
3558             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3559                                      length sequence we are looking for is 2 */
3560             {
3561                 int count = 0;  /* How many characters in a multi-char fold */
3562                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3563                 if (! len) {    /* Not a multi-char fold: get next char */
3564                     s += UTF8SKIP(s);
3565                     continue;
3566                 }
3567
3568                 /* Nodes with 'ss' require special handling, except for
3569                  * EXACTFA-ish for which there is no multi-char fold to this */
3570                 if (len == 2 && *s == 's' && *(s+1) == 's'
3571                     && OP(scan) != EXACTFA
3572                     && OP(scan) != EXACTFA_NO_TRIE)
3573                 {
3574                     count = 2;
3575                     if (OP(scan) != EXACTFL) {
3576                         OP(scan) = EXACTFU_SS;
3577                     }
3578                     s += 2;
3579                 }
3580                 else { /* Here is a generic multi-char fold. */
3581                     U8* multi_end  = s + len;
3582
3583                     /* Count how many characters are in it.  In the case of
3584                      * /aa, no folds which contain ASCII code points are
3585                      * allowed, so check for those, and skip if found. */
3586                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3587                         count = utf8_length(s, multi_end);
3588                         s = multi_end;
3589                     }
3590                     else {
3591                         while (s < multi_end) {
3592                             if (isASCII(*s)) {
3593                                 s++;
3594                                 goto next_iteration;
3595                             }
3596                             else {
3597                                 s += UTF8SKIP(s);
3598                             }
3599                             count++;
3600                         }
3601                     }
3602                 }
3603
3604                 /* The delta is how long the sequence is minus 1 (1 is how long
3605                  * the character that folds to the sequence is) */
3606                 total_count_delta += count - 1;
3607               next_iteration: ;
3608             }
3609
3610             /* We created a temporary folded copy of the string in EXACTFL
3611              * nodes.  Therefore we need to be sure it doesn't go below zero,
3612              * as the real string could be shorter */
3613             if (OP(scan) == EXACTFL) {
3614                 int total_chars = utf8_length((U8*) STRING(scan),
3615                                            (U8*) STRING(scan) + STR_LEN(scan));
3616                 if (total_count_delta > total_chars) {
3617                     total_count_delta = total_chars;
3618                 }
3619             }
3620
3621             *min_subtract += total_count_delta;
3622             Safefree(folded);
3623         }
3624         else if (OP(scan) == EXACTFA) {
3625
3626             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3627              * fold to the ASCII range (and there are no existing ones in the
3628              * upper latin1 range).  But, as outlined in the comments preceding
3629              * this function, we need to flag any occurrences of the sharp s.
3630              * This character forbids trie formation (because of added
3631              * complexity) */
3632             while (s < s_end) {
3633                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3634                     OP(scan) = EXACTFA_NO_TRIE;
3635                     *unfolded_multi_char = TRUE;
3636                     break;
3637                 }
3638                 s++;
3639                 continue;
3640             }
3641         }
3642         else {
3643
3644             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3645              * folds that are all Latin1.  As explained in the comments
3646              * preceding this function, we look also for the sharp s in EXACTF
3647              * and EXACTFL nodes; it can be in the final position.  Otherwise
3648              * we can stop looking 1 byte earlier because have to find at least
3649              * two characters for a multi-fold */
3650             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3651                               ? s_end
3652                               : s_end -1;
3653
3654             while (s < upper) {
3655                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3656                 if (! len) {    /* Not a multi-char fold. */
3657                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3658                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3659                     {
3660                         *unfolded_multi_char = TRUE;
3661                     }
3662                     s++;
3663                     continue;
3664                 }
3665
3666                 if (len == 2
3667                     && isALPHA_FOLD_EQ(*s, 's')
3668                     && isALPHA_FOLD_EQ(*(s+1), 's'))
3669                 {
3670
3671                     /* EXACTF nodes need to know that the minimum length
3672                      * changed so that a sharp s in the string can match this
3673                      * ss in the pattern, but they remain EXACTF nodes, as they
3674                      * won't match this unless the target string is is UTF-8,
3675                      * which we don't know until runtime.  EXACTFL nodes can't
3676                      * transform into EXACTFU nodes */
3677                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3678                         OP(scan) = EXACTFU_SS;
3679                     }
3680                 }
3681
3682                 *min_subtract += len - 1;
3683                 s += len;
3684             }
3685         }
3686     }
3687
3688 #ifdef DEBUGGING
3689     /* Allow dumping but overwriting the collection of skipped
3690      * ops and/or strings with fake optimized ops */
3691     n = scan + NODE_SZ_STR(scan);
3692     while (n <= stop) {
3693         OP(n) = OPTIMIZED;
3694         FLAGS(n) = 0;
3695         NEXT_OFF(n) = 0;
3696         n++;
3697     }
3698 #endif
3699     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3700     return stopnow;
3701 }
3702
3703 /* REx optimizer.  Converts nodes into quicker variants "in place".
3704    Finds fixed substrings.  */
3705
3706 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3707    to the position after last scanned or to NULL. */
3708
3709 #define INIT_AND_WITHP \
3710     assert(!and_withp); \
3711     Newx(and_withp,1, regnode_ssc); \
3712     SAVEFREEPV(and_withp)
3713
3714
3715 static void
3716 S_unwind_scan_frames(pTHX_ const void *p)
3717 {
3718     scan_frame *f= (scan_frame *)p;
3719     do {
3720         scan_frame *n= f->next_frame;
3721         Safefree(f);
3722         f= n;
3723     } while (f);
3724 }
3725
3726
3727 STATIC SSize_t
3728 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3729                         SSize_t *minlenp, SSize_t *deltap,
3730                         regnode *last,
3731                         scan_data_t *data,
3732                         I32 stopparen,
3733                         U32 recursed_depth,
3734                         regnode_ssc *and_withp,
3735                         U32 flags, U32 depth)
3736                         /* scanp: Start here (read-write). */
3737                         /* deltap: Write maxlen-minlen here. */
3738                         /* last: Stop before this one. */
3739                         /* data: string data about the pattern */
3740                         /* stopparen: treat close N as END */
3741                         /* recursed: which subroutines have we recursed into */
3742                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3743 {
3744     /* There must be at least this number of characters to match */
3745     SSize_t min = 0;
3746     I32 pars = 0, code;
3747     regnode *scan = *scanp, *next;
3748     SSize_t delta = 0;
3749     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3750     int is_inf_internal = 0;            /* The studied chunk is infinite */
3751     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3752     scan_data_t data_fake;
3753     SV *re_trie_maxbuff = NULL;
3754     regnode *first_non_open = scan;
3755     SSize_t stopmin = SSize_t_MAX;
3756     scan_frame *frame = NULL;
3757     GET_RE_DEBUG_FLAGS_DECL;
3758
3759     PERL_ARGS_ASSERT_STUDY_CHUNK;
3760
3761
3762     if ( depth == 0 ) {
3763         while (first_non_open && OP(first_non_open) == OPEN)
3764             first_non_open=regnext(first_non_open);
3765     }
3766
3767
3768   fake_study_recurse:
3769     DEBUG_r(
3770         RExC_study_chunk_recursed_count++;
3771     );
3772     DEBUG_OPTIMISE_MORE_r(
3773     {
3774         PerlIO_printf(Perl_debug_log,
3775             "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3776             (int)(depth*2), "", (long)stopparen,
3777             (unsigned long)RExC_study_chunk_recursed_count,
3778             (unsigned long)depth, (unsigned long)recursed_depth,
3779             scan,
3780             last);
3781         if (recursed_depth) {
3782             U32 i;
3783             U32 j;
3784             for ( j = 0 ; j < recursed_depth ; j++ ) {
3785                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3786                     if (
3787                         PAREN_TEST(RExC_study_chunk_recursed +
3788                                    ( j * RExC_study_chunk_recursed_bytes), i )
3789                         && (
3790                             !j ||
3791                             !PAREN_TEST(RExC_study_chunk_recursed +
3792                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3793                         )
3794                     ) {
3795                         PerlIO_printf(Perl_debug_log," %d",(int)i);
3796                         break;
3797                     }
3798                 }
3799                 if ( j + 1 < recursed_depth ) {
3800                     PerlIO_printf(Perl_debug_log, ",");
3801                 }
3802             }
3803         }
3804         PerlIO_printf(Perl_debug_log,"\n");
3805     }
3806     );
3807     while ( scan && OP(scan) != END && scan < last ){
3808         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3809                                    node length to get a real minimum (because
3810                                    the folded version may be shorter) */
3811         bool unfolded_multi_char = FALSE;
3812         /* Peephole optimizer: */
3813         DEBUG_STUDYDATA("Peep:", data, depth);
3814         DEBUG_PEEP("Peep", scan, depth);
3815
3816
3817         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3818          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3819          * by a different invocation of reg() -- Yves
3820          */
3821         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3822
3823         /* Follow the next-chain of the current node and optimize
3824            away all the NOTHINGs from it.  */
3825         if (OP(scan) != CURLYX) {
3826             const int max = (reg_off_by_arg[OP(scan)]
3827                        ? I32_MAX
3828                        /* I32 may be smaller than U16 on CRAYs! */
3829                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3830             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3831             int noff;
3832             regnode *n = scan;
3833
3834             /* Skip NOTHING and LONGJMP. */
3835             while ((n = regnext(n))
3836                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3837                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3838                    && off + noff < max)
3839                 off += noff;
3840             if (reg_off_by_arg[OP(scan)])
3841                 ARG(scan) = off;
3842             else
3843                 NEXT_OFF(scan) = off;
3844         }
3845
3846         /* The principal pseudo-switch.  Cannot be a switch, since we
3847            look into several different things.  */
3848         if ( OP(scan) == DEFINEP ) {
3849             SSize_t minlen = 0;
3850             SSize_t deltanext = 0;
3851             SSize_t fake_last_close = 0;
3852             I32 f = SCF_IN_DEFINE;
3853
3854             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3855             scan = regnext(scan);
3856             assert( OP(scan) == IFTHEN );
3857             DEBUG_PEEP("expect IFTHEN", scan, depth);
3858
3859             data_fake.last_closep= &fake_last_close;
3860             minlen = *minlenp;
3861             next = regnext(scan);
3862             scan = NEXTOPER(NEXTOPER(scan));
3863             DEBUG_PEEP("scan", scan, depth);
3864             DEBUG_PEEP("next", next, depth);
3865
3866             /* we suppose the run is continuous, last=next...
3867              * NOTE we dont use the return here! */
3868             (void)study_chunk(pRExC_state, &scan, &minlen,
3869                               &deltanext, next, &data_fake, stopparen,
3870                               recursed_depth, NULL, f, depth+1);
3871
3872             scan = next;
3873         } else
3874         if (
3875             OP(scan) == BRANCH  ||
3876             OP(scan) == BRANCHJ ||
3877             OP(scan) == IFTHEN
3878         ) {
3879             next = regnext(scan);
3880             code = OP(scan);
3881
3882             /* The op(next)==code check below is to see if we
3883              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3884              * IFTHEN is special as it might not appear in pairs.
3885              * Not sure whether BRANCH-BRANCHJ is possible, regardless
3886              * we dont handle it cleanly. */
3887             if (OP(next) == code || code == IFTHEN) {
3888                 /* NOTE - There is similar code to this block below for
3889                  * handling TRIE nodes on a re-study.  If you change stuff here
3890                  * check there too. */
3891                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3892                 regnode_ssc accum;
3893                 regnode * const startbranch=scan;
3894
3895                 if (flags & SCF_DO_SUBSTR) {
3896                     /* Cannot merge strings after this. */
3897                     scan_commit(pRExC_state, data, minlenp, is_inf);
3898                 }
3899
3900                 if (flags & SCF_DO_STCLASS)
3901                     ssc_init_zero(pRExC_state, &accum);
3902
3903                 while (OP(scan) == code) {
3904                     SSize_t deltanext, minnext, fake;
3905                     I32 f = 0;
3906                     regnode_ssc this_class;
3907
3908                     DEBUG_PEEP("Branch", scan, depth);
3909
3910                     num++;
3911                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3912                     if (data) {
3913                         data_fake.whilem_c = data->whilem_c;
3914                         data_fake.last_closep = data->last_closep;
3915                     }
3916                     else
3917                         data_fake.last_closep = &fake;
3918
3919                     data_fake.pos_delta = delta;
3920                     next = regnext(scan);
3921
3922                     scan = NEXTOPER(scan); /* everything */
3923                     if (code != BRANCH)    /* everything but BRANCH */
3924                         scan = NEXTOPER(scan);
3925
3926                     if (flags & SCF_DO_STCLASS) {
3927                         ssc_init(pRExC_state, &this_class);
3928                         data_fake.start_class = &this_class;
3929                         f = SCF_DO_STCLASS_AND;
3930                     }
3931                     if (flags & SCF_WHILEM_VISITED_POS)
3932                         f |= SCF_WHILEM_VISITED_POS;
3933
3934                     /* we suppose the run is continuous, last=next...*/
3935                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3936                                       &deltanext, next, &data_fake, stopparen,
3937                                       recursed_depth, NULL, f,depth+1);
3938
3939                     if (min1 > minnext)
3940                         min1 = minnext;
3941                     if (deltanext == SSize_t_MAX) {
3942                         is_inf = is_inf_internal = 1;
3943                         max1 = SSize_t_MAX;
3944                     } else if (max1 < minnext + deltanext)
3945                         max1 = minnext + deltanext;
3946                     scan = next;
3947                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3948                         pars++;
3949                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3950                         if ( stopmin > minnext)
3951                             stopmin = min + min1;
3952                         flags &= ~SCF_DO_SUBSTR;
3953                         if (data)
3954                             data->flags |= SCF_SEEN_ACCEPT;
3955                     }
3956                     if (data) {
3957                         if (data_fake.flags & SF_HAS_EVAL)
3958                             data->flags |= SF_HAS_EVAL;
3959                         data->whilem_c = data_fake.whilem_c;
3960                     }
3961                     if (flags & SCF_DO_STCLASS)
3962                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3963                 }
3964                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3965                     min1 = 0;
3966                 if (flags & SCF_DO_SUBSTR) {
3967                     data->pos_min += min1;
3968                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3969                         data->pos_delta = SSize_t_MAX;
3970                     else
3971                         data->pos_delta += max1 - min1;
3972                     if (max1 != min1 || is_inf)
3973                         data->longest = &(data->longest_float);
3974                 }
3975                 min += min1;
3976                 if (delta == SSize_t_MAX
3977                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3978                     delta = SSize_t_MAX;
3979                 else
3980                     delta += max1 - min1;
3981                 if (flags & SCF_DO_STCLASS_OR) {
3982                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3983                     if (min1) {
3984                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3985                         flags &= ~SCF_DO_STCLASS;
3986                     }
3987                 }
3988                 else if (flags & SCF_DO_STCLASS_AND) {
3989                     if (min1) {
3990                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3991                         flags &= ~SCF_DO_STCLASS;
3992                     }
3993                     else {
3994                         /* Switch to OR mode: cache the old value of
3995                          * data->start_class */
3996                         INIT_AND_WITHP;
3997                         StructCopy(data->start_class, and_withp, regnode_ssc);
3998                         flags &= ~SCF_DO_STCLASS_AND;
3999                         StructCopy(&accum, data->start_class, regnode_ssc);
4000                         flags |= SCF_DO_STCLASS_OR;
4001                     }
4002                 }
4003
4004                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4005                         OP( startbranch ) == BRANCH )
4006                 {
4007                 /* demq.
4008
4009                    Assuming this was/is a branch we are dealing with: 'scan'
4010                    now points at the item that follows the branch sequence,
4011                    whatever it is. We now start at the beginning of the
4012                    sequence and look for subsequences of
4013
4014                    BRANCH->EXACT=>x1
4015                    BRANCH->EXACT=>x2
4016                    tail
4017
4018                    which would be constructed from a pattern like
4019                    /A|LIST|OF|WORDS/
4020
4021                    If we can find such a subsequence we need to turn the first
4022                    element into a trie and then add the subsequent branch exact
4023                    strings to the trie.
4024
4025                    We have two cases
4026
4027                      1. patterns where the whole set of branches can be
4028                         converted.
4029
4030                      2. patterns where only a subset can be converted.
4031
4032                    In case 1 we can replace the whole set with a single regop
4033                    for the trie. In case 2 we need to keep the start and end
4034                    branches so
4035
4036                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4037                      becomes BRANCH TRIE; BRANCH X;
4038
4039                   There is an additional case, that being where there is a
4040                   common prefix, which gets split out into an EXACT like node
4041                   preceding the TRIE node.
4042
4043                   If x(1..n)==tail then we can do a simple trie, if not we make
4044                   a "jump" trie, such that when we match the appropriate word
4045                   we "jump" to the appropriate tail node. Essentially we turn
4046                   a nested if into a case structure of sorts.
4047
4048                 */
4049
4050                     int made=0;
4051                     if (!re_trie_maxbuff) {
4052                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4053                         if (!SvIOK(re_trie_maxbuff))
4054                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4055                     }
4056                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4057                         regnode *cur;
4058                         regnode *first = (regnode *)NULL;
4059                         regnode *last = (regnode *)NULL;
4060                         regnode *tail = scan;
4061                         U8 trietype = 0;
4062                         U32 count=0;
4063
4064                         /* var tail is used because there may be a TAIL
4065                            regop in the way. Ie, the exacts will point to the
4066                            thing following the TAIL, but the last branch will
4067                            point at the TAIL. So we advance tail. If we
4068                            have nested (?:) we may have to move through several
4069                            tails.
4070                          */
4071
4072                         while ( OP( tail ) == TAIL ) {
4073                             /* this is the TAIL generated by (?:) */
4074                             tail = regnext( tail );
4075                         }
4076
4077
4078                         DEBUG_TRIE_COMPILE_r({
4079                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4080                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4081                               (int)depth * 2 + 2, "",
4082                               "Looking for TRIE'able sequences. Tail node is: ",
4083                               SvPV_nolen_const( RExC_mysv )
4084                             );
4085                         });
4086
4087                         /*
4088
4089                             Step through the branches
4090                                 cur represents each branch,
4091                                 noper is the first thing to be matched as part
4092                                       of that branch
4093                                 noper_next is the regnext() of that node.
4094
4095                             We normally handle a case like this
4096                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4097                             support building with NOJUMPTRIE, which restricts
4098                             the trie logic to structures like /FOO|BAR/.
4099
4100                             If noper is a trieable nodetype then the branch is
4101                             a possible optimization target. If we are building
4102                             under NOJUMPTRIE then we require that noper_next is
4103                             the same as scan (our current position in the regex
4104                             program).
4105
4106                             Once we have two or more consecutive such branches
4107                             we can create a trie of the EXACT's contents and
4108                             stitch it in place into the program.
4109
4110                             If the sequence represents all of the branches in
4111                             the alternation we replace the entire thing with a
4112                             single TRIE node.
4113
4114                             Otherwise when it is a subsequence we need to
4115                             stitch it in place and replace only the relevant
4116                             branches. This means the first branch has to remain
4117                             as it is used by the alternation logic, and its
4118                             next pointer, and needs to be repointed at the item
4119                             on the branch chain following the last branch we
4120                             have optimized away.
4121
4122                             This could be either a BRANCH, in which case the
4123                             subsequence is internal, or it could be the item
4124                             following the branch sequence in which case the
4125                             subsequence is at the end (which does not
4126                             necessarily mean the first node is the start of the
4127                             alternation).
4128
4129                             TRIE_TYPE(X) is a define which maps the optype to a
4130                             trietype.
4131
4132                                 optype          |  trietype
4133                                 ----------------+-----------
4134                                 NOTHING         | NOTHING
4135                                 EXACT           | EXACT
4136                                 EXACTFU         | EXACTFU
4137                                 EXACTFU_SS      | EXACTFU
4138                                 EXACTFA         | EXACTFA
4139
4140
4141                         */
4142 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
4143                        ( EXACT == (X) )   ? EXACT :        \
4144                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
4145                        ( EXACTFA == (X) ) ? EXACTFA :        \
4146                        0 )
4147
4148                         /* dont use tail as the end marker for this traverse */
4149                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4150                             regnode * const noper = NEXTOPER( cur );
4151                             U8 noper_type = OP( noper );
4152                             U8 noper_trietype = TRIE_TYPE( noper_type );
4153 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4154                             regnode * const noper_next = regnext( noper );
4155                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4156                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4157 #endif
4158
4159                             DEBUG_TRIE_COMPILE_r({
4160                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4161                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4162                                    (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4163
4164                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4165                                 PerlIO_printf( Perl_debug_log, " -> %s",
4166                                     SvPV_nolen_const(RExC_mysv));
4167
4168                                 if ( noper_next ) {
4169                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4170                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4171                                     SvPV_nolen_const(RExC_mysv));
4172                                 }
4173                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4174                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4175                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4176                                 );
4177                             });
4178
4179                             /* Is noper a trieable nodetype that can be merged
4180                              * with the current trie (if there is one)? */
4181                             if ( noper_trietype
4182                                   &&
4183                                   (
4184                                         ( noper_trietype == NOTHING)
4185                                         || ( trietype == NOTHING )
4186                                         || ( trietype == noper_trietype )
4187                                   )
4188 #ifdef NOJUMPTRIE
4189                                   && noper_next == tail
4190 #endif
4191                                   && count < U16_MAX)
4192                             {
4193                                 /* Handle mergable triable node Either we are
4194                                  * the first node in a new trieable sequence,
4195                                  * in which case we do some bookkeeping,
4196                                  * otherwise we update the end pointer. */
4197                                 if ( !first ) {
4198                                     first = cur;
4199                                     if ( noper_trietype == NOTHING ) {
4200 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4201                                         regnode * const noper_next = regnext( noper );
4202                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4203                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4204 #endif
4205
4206                                         if ( noper_next_trietype ) {
4207                                             trietype = noper_next_trietype;
4208                                         } else if (noper_next_type)  {
4209                                             /* a NOTHING regop is 1 regop wide.
4210                                              * We need at least two for a trie
4211                                              * so we can't merge this in */
4212                                             first = NULL;
4213                                         }
4214                                     } else {
4215                                         trietype = noper_trietype;
4216                                     }
4217                                 } else {
4218                                     if ( trietype == NOTHING )
4219                                         trietype = noper_trietype;
4220                                     last = cur;
4221                                 }
4222                                 if (first)
4223                                     count++;
4224                             } /* end handle mergable triable node */
4225                             else {
4226                                 /* handle unmergable node -
4227                                  * noper may either be a triable node which can
4228                                  * not be tried together with the current trie,
4229                                  * or a non triable node */
4230                                 if ( last ) {
4231                                     /* If last is set and trietype is not
4232                                      * NOTHING then we have found at least two
4233                                      * triable branch sequences in a row of a
4234                                      * similar trietype so we can turn them
4235                                      * into a trie. If/when we allow NOTHING to
4236                                      * start a trie sequence this condition
4237                                      * will be required, and it isn't expensive
4238                                      * so we leave it in for now. */
4239                                     if ( trietype && trietype != NOTHING )
4240                                         make_trie( pRExC_state,
4241                                                 startbranch, first, cur, tail,
4242                                                 count, trietype, depth+1 );
4243                                     last = NULL; /* note: we clear/update
4244                                                     first, trietype etc below,
4245                                                     so we dont do it here */
4246                                 }
4247                                 if ( noper_trietype
4248 #ifdef NOJUMPTRIE
4249                                      && noper_next == tail
4250 #endif
4251                                 ){
4252                                     /* noper is triable, so we can start a new
4253                                      * trie sequence */
4254                                     count = 1;
4255                                     first = cur;
4256                                     trietype = noper_trietype;
4257                                 } else if (first) {
4258                                     /* if we already saw a first but the
4259                                      * current node is not triable then we have
4260                                      * to reset the first information. */
4261                                     count = 0;
4262                                     first = NULL;
4263                                     trietype = 0;
4264                                 }
4265                             } /* end handle unmergable node */
4266                         } /* loop over branches */
4267                         DEBUG_TRIE_COMPILE_r({
4268                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4269                             PerlIO_printf( Perl_debug_log,
4270                               "%*s- %s (%d) <SCAN FINISHED>\n",
4271                               (int)depth * 2 + 2,
4272                               "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4273
4274                         });
4275                         if ( last && trietype ) {
4276                             if ( trietype != NOTHING ) {
4277                                 /* the last branch of the sequence was part of
4278                                  * a trie, so we have to construct it here
4279                                  * outside of the loop */
4280                                 made= make_trie( pRExC_state, startbranch,
4281                                                  first, scan, tail, count,
4282                                                  trietype, depth+1 );
4283 #ifdef TRIE_STUDY_OPT
4284                                 if ( ((made == MADE_EXACT_TRIE &&
4285                                      startbranch == first)
4286                                      || ( first_non_open == first )) &&
4287                                      depth==0 ) {
4288                                     flags |= SCF_TRIE_RESTUDY;
4289                                     if ( startbranch == first
4290                                          && scan == tail )
4291                                     {
4292                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4293                                     }
4294                                 }
4295 #endif
4296                             } else {
4297                                 /* at this point we know whatever we have is a
4298                                  * NOTHING sequence/branch AND if 'startbranch'
4299                                  * is 'first' then we can turn the whole thing
4300                                  * into a NOTHING
4301                                  */
4302                                 if ( startbranch == first ) {
4303                                     regnode *opt;
4304                                     /* the entire thing is a NOTHING sequence,
4305                                      * something like this: (?:|) So we can
4306                                      * turn it into a plain NOTHING op. */
4307                                     DEBUG_TRIE_COMPILE_r({
4308                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4309                                         PerlIO_printf( Perl_debug_log,
4310                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4311                                           "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4312
4313                                     });
4314                                     OP(startbranch)= NOTHING;
4315                                     NEXT_OFF(startbranch)= tail - startbranch;
4316                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4317                                         OP(opt)= OPTIMIZED;
4318                                 }
4319                             }
4320                         } /* end if ( last) */
4321                     } /* TRIE_MAXBUF is non zero */
4322
4323                 } /* do trie */
4324
4325             }
4326             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4327                 scan = NEXTOPER(NEXTOPER(scan));
4328             } else                      /* single branch is optimized. */
4329                 scan = NEXTOPER(scan);
4330             continue;
4331         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4332             I32 paren = 0;
4333             regnode *start = NULL;
4334             regnode *end = NULL;
4335             U32 my_recursed_depth= recursed_depth;
4336
4337
4338             if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4339                 /* Do setup, note this code has side effects beyond
4340                  * the rest of this block. Specifically setting
4341                  * RExC_recurse[] must happen at least once during
4342                  * study_chunk(). */
4343                 if (OP(scan) == GOSUB) {
4344                     paren = ARG(scan);
4345                     RExC_recurse[ARG2L(scan)] = scan;
4346                     start = RExC_open_parens[paren-1];
4347                     end   = RExC_close_parens[paren-1];
4348                 } else {
4349                     start = RExC_rxi->program + 1;
4350                     end   = RExC_opend;
4351                 }
4352                 /* NOTE we MUST always execute the above code, even
4353                  * if we do nothing with a GOSUB/GOSTART */
4354                 if (
4355                     ( flags & SCF_IN_DEFINE )
4356                     ||
4357                     (
4358                         (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4359                         &&
4360                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4361                     )
4362                 ) {
4363                     /* no need to do anything here if we are in a define. */
4364                     /* or we are after some kind of infinite construct
4365                      * so we can skip recursing into this item.
4366                      * Since it is infinite we will not change the maxlen
4367                      * or delta, and if we miss something that might raise
4368                      * the minlen it will merely pessimise a little.
4369                      *
4370                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4371                      * might result in a minlen of 1 and not of 4,
4372                      * but this doesn't make us mismatch, just try a bit
4373                      * harder than we should.
4374                      * */
4375                     scan= regnext(scan);
4376                     continue;
4377                 }
4378
4379                 if (
4380                     !recursed_depth
4381                     ||
4382                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4383                 ) {
4384                     /* it is quite possible that there are more efficient ways
4385                      * to do this. We maintain a bitmap per level of recursion
4386                      * of which patterns we have entered so we can detect if a
4387                      * pattern creates a possible infinite loop. When we
4388                      * recurse down a level we copy the previous levels bitmap
4389                      * down. When we are at recursion level 0 we zero the top
4390                      * level bitmap. It would be nice to implement a different
4391                      * more efficient way of doing this. In particular the top
4392                      * level bitmap may be unnecessary.
4393                      */
4394                     if (!recursed_depth) {
4395                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4396                     } else {
4397                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4398                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4399                              RExC_study_chunk_recursed_bytes, U8);
4400                     }
4401                     /* we havent recursed into this paren yet, so recurse into it */
4402                     DEBUG_STUDYDATA("set:", data,depth);
4403                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4404                     my_recursed_depth= recursed_depth + 1;
4405                 } else {
4406                     DEBUG_STUDYDATA("inf:", data,depth);
4407                     /* some form of infinite recursion, assume infinite length
4408                      * */
4409                     if (flags & SCF_DO_SUBSTR) {
4410                         scan_commit(pRExC_state, data, minlenp, is_inf);
4411                         data->longest = &(data->longest_float);
4412                     }
4413                     is_inf = is_inf_internal = 1;
4414                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4415                         ssc_anything(data->start_class);
4416                     flags &= ~SCF_DO_STCLASS;
4417
4418                     start= NULL; /* reset start so we dont recurse later on. */
4419                 }
4420             } else {
4421                 paren = stopparen;
4422                 start = scan + 2;
4423                 end = regnext(scan);
4424             }
4425             if (start) {
4426                 scan_frame *newframe;
4427                 assert(end);
4428                 if (!RExC_frame_last) {
4429                     Newxz(newframe, 1, scan_frame);
4430                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4431                     RExC_frame_head= newframe;
4432                     RExC_frame_count++;
4433                 } else if (!RExC_frame_last->next_frame) {
4434                     Newxz(newframe,1,scan_frame);
4435                     RExC_frame_last->next_frame= newframe;
4436                     newframe->prev_frame= RExC_frame_last;
4437                     RExC_frame_count++;
4438                 } else {
4439                     newframe= RExC_frame_last->next_frame;
4440                 }
4441                 RExC_frame_last= newframe;
4442
4443                 newframe->next_regnode = regnext(scan);
4444                 newframe->last_regnode = last;
4445                 newframe->stopparen = stopparen;
4446                 newframe->prev_recursed_depth = recursed_depth;
4447                 newframe->this_prev_frame= frame;
4448
4449                 DEBUG_STUDYDATA("frame-new:",data,depth);
4450                 DEBUG_PEEP("fnew", scan, depth);
4451
4452                 frame = newframe;
4453                 scan =  start;
4454                 stopparen = paren;
4455                 last = end;
4456                 depth = depth + 1;
4457                 recursed_depth= my_recursed_depth;
4458
4459                 continue;
4460             }
4461         }
4462         else if (OP(scan) == EXACT) {
4463             SSize_t l = STR_LEN(scan);
4464             UV uc;
4465             if (UTF) {
4466                 const U8 * const s = (U8*)STRING(scan);
4467                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4468                 l = utf8_length(s, s + l);
4469             } else {
4470                 uc = *((U8*)STRING(scan));
4471             }
4472             min += l;
4473             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4474                 /* The code below prefers earlier match for fixed
4475                    offset, later match for variable offset.  */
4476                 if (data->last_end == -1) { /* Update the start info. */
4477                     data->last_start_min = data->pos_min;
4478                     data->last_start_max = is_inf
4479                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4480                 }
4481                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4482                 if (UTF)
4483                     SvUTF8_on(data->last_found);
4484                 {
4485                     SV * const sv = data->last_found;
4486                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4487                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4488                     if (mg && mg->mg_len >= 0)
4489                         mg->mg_len += utf8_length((U8*)STRING(scan),
4490                                               (U8*)STRING(scan)+STR_LEN(scan));
4491                 }
4492                 data->last_end = data->pos_min + l;
4493                 data->pos_min += l; /* As in the first entry. */
4494                 data->flags &= ~SF_BEFORE_EOL;
4495             }
4496
4497             /* ANDing the code point leaves at most it, and not in locale, and
4498              * can't match null string */
4499             if (flags & SCF_DO_STCLASS_AND) {
4500                 ssc_cp_and(data->start_class, uc);
4501                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4502                 ssc_clear_locale(data->start_class);
4503             }
4504             else if (flags & SCF_DO_STCLASS_OR) {
4505                 ssc_add_cp(data->start_class, uc);
4506                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4507
4508                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4509                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4510             }
4511             flags &= ~SCF_DO_STCLASS;
4512         }
4513         else if (PL_regkind[OP(scan)] == EXACT) {
4514             /* But OP != EXACT!, so is EXACTFish */
4515             SSize_t l = STR_LEN(scan);
4516             UV uc = *((U8*)STRING(scan));
4517             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4518                                                      separate code points */
4519             const U8 * s = (U8*)STRING(scan);
4520
4521             /* Search for fixed substrings supports EXACT only. */
4522             if (flags & SCF_DO_SUBSTR) {
4523                 assert(data);
4524                 scan_commit(pRExC_state, data, minlenp, is_inf);
4525             }
4526             if (UTF) {
4527                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4528                 l = utf8_length(s, s + l);
4529             }
4530             if (unfolded_multi_char) {
4531                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4532             }
4533             min += l - min_subtract;
4534             assert (min >= 0);
4535             delta += min_subtract;
4536             if (flags & SCF_DO_SUBSTR) {
4537                 data->pos_min += l - min_subtract;
4538                 if (data->pos_min < 0) {
4539                     data->pos_min = 0;
4540                 }
4541                 data->pos_delta += min_subtract;
4542                 if (min_subtract) {
4543                     data->longest = &(data->longest_float);
4544                 }
4545             }
4546
4547             if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4548                 ssc_clear_locale(data->start_class);
4549             }
4550
4551             if (! UTF) {
4552
4553                 /* We punt and assume can match anything if the node begins
4554                  * with a multi-character fold.  Things are complicated.  For
4555                  * example, /ffi/i could match any of:
4556                  *  "\N{LATIN SMALL LIGATURE FFI}"
4557                  *  "\N{LATIN SMALL LIGATURE FF}I"
4558                  *  "F\N{LATIN SMALL LIGATURE FI}"
4559                  *  plus several other things; and making sure we have all the
4560                  *  possibilities is hard. */
4561                 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4562                     EXACTF_invlist =
4563                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4564                 }
4565                 else {
4566
4567                     /* Any Latin1 range character can potentially match any
4568                      * other depending on the locale */
4569                     if (OP(scan) == EXACTFL) {
4570                         _invlist_union(EXACTF_invlist, PL_Latin1,
4571                                                               &EXACTF_invlist);
4572                     }
4573                     else {
4574                         /* But otherwise, it matches at least itself.  We can
4575                          * quickly tell if it has a distinct fold, and if so,
4576                          * it matches that as well */
4577                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4578                         if (IS_IN_SOME_FOLD_L1(uc)) {
4579                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4580                                                            PL_fold_latin1[uc]);
4581                         }
4582                     }
4583
4584                     /* Some characters match above-Latin1 ones under /i.  This
4585                      * is true of EXACTFL ones when the locale is UTF-8 */
4586                     if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4587                         && (! isASCII(uc) || (OP(scan) != EXACTFA
4588                                             && OP(scan) != EXACTFA_NO_TRIE)))
4589                     {
4590                         add_above_Latin1_folds(pRExC_state,
4591                                                (U8) uc,
4592                                                &EXACTF_invlist);
4593                     }
4594                 }
4595             }
4596             else {  /* Pattern is UTF-8 */
4597                 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4598                 STRLEN foldlen = UTF8SKIP(s);
4599                 const U8* e = s + STR_LEN(scan);
4600                 SV** listp;
4601
4602                 /* The only code points that aren't folded in a UTF EXACTFish
4603                  * node are are the problematic ones in EXACTFL nodes */
4604                 if (OP(scan) == EXACTFL
4605                     && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4606                 {
4607                     /* We need to check for the possibility that this EXACTFL
4608                      * node begins with a multi-char fold.  Therefore we fold
4609                      * the first few characters of it so that we can make that
4610                      * check */
4611                     U8 *d = folded;
4612                     int i;
4613
4614                     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4615                         if (isASCII(*s)) {
4616                             *(d++) = (U8) toFOLD(*s);
4617                             s++;
4618                         }
4619                         else {
4620                             STRLEN len;
4621                             to_utf8_fold(s, d, &len);
4622                             d += len;
4623                             s += UTF8SKIP(s);
4624                         }
4625                     }
4626
4627                     /* And set up so the code below that looks in this folded
4628                      * buffer instead of the node's string */
4629                     e = d;
4630                     foldlen = UTF8SKIP(folded);
4631                     s = folded;
4632                 }
4633
4634                 /* When we reach here 's' points to the fold of the first
4635                  * character(s) of the node; and 'e' points to far enough along
4636                  * the folded string to be just past any possible multi-char
4637                  * fold. 'foldlen' is the length in bytes of the first
4638                  * character in 's'
4639                  *
4640                  * Unlike the non-UTF-8 case, the macro for determining if a
4641                  * string is a multi-char fold requires all the characters to
4642                  * already be folded.  This is because of all the complications
4643                  * if not.  Note that they are folded anyway, except in EXACTFL
4644                  * nodes.  Like the non-UTF case above, we punt if the node
4645                  * begins with a multi-char fold  */
4646
4647                 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4648                     EXACTF_invlist =
4649                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4650                 }
4651                 else {  /* Single char fold */
4652
4653                     /* It matches all the things that fold to it, which are
4654                      * found in PL_utf8_foldclosures (including itself) */
4655                     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4656                     if (! PL_utf8_foldclosures) {
4657                         _load_PL_utf8_foldclosures();
4658                     }
4659                     if ((listp = hv_fetch(PL_utf8_foldclosures,
4660                                         (char *) s, foldlen, FALSE)))
4661                     {
4662                         AV* list = (AV*) *listp;
4663                         IV k;
4664                         for (k = 0; k <= av_tindex(list); k++) {
4665                             SV** c_p = av_fetch(list, k, FALSE);
4666                             UV c;
4667                             assert(c_p);
4668
4669                             c = SvUV(*c_p);
4670
4671                             /* /aa doesn't allow folds between ASCII and non- */
4672                             if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4673                                 && isASCII(c) != isASCII(uc))
4674                             {
4675                                 continue;
4676                             }
4677
4678                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4679                         }
4680                     }
4681                 }
4682             }
4683             if (flags & SCF_DO_STCLASS_AND) {
4684                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4685                 ANYOF_POSIXL_ZERO(data->start_class);
4686                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4687             }
4688             else if (flags & SCF_DO_STCLASS_OR) {
4689                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4690                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4691
4692                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4693                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4694             }
4695             flags &= ~SCF_DO_STCLASS;
4696             SvREFCNT_dec(EXACTF_invlist);
4697         }
4698         else if (REGNODE_VARIES(OP(scan))) {
4699             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4700             I32 fl = 0, f = flags;
4701             regnode * const oscan = scan;
4702             regnode_ssc this_class;
4703             regnode_ssc *oclass = NULL;
4704             I32 next_is_eval = 0;
4705
4706             switch (PL_regkind[OP(scan)]) {
4707             case WHILEM:                /* End of (?:...)* . */
4708                 scan = NEXTOPER(scan);
4709                 goto finish;
4710             case PLUS:
4711                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4712                     next = NEXTOPER(scan);
4713                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4714                         mincount = 1;
4715                         maxcount = REG_INFTY;
4716                         next = regnext(scan);
4717                         scan = NEXTOPER(scan);
4718                         goto do_curly;
4719                     }
4720                 }
4721                 if (flags & SCF_DO_SUBSTR)
4722                     data->pos_min++;
4723                 min++;
4724                 /* FALLTHROUGH */
4725             case STAR:
4726                 if (flags & SCF_DO_STCLASS) {
4727                     mincount = 0;
4728                     maxcount = REG_INFTY;
4729                     next = regnext(scan);
4730                     scan = NEXTOPER(scan);
4731                     goto do_curly;
4732                 }
4733                 if (flags & SCF_DO_SUBSTR) {
4734                     scan_commit(pRExC_state, data, minlenp, is_inf);
4735                     /* Cannot extend fixed substrings */
4736                     data->longest = &(data->longest_float);
4737                 }
4738                 is_inf = is_inf_internal = 1;
4739                 scan = regnext(scan);
4740                 goto optimize_curly_tail;
4741             case CURLY:
4742                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4743                     && (scan->flags == stopparen))
4744                 {
4745                     mincount = 1;
4746                     maxcount = 1;
4747                 } else {
4748                     mincount = ARG1(scan);
4749                     maxcount = ARG2(scan);
4750                 }
4751                 next = regnext(scan);
4752                 if (OP(scan) == CURLYX) {
4753                     I32 lp = (data ? *(data->last_closep) : 0);
4754                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4755                 }
4756                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4757                 next_is_eval = (OP(scan) == EVAL);
4758               do_curly:
4759                 if (flags & SCF_DO_SUBSTR) {
4760                     if (mincount == 0)
4761                         scan_commit(pRExC_state, data, minlenp, is_inf);
4762                     /* Cannot extend fixed substrings */
4763                     pos_before = data->pos_min;
4764                 }
4765                 if (data) {
4766                     fl = data->flags;
4767                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4768                     if (is_inf)
4769                         data->flags |= SF_IS_INF;
4770                 }
4771                 if (flags & SCF_DO_STCLASS) {
4772                     ssc_init(pRExC_state, &this_class);
4773                     oclass = data->start_class;
4774                     data->start_class = &this_class;
4775                     f |= SCF_DO_STCLASS_AND;
4776                     f &= ~SCF_DO_STCLASS_OR;
4777                 }
4778                 /* Exclude from super-linear cache processing any {n,m}
4779                    regops for which the combination of input pos and regex
4780                    pos is not enough information to determine if a match
4781                    will be possible.
4782
4783                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4784                    regex pos at the \s*, the prospects for a match depend not
4785                    only on the input position but also on how many (bar\s*)
4786                    repeats into the {4,8} we are. */
4787                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4788                     f &= ~SCF_WHILEM_VISITED_POS;
4789
4790                 /* This will finish on WHILEM, setting scan, or on NULL: */
4791                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4792                                   last, data, stopparen, recursed_depth, NULL,
4793                                   (mincount == 0
4794                                    ? (f & ~SCF_DO_SUBSTR)
4795                                    : f)
4796                                   ,depth+1);
4797
4798                 if (flags & SCF_DO_STCLASS)
4799                     data->start_class = oclass;
4800                 if (mincount == 0 || minnext == 0) {
4801                     if (flags & SCF_DO_STCLASS_OR) {
4802                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4803                     }
4804                     else if (flags & SCF_DO_STCLASS_AND) {
4805                         /* Switch to OR mode: cache the old value of
4806                          * data->start_class */
4807                         INIT_AND_WITHP;
4808                         StructCopy(data->start_class, and_withp, regnode_ssc);
4809                         flags &= ~SCF_DO_STCLASS_AND;
4810                         StructCopy(&this_class, data->start_class, regnode_ssc);
4811                         flags |= SCF_DO_STCLASS_OR;
4812                         ANYOF_FLAGS(data->start_class)
4813                                                 |= SSC_MATCHES_EMPTY_STRING;
4814                     }
4815                 } else {                /* Non-zero len */
4816                     if (flags & SCF_DO_STCLASS_OR) {
4817                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4818                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4819                     }
4820                     else if (flags & SCF_DO_STCLASS_AND)
4821                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4822                     flags &= ~SCF_DO_STCLASS;
4823                 }
4824                 if (!scan)              /* It was not CURLYX, but CURLY. */
4825                     scan = next;
4826                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4827                     /* ? quantifier ok, except for (?{ ... }) */
4828                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4829                     && (minnext == 0) && (deltanext == 0)
4830                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4831                     && maxcount <= REG_INFTY/3) /* Complement check for big
4832                                                    count */
4833                 {
4834                     /* Fatal warnings may leak the regexp without this: */
4835                     SAVEFREESV(RExC_rx_sv);
4836                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4837                         "Quantifier unexpected on zero-length expression "
4838                         "in regex m/%"UTF8f"/",
4839                          UTF8fARG(UTF, RExC_end - RExC_precomp,
4840                                   RExC_precomp));
4841                     (void)ReREFCNT_inc(RExC_rx_sv);
4842                 }
4843
4844                 min += minnext * mincount;
4845                 is_inf_internal |= deltanext == SSize_t_MAX
4846                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4847                 is_inf |= is_inf_internal;
4848                 if (is_inf) {
4849                     delta = SSize_t_MAX;
4850                 } else {
4851                     delta += (minnext + deltanext) * maxcount
4852                              - minnext * mincount;
4853                 }
4854                 /* Try powerful optimization CURLYX => CURLYN. */
4855                 if (  OP(oscan) == CURLYX && data
4856                       && data->flags & SF_IN_PAR
4857                       && !(data->flags & SF_HAS_EVAL)
4858                       && !deltanext && minnext == 1 ) {
4859                     /* Try to optimize to CURLYN.  */
4860                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4861                     regnode * const nxt1 = nxt;
4862 #ifdef DEBUGGING
4863                     regnode *nxt2;
4864 #endif
4865
4866                     /* Skip open. */
4867                     nxt = regnext(nxt);
4868                     if (!REGNODE_SIMPLE(OP(nxt))
4869                         && !(PL_regkind[OP(nxt)] == EXACT
4870                              && STR_LEN(nxt) == 1))
4871                         goto nogo;
4872 #ifdef DEBUGGING
4873                     nxt2 = nxt;
4874 #endif
4875                     nxt = regnext(nxt);
4876                     if (OP(nxt) != CLOSE)
4877                         goto nogo;
4878                     if (RExC_open_parens) {
4879                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4880                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4881                     }
4882                     /* Now we know that nxt2 is the only contents: */
4883                     oscan->flags = (U8)ARG(nxt);
4884                     OP(oscan) = CURLYN;
4885                     OP(nxt1) = NOTHING; /* was OPEN. */
4886
4887 #ifdef DEBUGGING
4888                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4889                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4890                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4891                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4892                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4893                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4894 #endif
4895                 }
4896               nogo:
4897
4898                 /* Try optimization CURLYX => CURLYM. */
4899                 if (  OP(oscan) == CURLYX && data
4900                       && !(data->flags & SF_HAS_PAR)
4901                       && !(data->flags & SF_HAS_EVAL)
4902                       && !deltanext     /* atom is fixed width */
4903                       && minnext != 0   /* CURLYM can't handle zero width */
4904
4905                          /* Nor characters whose fold at run-time may be
4906                           * multi-character */
4907                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4908                 ) {
4909                     /* XXXX How to optimize if data == 0? */
4910                     /* Optimize to a simpler form.  */
4911                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4912                     regnode *nxt2;
4913
4914                     OP(oscan) = CURLYM;
4915                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4916                             && (OP(nxt2) != WHILEM))
4917                         nxt = nxt2;
4918                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4919                     /* Need to optimize away parenths. */
4920                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4921                         /* Set the parenth number.  */
4922                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4923
4924                         oscan->flags = (U8)ARG(nxt);
4925                         if (RExC_open_parens) {
4926                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4927                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4928                         }
4929                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4930                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4931
4932 #ifdef DEBUGGING
4933                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4934                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4935                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4936                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4937 #endif
4938 #if 0
4939                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4940                             regnode *nnxt = regnext(nxt1);
4941                             if (nnxt == nxt) {
4942                                 if (reg_off_by_arg[OP(nxt1)])
4943                                     ARG_SET(nxt1, nxt2 - nxt1);
4944                                 else if (nxt2 - nxt1 < U16_MAX)
4945                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4946                                 else
4947                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4948                             }
4949                             nxt1 = nnxt;
4950                         }
4951 #endif
4952                         /* Optimize again: */
4953                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4954                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4955                     }
4956                     else
4957                         oscan->flags = 0;
4958                 }
4959                 else if ((OP(oscan) == CURLYX)
4960                          && (flags & SCF_WHILEM_VISITED_POS)
4961                          /* See the comment on a similar expression above.
4962                             However, this time it's not a subexpression
4963                             we care about, but the expression itself. */
4964                          && (maxcount == REG_INFTY)
4965                          && data && ++data->whilem_c < 16) {
4966                     /* This stays as CURLYX, we can put the count/of pair. */
4967                     /* Find WHILEM (as in regexec.c) */
4968                     regnode *nxt = oscan + NEXT_OFF(oscan);
4969
4970                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4971                         nxt += ARG(nxt);
4972                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4973                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4974                 }
4975                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4976                     pars++;
4977                 if (flags & SCF_DO_SUBSTR) {
4978                     SV *last_str = NULL;
4979                     STRLEN last_chrs = 0;
4980                     int counted = mincount != 0;
4981
4982                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4983                                                                   string. */
4984                         SSize_t b = pos_before >= data->last_start_min
4985                             ? pos_before : data->last_start_min;
4986                         STRLEN l;
4987                         const char * const s = SvPV_const(data->last_found, l);
4988                         SSize_t old = b - data->last_start_min;
4989
4990                         if (UTF)
4991                             old = utf8_hop((U8*)s, old) - (U8*)s;
4992                         l -= old;
4993                         /* Get the added string: */
4994                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4995                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4996                                             (U8*)(s + old + l)) : l;
4997                         if (deltanext == 0 && pos_before == b) {
4998                             /* What was added is a constant string */
4999                             if (mincount > 1) {
5000
5001                                 SvGROW(last_str, (mincount * l) + 1);
5002                                 repeatcpy(SvPVX(last_str) + l,
5003                                           SvPVX_const(last_str), l,
5004                                           mincount - 1);
5005                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5006                                 /* Add additional parts. */
5007                                 SvCUR_set(data->last_found,
5008                                           SvCUR(data->last_found) - l);
5009                                 sv_catsv(data->last_found, last_str);
5010                                 {
5011                                     SV * sv = data->last_found;
5012                                     MAGIC *mg =
5013                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5014                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5015                                     if (mg && mg->mg_len >= 0)
5016                                         mg->mg_len += last_chrs * (mincount-1);
5017                                 }
5018                                 last_chrs *= mincount;
5019                                 data->last_end += l * (mincount - 1);
5020                             }
5021                         } else {
5022                             /* start offset must point into the last copy */
5023                             data->last_start_min += minnext * (mincount - 1);
5024                             data->last_start_max += is_inf ? SSize_t_MAX
5025                                 : (maxcount - 1) * (minnext + data->pos_delta);
5026                         }
5027                     }
5028                     /* It is counted once already... */
5029                     data->pos_min += minnext * (mincount - counted);
5030 #if 0
5031 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5032                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5033                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
5034     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5035     (UV)mincount);
5036 if (deltanext != SSize_t_MAX)
5037 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5038     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5039           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5040 #endif
5041                     if (deltanext == SSize_t_MAX
5042                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5043                         data->pos_delta = SSize_t_MAX;
5044                     else
5045                         data->pos_delta += - counted * deltanext +
5046                         (minnext + deltanext) * maxcount - minnext * mincount;
5047                     if (mincount != maxcount) {
5048                          /* Cannot extend fixed substrings found inside
5049                             the group.  */
5050                         scan_commit(pRExC_state, data, minlenp, is_inf);
5051                         if (mincount && last_str) {
5052                             SV * const sv = data->last_found;
5053                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5054                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5055
5056                             if (mg)
5057                                 mg->mg_len = -1;
5058                             sv_setsv(sv, last_str);
5059                             data->last_end = data->pos_min;
5060                             data->last_start_min = data->pos_min - last_chrs;
5061                             data->last_start_max = is_inf
5062                                 ? SSize_t_MAX
5063                                 : data->pos_min + data->pos_delta - last_chrs;
5064                         }
5065                         data->longest = &(data->longest_float);
5066                     }
5067                     SvREFCNT_dec(last_str);
5068                 }
5069                 if (data && (fl & SF_HAS_EVAL))
5070                     data->flags |= SF_HAS_EVAL;
5071               optimize_curly_tail:
5072                 if (OP(oscan) != CURLYX) {
5073                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5074                            && NEXT_OFF(next))
5075                         NEXT_OFF(oscan) += NEXT_OFF(next);
5076                 }
5077                 continue;
5078
5079             default:
5080 #ifdef DEBUGGING
5081                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5082                                                                     OP(scan));
5083 #endif
5084             case REF:
5085             case CLUMP:
5086                 if (flags & SCF_DO_SUBSTR) {
5087                     /* Cannot expect anything... */
5088                     scan_commit(pRExC_state, data, minlenp, is_inf);
5089                     data->longest = &(data->longest_float);
5090                 }
5091                 is_inf = is_inf_internal = 1;
5092                 if (flags & SCF_DO_STCLASS_OR) {
5093                     if (OP(scan) == CLUMP) {
5094                         /* Actually is any start char, but very few code points
5095                          * aren't start characters */
5096                         ssc_match_all_cp(data->start_class);
5097                     }
5098                     else {
5099                         ssc_anything(data->start_class);
5100                     }
5101                 }
5102                 flags &= ~SCF_DO_STCLASS;
5103                 break;
5104             }
5105         }
5106         else if (OP(scan) == LNBREAK) {
5107             if (flags & SCF_DO_STCLASS) {
5108                 if (flags & SCF_DO_STCLASS_AND) {
5109                     ssc_intersection(data->start_class,
5110                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5111                     ssc_clear_locale(data->start_class);
5112                     ANYOF_FLAGS(data->start_class)
5113                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5114                 }
5115                 else if (flags & SCF_DO_STCLASS_OR) {
5116                     ssc_union(data->start_class,
5117                               PL_XPosix_ptrs[_CC_VERTSPACE],
5118                               FALSE);
5119                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5120
5121                     /* See commit msg for
5122                      * 749e076fceedeb708a624933726e7989f2302f6a */
5123                     ANYOF_FLAGS(data->start_class)
5124                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5125                 }
5126                 flags &= ~SCF_DO_STCLASS;
5127             }
5128             min++;
5129             delta++;    /* Because of the 2 char string cr-lf */
5130             if (flags & SCF_DO_SUBSTR) {
5131                 /* Cannot expect anything... */
5132                 scan_commit(pRExC_state, data, minlenp, is_inf);
5133                 data->pos_min += 1;
5134                 data->pos_delta += 1;
5135                 data->longest = &(data->longest_float);
5136             }
5137         }
5138         else if (REGNODE_SIMPLE(OP(scan))) {
5139
5140             if (flags & SCF_DO_SUBSTR) {
5141                 scan_commit(pRExC_state, data, minlenp, is_inf);
5142                 data->pos_min++;
5143             }
5144             min++;
5145             if (flags & SCF_DO_STCLASS) {
5146                 bool invert = 0;
5147                 SV* my_invlist = sv_2mortal(_new_invlist(0));
5148                 U8 namedclass;
5149
5150                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5151                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5152
5153                 /* Some of the logic below assumes that switching
5154                    locale on will only add false positives. */
5155                 switch (OP(scan)) {
5156
5157                 default:
5158 #ifdef DEBUGGING
5159                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5160                                                                      OP(scan));
5161 #endif
5162                 case CANY:
5163                 case SANY:
5164                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5165                         ssc_match_all_cp(data->start_class);
5166                     break;
5167
5168                 case REG_ANY:
5169                     {
5170                         SV* REG_ANY_invlist = _new_invlist(2);
5171                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5172                                                             '\n');
5173                         if (flags & SCF_DO_STCLASS_OR) {
5174                             ssc_union(data->start_class,
5175                                       REG_ANY_invlist,
5176                                       TRUE /* TRUE => invert, hence all but \n
5177                                             */
5178                                       );
5179                         }
5180                         else if (flags & SCF_DO_STCLASS_AND) {
5181                             ssc_intersection(data->start_class,
5182                                              REG_ANY_invlist,
5183                                              TRUE  /* TRUE => invert */
5184                                              );
5185                             ssc_clear_locale(data->start_class);
5186                         }
5187                         SvREFCNT_dec_NN(REG_ANY_invlist);
5188                     }
5189                     break;
5190
5191                 case ANYOF:
5192                     if (flags & SCF_DO_STCLASS_AND)
5193                         ssc_and(pRExC_state, data->start_class,
5194                                 (regnode_charclass *) scan);
5195                     else
5196                         ssc_or(pRExC_state, data->start_class,
5197                                                           (regnode_charclass *) scan);
5198                     break;
5199
5200                 case NPOSIXL:
5201                     invert = 1;
5202                     /* FALLTHROUGH */
5203
5204                 case POSIXL:
5205                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5206                     if (flags & SCF_DO_STCLASS_AND) {
5207                         bool was_there = cBOOL(
5208                                           ANYOF_POSIXL_TEST(data->start_class,
5209                                                                  namedclass));
5210                         ANYOF_POSIXL_ZERO(data->start_class);
5211                         if (was_there) {    /* Do an AND */
5212                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5213                         }
5214                         /* No individual code points can now match */
5215                         data->start_class->invlist
5216                                                 = sv_2mortal(_new_invlist(0));
5217                     }
5218                     else {
5219                         int complement = namedclass + ((invert) ? -1 : 1);
5220
5221                         assert(flags & SCF_DO_STCLASS_OR);
5222
5223                         /* If the complement of this class was already there,
5224                          * the result is that they match all code points,
5225                          * (\d + \D == everything).  Remove the classes from
5226                          * future consideration.  Locale is not relevant in
5227                          * this case */
5228                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5229                             ssc_match_all_cp(data->start_class);
5230                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5231                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5232                         }
5233                         else {  /* The usual case; just add this class to the
5234                                    existing set */
5235                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5236                         }
5237                     }
5238                     break;
5239
5240                 case NPOSIXA:   /* For these, we always know the exact set of
5241                                    what's matched */
5242                     invert = 1;
5243                     /* FALLTHROUGH */
5244                 case POSIXA:
5245                     if (FLAGS(scan) == _CC_ASCII) {
5246                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
5247                     }
5248                     else {
5249                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5250                                               PL_XPosix_ptrs[_CC_ASCII],
5251                                               &my_invlist);
5252                     }
5253                     goto join_posix;
5254
5255                 case NPOSIXD:
5256                 case NPOSIXU:
5257                     invert = 1;
5258                     /* FALLTHROUGH */
5259                 case POSIXD:
5260                 case POSIXU:
5261                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5262
5263                     /* NPOSIXD matches all upper Latin1 code points unless the
5264                      * target string being matched is UTF-8, which is
5265                      * unknowable until match time.  Since we are going to
5266                      * invert, we want to get rid of all of them so that the
5267                      * inversion will match all */
5268                     if (OP(scan) == NPOSIXD) {
5269                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5270                                           &my_invlist);
5271                     }
5272
5273                   join_posix:
5274
5275                     if (flags & SCF_DO_STCLASS_AND) {
5276                         ssc_intersection(data->start_class, my_invlist, invert);
5277                         ssc_clear_locale(data->start_class);
5278                     }
5279                     else {
5280                         assert(flags & SCF_DO_STCLASS_OR);
5281                         ssc_union(data->start_class, my_invlist, invert);
5282                     }
5283                 }
5284                 if (flags & SCF_DO_STCLASS_OR)
5285                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5286                 flags &= ~SCF_DO_STCLASS;
5287             }
5288         }
5289         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5290             data->flags |= (OP(scan) == MEOL
5291                             ? SF_BEFORE_MEOL
5292                             : SF_BEFORE_SEOL);
5293             scan_commit(pRExC_state, data, minlenp, is_inf);
5294
5295         }
5296         else if (  PL_regkind[OP(scan)] == BRANCHJ
5297                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5298                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5299                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5300         {
5301             if ( OP(scan) == UNLESSM &&
5302                  scan->flags == 0 &&
5303                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5304                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5305             ) {
5306                 regnode *opt;
5307                 regnode *upto= regnext(scan);
5308                 DEBUG_PARSE_r({
5309                     DEBUG_STUDYDATA("OPFAIL",data,depth);
5310
5311                     /*DEBUG_PARSE_MSG("opfail");*/
5312                     regprop(RExC_rx, RExC_mysv, upto, NULL, pRExC_state);
5313                     PerlIO_printf(Perl_debug_log,
5314                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5315                         SvPV_nolen_const(RExC_mysv),
5316                         (IV)REG_NODE_NUM(upto),
5317                         (IV)(upto - scan)
5318                     );
5319                 });
5320                 OP(scan) = OPFAIL;
5321                 NEXT_OFF(scan) = upto - scan;
5322                 for (opt= scan + 1; opt < upto ; opt++)
5323                     OP(opt) = OPTIMIZED;
5324                 scan= upto;
5325                 continue;
5326             }
5327             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5328                 || OP(scan) == UNLESSM )
5329             {
5330                 /* Negative Lookahead/lookbehind
5331                    In this case we can't do fixed string optimisation.
5332                 */
5333
5334                 SSize_t deltanext, minnext, fake = 0;
5335                 regnode *nscan;
5336                 regnode_ssc intrnl;
5337                 int f = 0;
5338
5339                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5340                 if (data) {
5341                     data_fake.whilem_c = data->whilem_c;
5342                     data_fake.last_closep = data->last_closep;
5343                 }
5344                 else
5345                     data_fake.last_closep = &fake;
5346                 data_fake.pos_delta = delta;
5347                 if ( flags & SCF_DO_STCLASS && !scan->flags
5348                      && OP(scan) == IFMATCH ) { /* Lookahead */
5349                     ssc_init(pRExC_state, &intrnl);
5350                     data_fake.start_class = &intrnl;
5351                     f |= SCF_DO_STCLASS_AND;
5352                 }
5353                 if (flags & SCF_WHILEM_VISITED_POS)
5354                     f |= SCF_WHILEM_VISITED_POS;
5355                 next = regnext(scan);
5356                 nscan = NEXTOPER(NEXTOPER(scan));
5357                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5358                                       last, &data_fake, stopparen,
5359                                       recursed_depth, NULL, f, depth+1);
5360                 if (scan->flags) {
5361                     if (deltanext) {
5362                         FAIL("Variable length lookbehind not implemented");
5363                     }
5364                     else if (minnext > (I32)U8_MAX) {
5365                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5366                               (UV)U8_MAX);
5367                     }
5368                     scan->flags = (U8)minnext;
5369                 }
5370                 if (data) {
5371                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5372                         pars++;
5373                     if (data_fake.flags & SF_HAS_EVAL)
5374                         data->flags |= SF_HAS_EVAL;
5375                     data->whilem_c = data_fake.whilem_c;
5376                 }
5377                 if (f & SCF_DO_STCLASS_AND) {
5378                     if (flags & SCF_DO_STCLASS_OR) {
5379                         /* OR before, AND after: ideally we would recurse with
5380                          * data_fake to get the AND applied by study of the
5381                          * remainder of the pattern, and then derecurse;
5382                          * *** HACK *** for now just treat as "no information".
5383                          * See [perl #56690].
5384                          */
5385                         ssc_init(pRExC_state, data->start_class);
5386                     }  else {
5387                         /* AND before and after: combine and continue.  These
5388                          * assertions are zero-length, so can match an EMPTY
5389                          * string */
5390                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5391                         ANYOF_FLAGS(data->start_class)
5392                                                    |= SSC_MATCHES_EMPTY_STRING;
5393                     }
5394                 }
5395             }
5396 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5397             else {
5398                 /* Positive Lookahead/lookbehind
5399                    In this case we can do fixed string optimisation,
5400                    but we must be careful about it. Note in the case of
5401                    lookbehind the positions will be offset by the minimum
5402                    length of the pattern, something we won't know about
5403                    until after the recurse.
5404                 */
5405                 SSize_t deltanext, fake = 0;
5406                 regnode *nscan;
5407                 regnode_ssc intrnl;
5408                 int f = 0;
5409                 /* We use SAVEFREEPV so that when the full compile
5410                     is finished perl will clean up the allocated
5411                     minlens when it's all done. This way we don't
5412                     have to worry about freeing them when we know
5413                     they wont be used, which would be a pain.
5414                  */
5415                 SSize_t *minnextp;
5416                 Newx( minnextp, 1, SSize_t );
5417                 SAVEFREEPV(minnextp);
5418
5419                 if (data) {
5420                     StructCopy(data, &data_fake, scan_data_t);
5421                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5422                         f |= SCF_DO_SUBSTR;
5423                         if (scan->flags)
5424                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5425                         data_fake.last_found=newSVsv(data->last_found);
5426                     }
5427                 }
5428                 else
5429                     data_fake.last_closep = &fake;
5430                 data_fake.flags = 0;
5431                 data_fake.pos_delta = delta;
5432                 if (is_inf)
5433                     data_fake.flags |= SF_IS_INF;
5434                 if ( flags & SCF_DO_STCLASS && !scan->flags
5435                      && OP(scan) == IFMATCH ) { /* Lookahead */
5436                     ssc_init(pRExC_state, &intrnl);
5437                     data_fake.start_class = &intrnl;
5438                     f |= SCF_DO_STCLASS_AND;
5439                 }
5440                 if (flags & SCF_WHILEM_VISITED_POS)
5441                     f |= SCF_WHILEM_VISITED_POS;
5442                 next = regnext(scan);
5443                 nscan = NEXTOPER(NEXTOPER(scan));
5444
5445                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5446                                         &deltanext, last, &data_fake,
5447                                         stopparen, recursed_depth, NULL,
5448                                         f,depth+1);
5449                 if (scan->flags) {
5450                     if (deltanext) {
5451                         FAIL("Variable length lookbehind not implemented");
5452                     }
5453                     else if (*minnextp > (I32)U8_MAX) {
5454                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5455                               (UV)U8_MAX);
5456                     }
5457                     scan->flags = (U8)*minnextp;
5458                 }
5459
5460                 *minnextp += min;
5461
5462                 if (f & SCF_DO_STCLASS_AND) {
5463                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5464                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5465                 }
5466                 if (data) {
5467                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5468                         pars++;
5469                     if (data_fake.flags & SF_HAS_EVAL)
5470                         data->flags |= SF_HAS_EVAL;
5471                     data->whilem_c = data_fake.whilem_c;
5472                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5473                         if (RExC_rx->minlen<*minnextp)
5474                             RExC_rx->minlen=*minnextp;
5475                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5476                         SvREFCNT_dec_NN(data_fake.last_found);
5477
5478                         if ( data_fake.minlen_fixed != minlenp )
5479                         {
5480                             data->offset_fixed= data_fake.offset_fixed;
5481                             data->minlen_fixed= data_fake.minlen_fixed;
5482                             data->lookbehind_fixed+= scan->flags;
5483                         }
5484                         if ( data_fake.minlen_float != minlenp )
5485                         {
5486                             data->minlen_float= data_fake.minlen_float;
5487                             data->offset_float_min=data_fake.offset_float_min;
5488                             data->offset_float_max=data_fake.offset_float_max;
5489                             data->lookbehind_float+= scan->flags;
5490                         }
5491                     }
5492                 }
5493             }
5494 #endif
5495         }
5496         else if (OP(scan) == OPEN) {
5497             if (stopparen != (I32)ARG(scan))
5498                 pars++;
5499         }
5500         else if (OP(scan) == CLOSE) {
5501             if (stopparen == (I32)ARG(scan)) {
5502                 break;
5503             }
5504             if ((I32)ARG(scan) == is_par) {
5505                 next = regnext(scan);
5506
5507                 if ( next && (OP(next) != WHILEM) && next < last)
5508                     is_par = 0;         /* Disable optimization */
5509             }
5510             if (data)
5511                 *(data->last_closep) = ARG(scan);
5512         }
5513         else if (OP(scan) == EVAL) {
5514                 if (data)
5515                     data->flags |= SF_HAS_EVAL;
5516         }
5517         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5518             if (flags & SCF_DO_SUBSTR) {
5519                 scan_commit(pRExC_state, data, minlenp, is_inf);
5520                 flags &= ~SCF_DO_SUBSTR;
5521             }
5522             if (data && OP(scan)==ACCEPT) {
5523                 data->flags |= SCF_SEEN_ACCEPT;
5524                 if (stopmin > min)
5525                     stopmin = min;
5526             }
5527         }
5528         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5529         {
5530                 if (flags & SCF_DO_SUBSTR) {
5531                     scan_commit(pRExC_state, data, minlenp, is_inf);
5532                     data->longest = &(data->longest_float);
5533                 }
5534                 is_inf = is_inf_internal = 1;
5535                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5536                     ssc_anything(data->start_class);
5537                 flags &= ~SCF_DO_STCLASS;
5538         }
5539         else if (OP(scan) == GPOS) {
5540             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5541                 !(delta || is_inf || (data && data->pos_delta)))
5542             {
5543                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5544                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5545                 if (RExC_rx->gofs < (STRLEN)min)
5546                     RExC_rx->gofs = min;
5547             } else {
5548                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5549                 RExC_rx->gofs = 0;
5550             }
5551         }
5552 #ifdef TRIE_STUDY_OPT
5553 #ifdef FULL_TRIE_STUDY
5554         else if (PL_regkind[OP(scan)] == TRIE) {
5555             /* NOTE - There is similar code to this block above for handling
5556                BRANCH nodes on the initial study.  If you change stuff here
5557                check there too. */
5558             regnode *trie_node= scan;
5559             regnode *tail= regnext(scan);
5560             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5561             SSize_t max1 = 0, min1 = SSize_t_MAX;
5562             regnode_ssc accum;
5563
5564             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5565                 /* Cannot merge strings after this. */
5566                 scan_commit(pRExC_state, data, minlenp, is_inf);
5567             }
5568             if (flags & SCF_DO_STCLASS)
5569                 ssc_init_zero(pRExC_state, &accum);
5570
5571             if (!trie->jump) {
5572                 min1= trie->minlen;
5573                 max1= trie->maxlen;
5574             } else {
5575                 const regnode *nextbranch= NULL;
5576                 U32 word;
5577
5578                 for ( word=1 ; word <= trie->wordcount ; word++)
5579                 {
5580                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5581                     regnode_ssc this_class;
5582
5583                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5584                     if (data) {
5585                         data_fake.whilem_c = data->whilem_c;
5586                         data_fake.last_closep = data->last_closep;
5587                     }
5588                     else
5589                         data_fake.last_closep = &fake;
5590                     data_fake.pos_delta = delta;
5591                     if (flags & SCF_DO_STCLASS) {
5592                         ssc_init(pRExC_state, &this_class);
5593                         data_fake.start_class = &this_class;
5594                         f = SCF_DO_STCLASS_AND;
5595                     }
5596                     if (flags & SCF_WHILEM_VISITED_POS)
5597                         f |= SCF_WHILEM_VISITED_POS;
5598
5599                     if (trie->jump[word]) {
5600                         if (!nextbranch)
5601                             nextbranch = trie_node + trie->jump[0];
5602                         scan= trie_node + trie->jump[word];
5603                         /* We go from the jump point to the branch that follows
5604                            it. Note this means we need the vestigal unused
5605                            branches even though they arent otherwise used. */
5606                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5607                             &deltanext, (regnode *)nextbranch, &data_fake,
5608                             stopparen, recursed_depth, NULL, f,depth+1);
5609                     }
5610                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5611                         nextbranch= regnext((regnode*)nextbranch);
5612
5613                     if (min1 > (SSize_t)(minnext + trie->minlen))
5614                         min1 = minnext + trie->minlen;
5615                     if (deltanext == SSize_t_MAX) {
5616                         is_inf = is_inf_internal = 1;
5617                         max1 = SSize_t_MAX;
5618                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5619                         max1 = minnext + deltanext + trie->maxlen;
5620
5621                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5622                         pars++;
5623                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5624                         if ( stopmin > min + min1)
5625                             stopmin = min + min1;
5626                         flags &= ~SCF_DO_SUBSTR;
5627                         if (data)
5628                             data->flags |= SCF_SEEN_ACCEPT;
5629                     }
5630                     if (data) {
5631                         if (data_fake.flags & SF_HAS_EVAL)
5632                             data->flags |= SF_HAS_EVAL;
5633                         data->whilem_c = data_fake.whilem_c;
5634                     }
5635                     if (flags & SCF_DO_STCLASS)
5636                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5637                 }
5638             }
5639             if (flags & SCF_DO_SUBSTR) {
5640                 data->pos_min += min1;
5641                 data->pos_delta += max1 - min1;
5642                 if (max1 != min1 || is_inf)
5643                     data->longest = &(data->longest_float);
5644             }
5645             min += min1;
5646             delta += max1 - min1;
5647             if (flags & SCF_DO_STCLASS_OR) {
5648                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5649                 if (min1) {
5650                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5651                     flags &= ~SCF_DO_STCLASS;
5652                 }
5653             }
5654             else if (flags & SCF_DO_STCLASS_AND) {
5655                 if (min1) {
5656                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5657                     flags &= ~SCF_DO_STCLASS;
5658                 }
5659                 else {
5660                     /* Switch to OR mode: cache the old value of
5661                      * data->start_class */
5662                     INIT_AND_WITHP;
5663                     StructCopy(data->start_class, and_withp, regnode_ssc);
5664                     flags &= ~SCF_DO_STCLASS_AND;
5665                     StructCopy(&accum, data->start_class, regnode_ssc);
5666                     flags |= SCF_DO_STCLASS_OR;
5667                 }
5668             }
5669             scan= tail;
5670             continue;
5671         }
5672 #else
5673         else if (PL_regkind[OP(scan)] == TRIE) {
5674             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5675             U8*bang=NULL;
5676
5677             min += trie->minlen;
5678             delta += (trie->maxlen - trie->minlen);
5679             flags &= ~SCF_DO_STCLASS; /* xxx */
5680             if (flags & SCF_DO_SUBSTR) {
5681                 /* Cannot expect anything... */
5682                 scan_commit(pRExC_state, data, minlenp, is_inf);
5683                 data->pos_min += trie->minlen;
5684                 data->pos_delta += (trie->maxlen - trie->minlen);
5685                 if (trie->maxlen != trie->minlen)
5686                     data->longest = &(data->longest_float);
5687             }
5688             if (trie->jump) /* no more substrings -- for now /grr*/
5689                flags &= ~SCF_DO_SUBSTR;
5690         }
5691 #endif /* old or new */
5692 #endif /* TRIE_STUDY_OPT */
5693
5694         /* Else: zero-length, ignore. */
5695         scan = regnext(scan);
5696     }
5697     /* If we are exiting a recursion we can unset its recursed bit
5698      * and allow ourselves to enter it again - no danger of an
5699      * infinite loop there.
5700     if (stopparen > -1 && recursed) {
5701         DEBUG_STUDYDATA("unset:", data,depth);
5702         PAREN_UNSET( recursed, stopparen);
5703     }
5704     */
5705     if (frame) {
5706         depth = depth - 1;
5707
5708         DEBUG_STUDYDATA("frame-end:",data,depth);
5709         DEBUG_PEEP("fend", scan, depth);
5710
5711         /* restore previous context */
5712         last = frame->last_regnode;
5713         scan = frame->next_regnode;
5714         stopparen = frame->stopparen;
5715         recursed_depth = frame->prev_recursed_depth;
5716
5717         RExC_frame_last = frame->prev_frame;
5718         frame = frame->this_prev_frame;
5719         goto fake_study_recurse;
5720     }
5721
5722   finish:
5723     assert(!frame);
5724     DEBUG_STUDYDATA("pre-fin:",data,depth);
5725
5726     *scanp = scan;
5727     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5728
5729     if (flags & SCF_DO_SUBSTR && is_inf)
5730         data->pos_delta = SSize_t_MAX - data->pos_min;
5731     if (is_par > (I32)U8_MAX)
5732         is_par = 0;
5733     if (is_par && pars==1 && data) {
5734         data->flags |= SF_IN_PAR;
5735         data->flags &= ~SF_HAS_PAR;
5736     }
5737     else if (pars && data) {
5738         data->flags |= SF_HAS_PAR;
5739         data->flags &= ~SF_IN_PAR;
5740     }
5741     if (flags & SCF_DO_STCLASS_OR)
5742         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5743     if (flags & SCF_TRIE_RESTUDY)
5744         data->flags |=  SCF_TRIE_RESTUDY;
5745
5746     DEBUG_STUDYDATA("post-fin:",data,depth);
5747
5748     {
5749         SSize_t final_minlen= min < stopmin ? min : stopmin;
5750
5751         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5752             RExC_maxlen = final_minlen + delta;
5753         }
5754         return final_minlen;
5755     }
5756     /* not-reached */
5757 }
5758
5759 STATIC U32
5760 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5761 {
5762     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5763
5764     PERL_ARGS_ASSERT_ADD_DATA;
5765
5766     Renewc(RExC_rxi->data,
5767            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5768            char, struct reg_data);
5769     if(count)
5770         Renew(RExC_rxi->data->what, count + n, U8);
5771     else
5772         Newx(RExC_rxi->data->what, n, U8);
5773     RExC_rxi->data->count = count + n;
5774     Copy(s, RExC_rxi->data->what + count, n, U8);
5775     return count;
5776 }
5777
5778 /*XXX: todo make this not included in a non debugging perl, but appears to be
5779  * used anyway there, in 'use re' */
5780 #ifndef PERL_IN_XSUB_RE
5781 void
5782 Perl_reginitcolors(pTHX)
5783 {
5784     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5785     if (s) {
5786         char *t = savepv(s);
5787         int i = 0;
5788         PL_colors[0] = t;
5789         while (++i < 6) {
5790             t = strchr(t, '\t');
5791             if (t) {
5792                 *t = '\0';
5793                 PL_colors[i] = ++t;
5794             }
5795             else
5796                 PL_colors[i] = t = (char *)"";
5797         }
5798     } else {
5799         int i = 0;
5800         while (i < 6)
5801             PL_colors[i++] = (char *)"";
5802     }
5803     PL_colorset = 1;
5804 }
5805 #endif
5806
5807
5808 #ifdef TRIE_STUDY_OPT
5809 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5810     STMT_START {                                            \
5811         if (                                                \
5812               (data.flags & SCF_TRIE_RESTUDY)               \
5813               && ! restudied++                              \
5814         ) {                                                 \
5815             dOsomething;                                    \
5816             goto reStudy;                                   \
5817         }                                                   \
5818     } STMT_END
5819 #else
5820 #define CHECK_RESTUDY_GOTO_butfirst
5821 #endif
5822
5823 /*
5824  * pregcomp - compile a regular expression into internal code
5825  *
5826  * Decides which engine's compiler to call based on the hint currently in
5827  * scope
5828  */
5829
5830 #ifndef PERL_IN_XSUB_RE
5831
5832 /* return the currently in-scope regex engine (or the default if none)  */
5833
5834 regexp_engine const *
5835 Perl_current_re_engine(pTHX)
5836 {
5837     if (IN_PERL_COMPILETIME) {
5838         HV * const table = GvHV(PL_hintgv);
5839         SV **ptr;
5840
5841         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5842             return &PL_core_reg_engine;
5843         ptr = hv_fetchs(table, "regcomp", FALSE);
5844         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5845             return &PL_core_reg_engine;
5846         return INT2PTR(regexp_engine*,SvIV(*ptr));
5847     }
5848     else {
5849         SV *ptr;
5850         if (!PL_curcop->cop_hints_hash)
5851             return &PL_core_reg_engine;
5852         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5853         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5854             return &PL_core_reg_engine;
5855         return INT2PTR(regexp_engine*,SvIV(ptr));
5856     }
5857 }
5858
5859
5860 REGEXP *
5861 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5862 {
5863     regexp_engine const *eng = current_re_engine();
5864     GET_RE_DEBUG_FLAGS_DECL;
5865
5866     PERL_ARGS_ASSERT_PREGCOMP;
5867
5868     /* Dispatch a request to compile a regexp to correct regexp engine. */
5869     DEBUG_COMPILE_r({
5870         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5871                         PTR2UV(eng));
5872     });
5873     return CALLREGCOMP_ENG(eng, pattern, flags);
5874 }
5875 #endif
5876
5877 /* public(ish) entry point for the perl core's own regex compiling code.
5878  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5879  * pattern rather than a list of OPs, and uses the internal engine rather
5880  * than the current one */
5881
5882 REGEXP *
5883 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5884 {
5885     SV *pat = pattern; /* defeat constness! */
5886     PERL_ARGS_ASSERT_RE_COMPILE;
5887     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5888 #ifdef PERL_IN_XSUB_RE
5889                                 &my_reg_engine,
5890 #else
5891                                 &PL_core_reg_engine,
5892 #endif
5893                                 NULL, NULL, rx_flags, 0);
5894 }
5895
5896
5897 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5898  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5899  * point to the realloced string and length.
5900  *
5901  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5902  * stuff added */
5903
5904 static void
5905 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5906                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5907 {
5908     U8 *const src = (U8*)*pat_p;
5909     U8 *dst, *d;
5910     int n=0;
5911     STRLEN s = 0;
5912     bool do_end = 0;
5913     GET_RE_DEBUG_FLAGS_DECL;
5914
5915     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5916         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5917
5918     Newx(dst, *plen_p * 2 + 1, U8);
5919     d = dst;
5920
5921     while (s < *plen_p) {
5922         append_utf8_from_native_byte(src[s], &d);
5923         if (n < num_code_blocks) {
5924             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5925                 pRExC_state->code_blocks[n].start = d - dst - 1;
5926                 assert(*(d - 1) == '(');
5927                 do_end = 1;
5928             }
5929             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5930                 pRExC_state->code_blocks[n].end = d - dst - 1;
5931                 assert(*(d - 1) == ')');
5932                 do_end = 0;
5933                 n++;
5934             }
5935         }
5936         s++;
5937     }
5938     *d = '\0';
5939     *plen_p = d - dst;
5940     *pat_p = (char*) dst;
5941     SAVEFREEPV(*pat_p);
5942     RExC_orig_utf8 = RExC_utf8 = 1;
5943 }
5944
5945
5946
5947 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5948  * while recording any code block indices, and handling overloading,
5949  * nested qr// objects etc.  If pat is null, it will allocate a new
5950  * string, or just return the first arg, if there's only one.
5951  *
5952  * Returns the malloced/updated pat.
5953  * patternp and pat_count is the array of SVs to be concatted;
5954  * oplist is the optional list of ops that generated the SVs;
5955  * recompile_p is a pointer to a boolean that will be set if
5956  *   the regex will need to be recompiled.
5957  * delim, if non-null is an SV that will be inserted between each element
5958  */
5959
5960 static SV*
5961 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5962                 SV *pat, SV ** const patternp, int pat_count,
5963                 OP *oplist, bool *recompile_p, SV *delim)
5964 {
5965     SV **svp;
5966     int n = 0;
5967     bool use_delim = FALSE;
5968     bool alloced = FALSE;
5969
5970     /* if we know we have at least two args, create an empty string,
5971      * then concatenate args to that. For no args, return an empty string */
5972     if (!pat && pat_count != 1) {
5973         pat = newSVpvs("");
5974         SAVEFREESV(pat);
5975         alloced = TRUE;
5976     }
5977
5978     for (svp = patternp; svp < patternp + pat_count; svp++) {
5979         SV *sv;
5980         SV *rx  = NULL;
5981         STRLEN orig_patlen = 0;
5982         bool code = 0;
5983         SV *msv = use_delim ? delim : *svp;
5984         if (!msv) msv = &PL_sv_undef;
5985
5986         /* if we've got a delimiter, we go round the loop twice for each
5987          * svp slot (except the last), using the delimiter the second
5988          * time round */
5989         if (use_delim) {
5990             svp--;
5991             use_delim = FALSE;
5992         }
5993         else if (delim)
5994             use_delim = TRUE;
5995
5996         if (SvTYPE(msv) == SVt_PVAV) {
5997             /* we've encountered an interpolated array within
5998              * the pattern, e.g. /...@a..../. Expand the list of elements,
5999              * then recursively append elements.
6000              * The code in this block is based on S_pushav() */
6001
6002             AV *const av = (AV*)msv;
6003             const SSize_t maxarg = AvFILL(av) + 1;
6004             SV **array;
6005
6006             if (oplist) {
6007                 assert(oplist->op_type == OP_PADAV
6008                     || oplist->op_type == OP_RV2AV);
6009                 oplist = OP_SIBLING(oplist);
6010             }
6011
6012             if (SvRMAGICAL(av)) {
6013                 SSize_t i;
6014
6015                 Newx(array, maxarg, SV*);
6016                 SAVEFREEPV(array);
6017                 for (i=0; i < maxarg; i++) {
6018                     SV ** const svp = av_fetch(av, i, FALSE);
6019                     array[i] = svp ? *svp : &PL_sv_undef;
6020                 }
6021             }
6022             else
6023                 array = AvARRAY(av);
6024
6025             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6026                                 array, maxarg, NULL, recompile_p,
6027                                 /* $" */
6028                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6029
6030             continue;
6031         }
6032
6033
6034         /* we make the assumption here that each op in the list of
6035          * op_siblings maps to one SV pushed onto the stack,
6036          * except for code blocks, with have both an OP_NULL and
6037          * and OP_CONST.
6038          * This allows us to match up the list of SVs against the
6039          * list of OPs to find the next code block.
6040          *
6041          * Note that       PUSHMARK PADSV PADSV ..
6042          * is optimised to
6043          *                 PADRANGE PADSV  PADSV  ..
6044          * so the alignment still works. */
6045
6046         if (oplist) {
6047             if (oplist->op_type == OP_NULL
6048                 && (oplist->op_flags & OPf_SPECIAL))
6049             {
6050                 assert(n < pRExC_state->num_code_blocks);
6051                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6052                 pRExC_state->code_blocks[n].block = oplist;
6053                 pRExC_state->code_blocks[n].src_regex = NULL;
6054                 n++;
6055                 code = 1;
6056                 oplist = OP_SIBLING(oplist); /* skip CONST */
6057                 assert(oplist);
6058             }
6059             oplist = OP_SIBLING(oplist);;
6060         }
6061
6062         /* apply magic and QR overloading to arg */
6063
6064         SvGETMAGIC(msv);
6065         if (SvROK(msv) && SvAMAGIC(msv)) {
6066             SV *sv = AMG_CALLunary(msv, regexp_amg);
6067             if (sv) {
6068                 if (SvROK(sv))
6069                     sv = SvRV(sv);
6070                 if (SvTYPE(sv) != SVt_REGEXP)
6071                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6072                 msv = sv;
6073             }
6074         }
6075
6076         /* try concatenation overload ... */
6077         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6078                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6079         {
6080             sv_setsv(pat, sv);
6081             /* overloading involved: all bets are off over literal
6082              * code. Pretend we haven't seen it */
6083             pRExC_state->num_code_blocks -= n;
6084             n = 0;
6085         }
6086         else  {
6087             /* ... or failing that, try "" overload */
6088             while (SvAMAGIC(msv)
6089                     && (sv = AMG_CALLunary(msv, string_amg))
6090                     && sv != msv
6091                     &&  !(   SvROK(msv)
6092                           && SvROK(sv)
6093                           && SvRV(msv) == SvRV(sv))
6094             ) {
6095                 msv = sv;
6096                 SvGETMAGIC(msv);
6097             }
6098             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6099                 msv = SvRV(msv);
6100
6101             if (pat) {
6102                 /* this is a partially unrolled
6103                  *     sv_catsv_nomg(pat, msv);
6104                  * that allows us to adjust code block indices if
6105                  * needed */
6106                 STRLEN dlen;
6107                 char *dst = SvPV_force_nomg(pat, dlen);
6108                 orig_patlen = dlen;
6109                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6110                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6111                     sv_setpvn(pat, dst, dlen);
6112                     SvUTF8_on(pat);
6113                 }
6114                 sv_catsv_nomg(pat, msv);
6115                 rx = msv;
6116             }
6117             else
6118                 pat = msv;
6119
6120             if (code)
6121                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6122         }
6123
6124         /* extract any code blocks within any embedded qr//'s */
6125         if (rx && SvTYPE(rx) == SVt_REGEXP
6126             && RX_ENGINE((REGEXP*)rx)->op_comp)
6127         {
6128
6129             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6130             if (ri->num_code_blocks) {
6131                 int i;
6132                 /* the presence of an embedded qr// with code means
6133                  * we should always recompile: the text of the
6134                  * qr// may not have changed, but it may be a
6135                  * different closure than last time */
6136                 *recompile_p = 1;
6137                 Renew(pRExC_state->code_blocks,
6138                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6139                     struct reg_code_block);
6140                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6141
6142                 for (i=0; i < ri->num_code_blocks; i++) {
6143                     struct reg_code_block *src, *dst;
6144                     STRLEN offset =  orig_patlen
6145                         + ReANY((REGEXP *)rx)->pre_prefix;
6146                     assert(n < pRExC_state->num_code_blocks);
6147                     src = &ri->code_blocks[i];
6148                     dst = &pRExC_state->code_blocks[n];
6149                     dst->start      = src->start + offset;
6150                     dst->end        = src->end   + offset;
6151                     dst->block      = src->block;
6152                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6153                                             src->src_regex
6154                                                 ? src->src_regex
6155                                                 : (REGEXP*)rx);
6156                     n++;
6157                 }
6158             }
6159         }
6160     }
6161     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6162     if (alloced)
6163         SvSETMAGIC(pat);
6164
6165     return pat;
6166 }
6167
6168
6169
6170 /* see if there are any run-time code blocks in the pattern.
6171  * False positives are allowed */
6172
6173 static bool
6174 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6175                     char *pat, STRLEN plen)
6176 {
6177     int n = 0;
6178     STRLEN s;
6179
6180     PERL_UNUSED_CONTEXT;
6181
6182     for (s = 0; s < plen; s++) {
6183         if (n < pRExC_state->num_code_blocks
6184             && s == pRExC_state->code_blocks[n].start)
6185         {
6186             s = pRExC_state->code_blocks[n].end;
6187             n++;
6188             continue;
6189         }
6190         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6191          * positives here */
6192         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6193             (pat[s+2] == '{'
6194                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6195         )
6196             return 1;
6197     }
6198     return 0;
6199 }
6200
6201 /* Handle run-time code blocks. We will already have compiled any direct
6202  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6203  * copy of it, but with any literal code blocks blanked out and
6204  * appropriate chars escaped; then feed it into
6205  *
6206  *    eval "qr'modified_pattern'"
6207  *
6208  * For example,
6209  *
6210  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6211  *
6212  * becomes
6213  *
6214  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6215  *
6216  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6217  * and merge them with any code blocks of the original regexp.
6218  *
6219  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6220  * instead, just save the qr and return FALSE; this tells our caller that
6221  * the original pattern needs upgrading to utf8.
6222  */
6223
6224 static bool
6225 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6226     char *pat, STRLEN plen)
6227 {
6228     SV *qr;
6229
6230     GET_RE_DEBUG_FLAGS_DECL;
6231
6232     if (pRExC_state->runtime_code_qr) {
6233         /* this is the second time we've been called; this should
6234          * only happen if the main pattern got upgraded to utf8
6235          * during compilation; re-use the qr we compiled first time
6236          * round (which should be utf8 too)
6237          */
6238         qr = pRExC_state->runtime_code_qr;
6239         pRExC_state->runtime_code_qr = NULL;
6240         assert(RExC_utf8 && SvUTF8(qr));
6241     }
6242     else {
6243         int n = 0;
6244         STRLEN s;
6245         char *p, *newpat;
6246         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6247         SV *sv, *qr_ref;
6248         dSP;
6249
6250         /* determine how many extra chars we need for ' and \ escaping */
6251         for (s = 0; s < plen; s++) {
6252             if (pat[s] == '\'' || pat[s] == '\\')
6253                 newlen++;
6254         }
6255
6256         Newx(newpat, newlen, char);
6257         p = newpat;
6258         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6259
6260         for (s = 0; s < plen; s++) {
6261             if (n < pRExC_state->num_code_blocks
6262                 && s == pRExC_state->code_blocks[n].start)
6263             {
6264                 /* blank out literal code block */
6265                 assert(pat[s] == '(');
6266                 while (s <= pRExC_state->code_blocks[n].end) {
6267                     *p++ = '_';
6268                     s++;
6269                 }
6270                 s--;
6271                 n++;
6272                 continue;
6273             }
6274             if (pat[s] == '\'' || pat[s] == '\\')
6275                 *p++ = '\\';
6276             *p++ = pat[s];
6277         }
6278         *p++ = '\'';
6279         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6280             *p++ = 'x';
6281         *p++ = '\0';
6282         DEBUG_COMPILE_r({
6283             PerlIO_printf(Perl_debug_log,
6284                 "%sre-parsing pattern for runtime code:%s %s\n",
6285                 PL_colors[4],PL_colors[5],newpat);
6286         });
6287
6288         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6289         Safefree(newpat);
6290
6291         ENTER;
6292         SAVETMPS;
6293         PUSHSTACKi(PERLSI_REQUIRE);
6294         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6295          * parsing qr''; normally only q'' does this. It also alters
6296          * hints handling */
6297         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6298         SvREFCNT_dec_NN(sv);
6299         SPAGAIN;
6300         qr_ref = POPs;
6301         PUTBACK;
6302         {
6303             SV * const errsv = ERRSV;
6304             if (SvTRUE_NN(errsv))
6305             {
6306                 Safefree(pRExC_state->code_blocks);
6307                 /* use croak_sv ? */
6308                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6309             }
6310         }
6311         assert(SvROK(qr_ref));
6312         qr = SvRV(qr_ref);
6313         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6314         /* the leaving below frees the tmp qr_ref.
6315          * Give qr a life of its own */
6316         SvREFCNT_inc(qr);
6317         POPSTACK;
6318         FREETMPS;
6319         LEAVE;
6320
6321     }
6322
6323     if (!RExC_utf8 && SvUTF8(qr)) {
6324         /* first time through; the pattern got upgraded; save the
6325          * qr for the next time through */
6326         assert(!pRExC_state->runtime_code_qr);
6327         pRExC_state->runtime_code_qr = qr;
6328         return 0;
6329     }
6330
6331
6332     /* extract any code blocks within the returned qr//  */
6333
6334
6335     /* merge the main (r1) and run-time (r2) code blocks into one */
6336     {
6337         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6338         struct reg_code_block *new_block, *dst;
6339         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6340         int i1 = 0, i2 = 0;
6341
6342         if (!r2->num_code_blocks) /* we guessed wrong */
6343         {
6344             SvREFCNT_dec_NN(qr);
6345             return 1;
6346         }
6347
6348         Newx(new_block,
6349             r1->num_code_blocks + r2->num_code_blocks,
6350             struct reg_code_block);
6351         dst = new_block;
6352
6353         while (    i1 < r1->num_code_blocks
6354                 || i2 < r2->num_code_blocks)
6355         {
6356             struct reg_code_block *src;
6357             bool is_qr = 0;
6358
6359             if (i1 == r1->num_code_blocks) {
6360                 src = &r2->code_blocks[i2++];
6361                 is_qr = 1;
6362             }
6363             else if (i2 == r2->num_code_blocks)
6364                 src = &r1->code_blocks[i1++];
6365             else if (  r1->code_blocks[i1].start
6366                      < r2->code_blocks[i2].start)
6367             {
6368                 src = &r1->code_blocks[i1++];
6369                 assert(src->end < r2->code_blocks[i2].start);
6370             }
6371             else {
6372                 assert(  r1->code_blocks[i1].start
6373                        > r2->code_blocks[i2].start);
6374                 src = &r2->code_blocks[i2++];
6375                 is_qr = 1;
6376                 assert(src->end < r1->code_blocks[i1].start);
6377             }
6378
6379             assert(pat[src->start] == '(');
6380             assert(pat[src->end]   == ')');
6381             dst->start      = src->start;
6382             dst->end        = src->end;
6383             dst->block      = src->block;
6384             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6385                                     : src->src_regex;
6386             dst++;
6387         }
6388         r1->num_code_blocks += r2->num_code_blocks;
6389         Safefree(r1->code_blocks);
6390         r1->code_blocks = new_block;
6391     }
6392
6393     SvREFCNT_dec_NN(qr);
6394     return 1;
6395 }
6396
6397
6398 STATIC bool
6399 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6400                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6401                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6402                       STRLEN longest_length, bool eol, bool meol)
6403 {
6404     /* This is the common code for setting up the floating and fixed length
6405      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6406      * as to whether succeeded or not */
6407
6408     I32 t;
6409     SSize_t ml;
6410
6411     if (! (longest_length
6412            || (eol /* Can't have SEOL and MULTI */
6413                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6414           )
6415             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6416         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6417     {
6418         return FALSE;
6419     }
6420
6421     /* copy the information about the longest from the reg_scan_data
6422         over to the program. */
6423     if (SvUTF8(sv_longest)) {
6424         *rx_utf8 = sv_longest;
6425         *rx_substr = NULL;
6426     } else {
6427         *rx_substr = sv_longest;
6428         *rx_utf8 = NULL;
6429     }
6430     /* end_shift is how many chars that must be matched that
6431         follow this item. We calculate it ahead of time as once the
6432         lookbehind offset is added in we lose the ability to correctly
6433         calculate it.*/
6434     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6435     *rx_end_shift = ml - offset
6436         - longest_length + (SvTAIL(sv_longest) != 0)
6437         + lookbehind;
6438
6439     t = (eol/* Can't have SEOL and MULTI */
6440          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6441     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6442
6443     return TRUE;
6444 }
6445
6446 /*
6447  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6448  * regular expression into internal code.
6449  * The pattern may be passed either as:
6450  *    a list of SVs (patternp plus pat_count)
6451  *    a list of OPs (expr)
6452  * If both are passed, the SV list is used, but the OP list indicates
6453  * which SVs are actually pre-compiled code blocks
6454  *
6455  * The SVs in the list have magic and qr overloading applied to them (and
6456  * the list may be modified in-place with replacement SVs in the latter
6457  * case).
6458  *
6459  * If the pattern hasn't changed from old_re, then old_re will be
6460  * returned.
6461  *
6462  * eng is the current engine. If that engine has an op_comp method, then
6463  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6464  * do the initial concatenation of arguments and pass on to the external
6465  * engine.
6466  *
6467  * If is_bare_re is not null, set it to a boolean indicating whether the
6468  * arg list reduced (after overloading) to a single bare regex which has
6469  * been returned (i.e. /$qr/).
6470  *
6471  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6472  *
6473  * pm_flags contains the PMf_* flags, typically based on those from the
6474  * pm_flags field of the related PMOP. Currently we're only interested in
6475  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6476  *
6477  * We can't allocate space until we know how big the compiled form will be,
6478  * but we can't compile it (and thus know how big it is) until we've got a
6479  * place to put the code.  So we cheat:  we compile it twice, once with code
6480  * generation turned off and size counting turned on, and once "for real".
6481  * This also means that we don't allocate space until we are sure that the
6482  * thing really will compile successfully, and we never have to move the
6483  * code and thus invalidate pointers into it.  (Note that it has to be in
6484  * one piece because free() must be able to free it all.) [NB: not true in perl]
6485  *
6486  * Beware that the optimization-preparation code in here knows about some
6487  * of the structure of the compiled regexp.  [I'll say.]
6488  */
6489
6490 REGEXP *
6491 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6492                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6493                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6494 {
6495     REGEXP *rx;
6496     struct regexp *r;
6497     regexp_internal *ri;
6498     STRLEN plen;
6499     char *exp;
6500     regnode *scan;
6501     I32 flags;
6502     SSize_t minlen = 0;
6503     U32 rx_flags;
6504     SV *pat;
6505     SV *code_blocksv = NULL;
6506     SV** new_patternp = patternp;
6507
6508     /* these are all flags - maybe they should be turned
6509      * into a single int with different bit masks */
6510     I32 sawlookahead = 0;
6511     I32 sawplus = 0;
6512     I32 sawopen = 0;
6513     I32 sawminmod = 0;
6514
6515     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6516     bool recompile = 0;
6517     bool runtime_code = 0;
6518     scan_data_t data;
6519     RExC_state_t RExC_state;
6520     RExC_state_t * const pRExC_state = &RExC_state;
6521 #ifdef TRIE_STUDY_OPT
6522     int restudied = 0;
6523     RExC_state_t copyRExC_state;
6524 #endif
6525     GET_RE_DEBUG_FLAGS_DECL;
6526
6527     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6528
6529     DEBUG_r(if (!PL_colorset) reginitcolors());
6530
6531 #ifndef PERL_IN_XSUB_RE
6532     /* Initialize these here instead of as-needed, as is quick and avoids
6533      * having to test them each time otherwise */
6534     if (! PL_AboveLatin1) {
6535         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6536         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6537         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6538         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6539         PL_HasMultiCharFold =
6540                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6541
6542         /* This is calculated here, because the Perl program that generates the
6543          * static global ones doesn't currently have access to
6544          * NUM_ANYOF_CODE_POINTS */
6545         PL_InBitmap = _new_invlist(2);
6546         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6547                                                     NUM_ANYOF_CODE_POINTS - 1);
6548     }
6549 #endif
6550
6551     pRExC_state->code_blocks = NULL;
6552     pRExC_state->num_code_blocks = 0;
6553
6554     if (is_bare_re)
6555         *is_bare_re = FALSE;
6556
6557     if (expr && (expr->op_type == OP_LIST ||
6558                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6559         /* allocate code_blocks if needed */
6560         OP *o;
6561         int ncode = 0;
6562
6563         for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6564             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6565                 ncode++; /* count of DO blocks */
6566         if (ncode) {
6567             pRExC_state->num_code_blocks = ncode;
6568             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6569         }
6570     }
6571
6572     if (!pat_count) {
6573         /* compile-time pattern with just OP_CONSTs and DO blocks */
6574
6575         int n;
6576         OP *o;
6577
6578         /* find how many CONSTs there are */
6579         assert(expr);
6580         n = 0;
6581         if (expr->op_type == OP_CONST)
6582             n = 1;
6583         else
6584             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6585                 if (o->op_type == OP_CONST)
6586                     n++;
6587             }
6588
6589         /* fake up an SV array */
6590
6591         assert(!new_patternp);
6592         Newx(new_patternp, n, SV*);
6593         SAVEFREEPV(new_patternp);
6594         pat_count = n;
6595
6596         n = 0;
6597         if (expr->op_type == OP_CONST)
6598             new_patternp[n] = cSVOPx_sv(expr);
6599         else
6600             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6601                 if (o->op_type == OP_CONST)
6602                     new_patternp[n++] = cSVOPo_sv;
6603             }
6604
6605     }
6606
6607     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6608         "Assembling pattern from %d elements%s\n", pat_count,
6609             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6610
6611     /* set expr to the first arg op */
6612
6613     if (pRExC_state->num_code_blocks
6614          && expr->op_type != OP_CONST)
6615     {
6616             expr = cLISTOPx(expr)->op_first;
6617             assert(   expr->op_type == OP_PUSHMARK
6618                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6619                    || expr->op_type == OP_PADRANGE);
6620             expr = OP_SIBLING(expr);
6621     }
6622
6623     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6624                         expr, &recompile, NULL);
6625
6626     /* handle bare (possibly after overloading) regex: foo =~ $re */
6627     {
6628         SV *re = pat;
6629         if (SvROK(re))
6630             re = SvRV(re);
6631         if (SvTYPE(re) == SVt_REGEXP) {
6632             if (is_bare_re)
6633                 *is_bare_re = TRUE;
6634             SvREFCNT_inc(re);
6635             Safefree(pRExC_state->code_blocks);
6636             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6637                 "Precompiled pattern%s\n",
6638                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6639
6640             return (REGEXP*)re;
6641         }
6642     }
6643
6644     exp = SvPV_nomg(pat, plen);
6645
6646     if (!eng->op_comp) {
6647         if ((SvUTF8(pat) && IN_BYTES)
6648                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6649         {
6650             /* make a temporary copy; either to convert to bytes,
6651              * or to avoid repeating get-magic / overloaded stringify */
6652             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6653                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6654         }
6655         Safefree(pRExC_state->code_blocks);
6656         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6657     }
6658
6659     /* ignore the utf8ness if the pattern is 0 length */
6660     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6661     RExC_uni_semantics = 0;
6662     RExC_contains_locale = 0;
6663     RExC_contains_i = 0;
6664     pRExC_state->runtime_code_qr = NULL;
6665     RExC_frame_head= NULL;
6666     RExC_frame_last= NULL;
6667     RExC_frame_count= 0;
6668
6669     DEBUG_r({
6670         RExC_mysv1= sv_newmortal();
6671         RExC_mysv2= sv_newmortal();
6672     });
6673     DEBUG_COMPILE_r({
6674             SV *dsv= sv_newmortal();
6675             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6676             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6677                           PL_colors[4],PL_colors[5],s);
6678         });
6679
6680   redo_first_pass:
6681     /* we jump here if we upgrade the pattern to utf8 and have to
6682      * recompile */
6683
6684     if ((pm_flags & PMf_USE_RE_EVAL)
6685                 /* this second condition covers the non-regex literal case,
6686                  * i.e.  $foo =~ '(?{})'. */
6687                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6688     )
6689         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6690
6691     /* return old regex if pattern hasn't changed */
6692     /* XXX: note in the below we have to check the flags as well as the
6693      * pattern.
6694      *
6695      * Things get a touch tricky as we have to compare the utf8 flag
6696      * independently from the compile flags.  */
6697
6698     if (   old_re
6699         && !recompile
6700         && !!RX_UTF8(old_re) == !!RExC_utf8
6701         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6702         && RX_PRECOMP(old_re)
6703         && RX_PRELEN(old_re) == plen
6704         && memEQ(RX_PRECOMP(old_re), exp, plen)
6705         && !runtime_code /* with runtime code, always recompile */ )
6706     {
6707         Safefree(pRExC_state->code_blocks);
6708         return old_re;
6709     }
6710
6711     rx_flags = orig_rx_flags;
6712
6713     if (rx_flags & PMf_FOLD) {
6714         RExC_contains_i = 1;
6715     }
6716     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6717
6718         /* Set to use unicode semantics if the pattern is in utf8 and has the
6719          * 'depends' charset specified, as it means unicode when utf8  */
6720         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6721     }
6722
6723     RExC_precomp = exp;
6724     RExC_flags = rx_flags;
6725     RExC_pm_flags = pm_flags;
6726
6727     if (runtime_code) {
6728         if (TAINTING_get && TAINT_get)
6729             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6730
6731         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6732             /* whoops, we have a non-utf8 pattern, whilst run-time code
6733              * got compiled as utf8. Try again with a utf8 pattern */
6734             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6735                                     pRExC_state->num_code_blocks);
6736             goto redo_first_pass;
6737         }
6738     }
6739     assert(!pRExC_state->runtime_code_qr);
6740
6741     RExC_sawback = 0;
6742
6743     RExC_seen = 0;
6744     RExC_maxlen = 0;
6745     RExC_in_lookbehind = 0;
6746     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6747     RExC_extralen = 0;
6748     RExC_override_recoding = 0;
6749     RExC_in_multi_char_class = 0;
6750
6751     /* First pass: determine size, legality. */
6752     RExC_parse = exp;
6753     RExC_start = exp;
6754     RExC_end = exp + plen;
6755     RExC_naughty = 0;
6756     RExC_npar = 1;
6757     RExC_nestroot = 0;
6758     RExC_size = 0L;
6759     RExC_emit = (regnode *) &RExC_emit_dummy;
6760     RExC_whilem_seen = 0;
6761     RExC_open_parens = NULL;
6762     RExC_close_parens = NULL;
6763     RExC_opend = NULL;
6764     RExC_paren_names = NULL;
6765 #ifdef DEBUGGING
6766     RExC_paren_name_list = NULL;
6767 #endif
6768     RExC_recurse = NULL;
6769     RExC_study_chunk_recursed = NULL;
6770     RExC_study_chunk_recursed_bytes= 0;
6771     RExC_recurse_count = 0;
6772     pRExC_state->code_index = 0;
6773
6774 #if 0 /* REGC() is (currently) a NOP at the first pass.
6775        * Clever compilers notice this and complain. --jhi */
6776     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6777 #endif
6778     DEBUG_PARSE_r(
6779         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6780         RExC_lastnum=0;
6781         RExC_lastparse=NULL;
6782     );
6783     /* reg may croak on us, not giving us a chance to free
6784        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6785        need it to survive as long as the regexp (qr/(?{})/).
6786        We must check that code_blocksv is not already set, because we may
6787        have jumped back to restart the sizing pass. */
6788     if (pRExC_state->code_blocks && !code_blocksv) {
6789         code_blocksv = newSV_type(SVt_PV);
6790         SAVEFREESV(code_blocksv);
6791         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6792         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6793     }
6794     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6795         /* It's possible to write a regexp in ascii that represents Unicode
6796         codepoints outside of the byte range, such as via \x{100}. If we
6797         detect such a sequence we have to convert the entire pattern to utf8
6798         and then recompile, as our sizing calculation will have been based
6799         on 1 byte == 1 character, but we will need to use utf8 to encode
6800         at least some part of the pattern, and therefore must convert the whole
6801         thing.
6802         -- dmq */
6803         if (flags & RESTART_UTF8) {
6804             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6805                                     pRExC_state->num_code_blocks);
6806             goto redo_first_pass;
6807         }
6808         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6809     }
6810     if (code_blocksv)
6811         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6812
6813     DEBUG_PARSE_r({
6814         PerlIO_printf(Perl_debug_log,
6815             "Required size %"IVdf" nodes\n"
6816             "Starting second pass (creation)\n",
6817             (IV)RExC_size);
6818         RExC_lastnum=0;
6819         RExC_lastparse=NULL;
6820     });
6821
6822     /* The first pass could have found things that force Unicode semantics */
6823     if ((RExC_utf8 || RExC_uni_semantics)
6824          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6825     {
6826         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6827     }
6828
6829     /* Small enough for pointer-storage convention?
6830        If extralen==0, this means that we will not need long jumps. */
6831     if (RExC_size >= 0x10000L && RExC_extralen)
6832         RExC_size += RExC_extralen;
6833     else
6834         RExC_extralen = 0;
6835     if (RExC_whilem_seen > 15)
6836         RExC_whilem_seen = 15;
6837
6838     /* Allocate space and zero-initialize. Note, the two step process
6839        of zeroing when in debug mode, thus anything assigned has to
6840        happen after that */
6841     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6842     r = ReANY(rx);
6843     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6844          char, regexp_internal);
6845     if ( r == NULL || ri == NULL )
6846         FAIL("Regexp out of space");
6847 #ifdef DEBUGGING
6848     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6849     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6850          char);
6851 #else
6852     /* bulk initialize base fields with 0. */
6853     Zero(ri, sizeof(regexp_internal), char);
6854 #endif
6855
6856     /* non-zero initialization begins here */
6857     RXi_SET( r, ri );
6858     r->engine= eng;
6859     r->extflags = rx_flags;
6860     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6861
6862     if (pm_flags & PMf_IS_QR) {
6863         ri->code_blocks = pRExC_state->code_blocks;
6864         ri->num_code_blocks = pRExC_state->num_code_blocks;
6865     }
6866     else
6867     {
6868         int n;
6869         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6870             if (pRExC_state->code_blocks[n].src_regex)
6871                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6872         SAVEFREEPV(pRExC_state->code_blocks);
6873     }
6874
6875     {
6876         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6877         bool has_charset = (get_regex_charset(r->extflags)
6878                                                     != REGEX_DEPENDS_CHARSET);
6879
6880         /* The caret is output if there are any defaults: if not all the STD
6881          * flags are set, or if no character set specifier is needed */
6882         bool has_default =
6883                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6884                     || ! has_charset);
6885         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6886                                                    == REG_RUN_ON_COMMENT_SEEN);
6887         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6888                             >> RXf_PMf_STD_PMMOD_SHIFT);
6889         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6890         char *p;
6891         /* Allocate for the worst case, which is all the std flags are turned
6892          * on.  If more precision is desired, we could do a population count of
6893          * the flags set.  This could be done with a small lookup table, or by
6894          * shifting, masking and adding, or even, when available, assembly
6895          * language for a machine-language population count.
6896          * We never output a minus, as all those are defaults, so are
6897          * covered by the caret */
6898         const STRLEN wraplen = plen + has_p + has_runon
6899             + has_default       /* If needs a caret */
6900
6901                 /* If needs a character set specifier */
6902             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6903             + (sizeof(STD_PAT_MODS) - 1)
6904             + (sizeof("(?:)") - 1);
6905
6906         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6907         r->xpv_len_u.xpvlenu_pv = p;
6908         if (RExC_utf8)
6909             SvFLAGS(rx) |= SVf_UTF8;
6910         *p++='('; *p++='?';
6911
6912         /* If a default, cover it using the caret */
6913         if (has_default) {
6914             *p++= DEFAULT_PAT_MOD;
6915         }
6916         if (has_charset) {
6917             STRLEN len;
6918             const char* const name = get_regex_charset_name(r->extflags, &len);
6919             Copy(name, p, len, char);
6920             p += len;
6921         }
6922         if (has_p)
6923             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6924         {
6925             char ch;
6926             while((ch = *fptr++)) {
6927                 if(reganch & 1)
6928                     *p++ = ch;
6929                 reganch >>= 1;
6930             }
6931         }
6932
6933         *p++ = ':';
6934         Copy(RExC_precomp, p, plen, char);
6935         assert ((RX_WRAPPED(rx) - p) < 16);
6936         r->pre_prefix = p - RX_WRAPPED(rx);
6937         p += plen;
6938         if (has_runon)
6939             *p++ = '\n';
6940         *p++ = ')';
6941         *p = 0;
6942         SvCUR_set(rx, p - RX_WRAPPED(rx));
6943     }
6944
6945     r->intflags = 0;
6946     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6947
6948     /* setup various meta data about recursion, this all requires
6949      * RExC_npar to be correctly set, and a bit later on we clear it */
6950     if (RExC_seen & REG_RECURSE_SEEN) {
6951         Newxz(RExC_open_parens, RExC_npar,regnode *);
6952         SAVEFREEPV(RExC_open_parens);
6953         Newxz(RExC_close_parens,RExC_npar,regnode *);
6954         SAVEFREEPV(RExC_close_parens);
6955     }
6956     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6957         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6958          * So its 1 if there are no parens. */
6959         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6960                                          ((RExC_npar & 0x07) != 0);
6961         Newx(RExC_study_chunk_recursed,
6962              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6963         SAVEFREEPV(RExC_study_chunk_recursed);
6964     }
6965
6966     /* Useful during FAIL. */
6967 #ifdef RE_TRACK_PATTERN_OFFSETS
6968     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6969     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6970                           "%s %"UVuf" bytes for offset annotations.\n",
6971                           ri->u.offsets ? "Got" : "Couldn't get",
6972                           (UV)((2*RExC_size+1) * sizeof(U32))));
6973 #endif
6974     SetProgLen(ri,RExC_size);
6975     RExC_rx_sv = rx;
6976     RExC_rx = r;
6977     RExC_rxi = ri;
6978
6979     /* Second pass: emit code. */
6980     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6981     RExC_pm_flags = pm_flags;
6982     RExC_parse = exp;
6983     RExC_end = exp + plen;
6984     RExC_naughty = 0;
6985     RExC_npar = 1;
6986     RExC_emit_start = ri->program;
6987     RExC_emit = ri->program;
6988     RExC_emit_bound = ri->program + RExC_size + 1;
6989     pRExC_state->code_index = 0;
6990
6991     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6992     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6993         ReREFCNT_dec(rx);
6994         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6995     }
6996     /* XXXX To minimize changes to RE engine we always allocate
6997        3-units-long substrs field. */
6998     Newx(r->substrs, 1, struct reg_substr_data);
6999     if (RExC_recurse_count) {
7000         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7001         SAVEFREEPV(RExC_recurse);
7002     }
7003
7004 reStudy:
7005     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7006     DEBUG_r(
7007         RExC_study_chunk_recursed_count= 0;
7008     );
7009     Zero(r->substrs, 1, struct reg_substr_data);
7010     if (RExC_study_chunk_recursed) {
7011         Zero(RExC_study_chunk_recursed,
7012              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7013     }
7014
7015
7016 #ifdef TRIE_STUDY_OPT
7017     if (!restudied) {
7018         StructCopy(&zero_scan_data, &data, scan_data_t);
7019         copyRExC_state = RExC_state;
7020     } else {
7021         U32 seen=RExC_seen;
7022         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7023
7024         RExC_state = copyRExC_state;
7025         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7026             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7027         else
7028             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7029         StructCopy(&zero_scan_data, &data, scan_data_t);
7030     }
7031 #else
7032     StructCopy(&zero_scan_data, &data, scan_data_t);
7033 #endif
7034
7035     /* Dig out information for optimizations. */
7036     r->extflags = RExC_flags; /* was pm_op */
7037     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7038
7039     if (UTF)
7040         SvUTF8_on(rx);  /* Unicode in it? */
7041     ri->regstclass = NULL;
7042     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
7043         r->intflags |= PREGf_NAUGHTY;
7044     scan = ri->program + 1;             /* First BRANCH. */
7045
7046     /* testing for BRANCH here tells us whether there is "must appear"
7047        data in the pattern. If there is then we can use it for optimisations */
7048     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7049                                                   */
7050         SSize_t fake;
7051         STRLEN longest_float_length, longest_fixed_length;
7052         regnode_ssc ch_class; /* pointed to by data */
7053         int stclass_flag;
7054         SSize_t last_close = 0; /* pointed to by data */
7055         regnode *first= scan;
7056         regnode *first_next= regnext(first);
7057         /*
7058          * Skip introductions and multiplicators >= 1
7059          * so that we can extract the 'meat' of the pattern that must
7060          * match in the large if() sequence following.
7061          * NOTE that EXACT is NOT covered here, as it is normally
7062          * picked up by the optimiser separately.
7063          *
7064          * This is unfortunate as the optimiser isnt handling lookahead
7065          * properly currently.
7066          *
7067          */
7068         while ((OP(first) == OPEN && (sawopen = 1)) ||
7069                /* An OR of *one* alternative - should not happen now. */
7070             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7071             /* for now we can't handle lookbehind IFMATCH*/
7072             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7073             (OP(first) == PLUS) ||
7074             (OP(first) == MINMOD) ||
7075                /* An {n,m} with n>0 */
7076             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7077             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7078         {
7079                 /*
7080                  * the only op that could be a regnode is PLUS, all the rest
7081                  * will be regnode_1 or regnode_2.
7082                  *
7083                  * (yves doesn't think this is true)
7084                  */
7085                 if (OP(first) == PLUS)
7086                     sawplus = 1;
7087                 else {
7088                     if (OP(first) == MINMOD)
7089                         sawminmod = 1;
7090                     first += regarglen[OP(first)];
7091                 }
7092                 first = NEXTOPER(first);
7093                 first_next= regnext(first);
7094         }
7095
7096         /* Starting-point info. */
7097       again:
7098         DEBUG_PEEP("first:",first,0);
7099         /* Ignore EXACT as we deal with it later. */
7100         if (PL_regkind[OP(first)] == EXACT) {
7101             if (OP(first) == EXACT)
7102                 NOOP;   /* Empty, get anchored substr later. */
7103             else
7104                 ri->regstclass = first;
7105         }
7106 #ifdef TRIE_STCLASS
7107         else if (PL_regkind[OP(first)] == TRIE &&
7108                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7109         {
7110             /* this can happen only on restudy */
7111             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7112         }
7113 #endif
7114         else if (REGNODE_SIMPLE(OP(first)))
7115             ri->regstclass = first;
7116         else if (PL_regkind[OP(first)] == BOUND ||
7117                  PL_regkind[OP(first)] == NBOUND)
7118             ri->regstclass = first;
7119         else if (PL_regkind[OP(first)] == BOL) {
7120             r->intflags |= (OP(first) == MBOL
7121                            ? PREGf_ANCH_MBOL
7122                            : PREGf_ANCH_SBOL);
7123             first = NEXTOPER(first);
7124             goto again;
7125         }
7126         else if (OP(first) == GPOS) {
7127             r->intflags |= PREGf_ANCH_GPOS;
7128             first = NEXTOPER(first);
7129             goto again;
7130         }
7131         else if ((!sawopen || !RExC_sawback) &&
7132             !sawlookahead &&
7133             (OP(first) == STAR &&
7134             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7135             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7136         {
7137             /* turn .* into ^.* with an implied $*=1 */
7138             const int type =
7139                 (OP(NEXTOPER(first)) == REG_ANY)
7140                     ? PREGf_ANCH_MBOL
7141                     : PREGf_ANCH_SBOL;
7142             r->intflags |= (type | PREGf_IMPLICIT);
7143             first = NEXTOPER(first);
7144             goto again;
7145         }
7146         if (sawplus && !sawminmod && !sawlookahead
7147             && (!sawopen || !RExC_sawback)
7148             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7149             /* x+ must match at the 1st pos of run of x's */
7150             r->intflags |= PREGf_SKIP;
7151
7152         /* Scan is after the zeroth branch, first is atomic matcher. */
7153 #ifdef TRIE_STUDY_OPT
7154         DEBUG_PARSE_r(
7155             if (!restudied)
7156                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7157                               (IV)(first - scan + 1))
7158         );
7159 #else
7160         DEBUG_PARSE_r(
7161             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7162                 (IV)(first - scan + 1))
7163         );
7164 #endif
7165
7166
7167         /*
7168         * If there's something expensive in the r.e., find the
7169         * longest literal string that must appear and make it the
7170         * regmust.  Resolve ties in favor of later strings, since
7171         * the regstart check works with the beginning of the r.e.
7172         * and avoiding duplication strengthens checking.  Not a
7173         * strong reason, but sufficient in the absence of others.
7174         * [Now we resolve ties in favor of the earlier string if
7175         * it happens that c_offset_min has been invalidated, since the
7176         * earlier string may buy us something the later one won't.]
7177         */
7178
7179         data.longest_fixed = newSVpvs("");
7180         data.longest_float = newSVpvs("");
7181         data.last_found = newSVpvs("");
7182         data.longest = &(data.longest_fixed);
7183         ENTER_with_name("study_chunk");
7184         SAVEFREESV(data.longest_fixed);
7185         SAVEFREESV(data.longest_float);
7186         SAVEFREESV(data.last_found);
7187         first = scan;
7188         if (!ri->regstclass) {
7189             ssc_init(pRExC_state, &ch_class);
7190             data.start_class = &ch_class;
7191             stclass_flag = SCF_DO_STCLASS_AND;
7192         } else                          /* XXXX Check for BOUND? */
7193             stclass_flag = 0;
7194         data.last_closep = &last_close;
7195
7196         DEBUG_RExC_seen();
7197         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7198                              scan + RExC_size, /* Up to end */
7199             &data, -1, 0, NULL,
7200             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7201                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7202             0);
7203
7204
7205         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7206
7207
7208         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7209              && data.last_start_min == 0 && data.last_end > 0
7210              && !RExC_seen_zerolen
7211              && !(RExC_seen & REG_VERBARG_SEEN)
7212              && !(RExC_seen & REG_GPOS_SEEN)
7213         ){
7214             r->extflags |= RXf_CHECK_ALL;
7215         }
7216         scan_commit(pRExC_state, &data,&minlen,0);
7217
7218         longest_float_length = CHR_SVLEN(data.longest_float);
7219
7220         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7221                    && data.offset_fixed == data.offset_float_min
7222                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7223             && S_setup_longest (aTHX_ pRExC_state,
7224                                     data.longest_float,
7225                                     &(r->float_utf8),
7226                                     &(r->float_substr),
7227                                     &(r->float_end_shift),
7228                                     data.lookbehind_float,
7229                                     data.offset_float_min,
7230                                     data.minlen_float,
7231                                     longest_float_length,
7232                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7233                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7234         {
7235             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7236             r->float_max_offset = data.offset_float_max;
7237             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7238                 r->float_max_offset -= data.lookbehind_float;
7239             SvREFCNT_inc_simple_void_NN(data.longest_float);
7240         }
7241         else {
7242             r->float_substr = r->float_utf8 = NULL;
7243             longest_float_length = 0;
7244         }
7245
7246         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7247
7248         if (S_setup_longest (aTHX_ pRExC_state,
7249                                 data.longest_fixed,
7250                                 &(r->anchored_utf8),
7251                                 &(r->anchored_substr),
7252                                 &(r->anchored_end_shift),
7253                                 data.lookbehind_fixed,
7254                                 data.offset_fixed,
7255                                 data.minlen_fixed,
7256                                 longest_fixed_length,
7257                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7258                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7259         {
7260             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7261             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7262         }
7263         else {
7264             r->anchored_substr = r->anchored_utf8 = NULL;
7265             longest_fixed_length = 0;
7266         }
7267         LEAVE_with_name("study_chunk");
7268
7269         if (ri->regstclass
7270             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7271             ri->regstclass = NULL;
7272
7273         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7274             && stclass_flag
7275             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7276             && is_ssc_worth_it(pRExC_state, data.start_class))
7277         {
7278             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7279
7280             ssc_finalize(pRExC_state, data.start_class);
7281
7282             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7283             StructCopy(data.start_class,
7284                        (regnode_ssc*)RExC_rxi->data->data[n],
7285                        regnode_ssc);
7286             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7287             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7288             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7289                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7290                       PerlIO_printf(Perl_debug_log,
7291                                     "synthetic stclass \"%s\".\n",
7292                                     SvPVX_const(sv));});
7293             data.start_class = NULL;
7294         }
7295
7296         /* A temporary algorithm prefers floated substr to fixed one to dig
7297          * more info. */
7298         if (longest_fixed_length > longest_float_length) {
7299             r->substrs->check_ix = 0;
7300             r->check_end_shift = r->anchored_end_shift;
7301             r->check_substr = r->anchored_substr;
7302             r->check_utf8 = r->anchored_utf8;
7303             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7304             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7305                 r->intflags |= PREGf_NOSCAN;
7306         }
7307         else {
7308             r->substrs->check_ix = 1;
7309             r->check_end_shift = r->float_end_shift;
7310             r->check_substr = r->float_substr;
7311             r->check_utf8 = r->float_utf8;
7312             r->check_offset_min = r->float_min_offset;
7313             r->check_offset_max = r->float_max_offset;
7314         }
7315         if ((r->check_substr || r->check_utf8) ) {
7316             r->extflags |= RXf_USE_INTUIT;
7317             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7318                 r->extflags |= RXf_INTUIT_TAIL;
7319         }
7320         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7321
7322         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7323         if ( (STRLEN)minlen < longest_float_length )
7324             minlen= longest_float_length;
7325         if ( (STRLEN)minlen < longest_fixed_length )
7326             minlen= longest_fixed_length;
7327         */
7328     }
7329     else {
7330         /* Several toplevels. Best we can is to set minlen. */
7331         SSize_t fake;
7332         regnode_ssc ch_class;
7333         SSize_t last_close = 0;
7334
7335         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7336
7337         scan = ri->program + 1;
7338         ssc_init(pRExC_state, &ch_class);
7339         data.start_class = &ch_class;
7340         data.last_closep = &last_close;
7341
7342         DEBUG_RExC_seen();
7343         minlen = study_chunk(pRExC_state,
7344             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7345             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7346                                                       ? SCF_TRIE_DOING_RESTUDY
7347                                                       : 0),
7348             0);
7349
7350         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7351
7352         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7353                 = r->float_substr = r->float_utf8 = NULL;
7354
7355         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7356             && is_ssc_worth_it(pRExC_state, data.start_class))
7357         {
7358             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7359
7360             ssc_finalize(pRExC_state, data.start_class);
7361
7362             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7363             StructCopy(data.start_class,
7364                        (regnode_ssc*)RExC_rxi->data->data[n],
7365                        regnode_ssc);
7366             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7367             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7368             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7369                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7370                       PerlIO_printf(Perl_debug_log,
7371                                     "synthetic stclass \"%s\".\n",
7372                                     SvPVX_const(sv));});
7373             data.start_class = NULL;
7374         }
7375     }
7376
7377     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7378         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7379         r->maxlen = REG_INFTY;
7380     }
7381     else {
7382         r->maxlen = RExC_maxlen;
7383     }
7384
7385     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7386        the "real" pattern. */
7387     DEBUG_OPTIMISE_r({
7388         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7389                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7390     });
7391     r->minlenret = minlen;
7392     if (r->minlen < minlen)
7393         r->minlen = minlen;
7394
7395     if (RExC_seen & REG_GPOS_SEEN)
7396         r->intflags |= PREGf_GPOS_SEEN;
7397     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7398         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7399                                                 lookbehind */
7400     if (pRExC_state->num_code_blocks)
7401         r->extflags |= RXf_EVAL_SEEN;
7402     if (RExC_seen & REG_CANY_SEEN)
7403         r->intflags |= PREGf_CANY_SEEN;
7404     if (RExC_seen & REG_VERBARG_SEEN)
7405     {
7406         r->intflags |= PREGf_VERBARG_SEEN;
7407         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7408     }
7409     if (RExC_seen & REG_CUTGROUP_SEEN)
7410         r->intflags |= PREGf_CUTGROUP_SEEN;
7411     if (pm_flags & PMf_USE_RE_EVAL)
7412         r->intflags |= PREGf_USE_RE_EVAL;
7413     if (RExC_paren_names)
7414         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7415     else
7416         RXp_PAREN_NAMES(r) = NULL;
7417
7418     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7419      * so it can be used in pp.c */
7420     if (r->intflags & PREGf_ANCH)
7421         r->extflags |= RXf_IS_ANCHORED;
7422
7423
7424     {
7425         /* this is used to identify "special" patterns that might result
7426          * in Perl NOT calling the regex engine and instead doing the match "itself",
7427          * particularly special cases in split//. By having the regex compiler
7428          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7429          * we avoid weird issues with equivalent patterns resulting in different behavior,
7430          * AND we allow non Perl engines to get the same optimizations by the setting the
7431          * flags appropriately - Yves */
7432         regnode *first = ri->program + 1;
7433         U8 fop = OP(first);
7434         regnode *next = NEXTOPER(first);
7435         U8 nop = OP(next);
7436
7437         if (PL_regkind[fop] == NOTHING && nop == END)
7438             r->extflags |= RXf_NULL;
7439         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7440             /* when fop is SBOL first->flags will be true only when it was
7441              * produced by parsing /\A/, and not when parsing /^/. This is
7442              * very important for the split code as there we want to
7443              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7444              * See rt #122761 for more details. -- Yves */
7445             r->extflags |= RXf_START_ONLY;
7446         else if (fop == PLUS
7447                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7448                  && OP(regnext(first)) == END)
7449             r->extflags |= RXf_WHITE;
7450         else if ( r->extflags & RXf_SPLIT
7451                   && fop == EXACT
7452                   && STR_LEN(first) == 1
7453                   && *(STRING(first)) == ' '
7454                   && OP(regnext(first)) == END )
7455             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7456
7457     }
7458
7459     if (RExC_contains_locale) {
7460         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7461     }
7462
7463 #ifdef DEBUGGING
7464     if (RExC_paren_names) {
7465         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7466         ri->data->data[ri->name_list_idx]
7467                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7468     } else
7469 #endif
7470         ri->name_list_idx = 0;
7471
7472     if (RExC_recurse_count) {
7473         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7474             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7475             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7476         }
7477     }
7478     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7479     /* assume we don't need to swap parens around before we match */
7480     DEBUG_TEST_r({
7481         PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7482             (unsigned long)RExC_study_chunk_recursed_count);
7483     });
7484     DEBUG_DUMP_r({
7485         DEBUG_RExC_seen();
7486         PerlIO_printf(Perl_debug_log,"Final program:\n");
7487         regdump(r);
7488     });
7489 #ifdef RE_TRACK_PATTERN_OFFSETS
7490     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7491         const STRLEN len = ri->u.offsets[0];
7492         STRLEN i;
7493         GET_RE_DEBUG_FLAGS_DECL;
7494         PerlIO_printf(Perl_debug_log,
7495                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7496         for (i = 1; i <= len; i++) {
7497             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7498                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7499                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7500             }
7501         PerlIO_printf(Perl_debug_log, "\n");
7502     });
7503 #endif
7504
7505 #ifdef USE_ITHREADS
7506     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7507      * by setting the regexp SV to readonly-only instead. If the
7508      * pattern's been recompiled, the USEDness should remain. */
7509     if (old_re && SvREADONLY(old_re))
7510         SvREADONLY_on(rx);
7511 #endif
7512     return rx;
7513 }
7514
7515
7516 SV*
7517 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7518                     const U32 flags)
7519 {
7520     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7521
7522     PERL_UNUSED_ARG(value);
7523
7524     if (flags & RXapif_FETCH) {
7525         return reg_named_buff_fetch(rx, key, flags);
7526     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7527         Perl_croak_no_modify();
7528         return NULL;
7529     } else if (flags & RXapif_EXISTS) {
7530         return reg_named_buff_exists(rx, key, flags)
7531             ? &PL_sv_yes
7532             : &PL_sv_no;
7533     } else if (flags & RXapif_REGNAMES) {
7534         return reg_named_buff_all(rx, flags);
7535     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7536         return reg_named_buff_scalar(rx, flags);
7537     } else {
7538         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7539         return NULL;
7540     }
7541 }
7542
7543 SV*
7544 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7545                          const U32 flags)
7546 {
7547     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7548     PERL_UNUSED_ARG(lastkey);
7549
7550     if (flags & RXapif_FIRSTKEY)
7551         return reg_named_buff_firstkey(rx, flags);
7552     else if (flags & RXapif_NEXTKEY)
7553         return reg_named_buff_nextkey(rx, flags);
7554     else {
7555         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7556                                             (int)flags);
7557         return NULL;
7558     }
7559 }
7560
7561 SV*
7562 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7563                           const U32 flags)
7564 {
7565     AV *retarray = NULL;
7566     SV *ret;
7567     struct regexp *const rx = ReANY(r);
7568
7569     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7570
7571     if (flags & RXapif_ALL)
7572         retarray=newAV();
7573
7574     if (rx && RXp_PAREN_NAMES(rx)) {
7575         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7576         if (he_str) {
7577             IV i;
7578             SV* sv_dat=HeVAL(he_str);
7579             I32 *nums=(I32*)SvPVX(sv_dat);
7580             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7581                 if ((I32)(rx->nparens) >= nums[i]
7582                     && rx->offs[nums[i]].start != -1
7583                     && rx->offs[nums[i]].end != -1)
7584                 {
7585                     ret = newSVpvs("");
7586                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7587                     if (!retarray)
7588                         return ret;
7589                 } else {
7590                     if (retarray)
7591                         ret = newSVsv(&PL_sv_undef);
7592                 }
7593                 if (retarray)
7594                     av_push(retarray, ret);
7595             }
7596             if (retarray)
7597                 return newRV_noinc(MUTABLE_SV(retarray));
7598         }
7599     }
7600     return NULL;
7601 }
7602
7603 bool
7604 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7605                            const U32 flags)
7606 {
7607     struct regexp *const rx = ReANY(r);
7608
7609     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7610
7611     if (rx && RXp_PAREN_NAMES(rx)) {
7612         if (flags & RXapif_ALL) {
7613             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7614         } else {
7615             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7616             if (sv) {
7617                 SvREFCNT_dec_NN(sv);
7618                 return TRUE;
7619             } else {
7620                 return FALSE;
7621             }
7622         }
7623     } else {
7624         return FALSE;
7625     }
7626 }
7627
7628 SV*
7629 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7630 {
7631     struct regexp *const rx = ReANY(r);
7632
7633     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7634
7635     if ( rx && RXp_PAREN_NAMES(rx) ) {
7636         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7637
7638         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7639     } else {
7640         return FALSE;
7641     }
7642 }
7643
7644 SV*
7645 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7646 {
7647     struct regexp *const rx = ReANY(r);
7648     GET_RE_DEBUG_FLAGS_DECL;
7649
7650     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7651
7652     if (rx && RXp_PAREN_NAMES(rx)) {
7653         HV *hv = RXp_PAREN_NAMES(rx);
7654         HE *temphe;
7655         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7656             IV i;
7657             IV parno = 0;
7658             SV* sv_dat = HeVAL(temphe);
7659             I32 *nums = (I32*)SvPVX(sv_dat);
7660             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7661                 if ((I32)(rx->lastparen) >= nums[i] &&
7662                     rx->offs[nums[i]].start != -1 &&
7663                     rx->offs[nums[i]].end != -1)
7664                 {
7665                     parno = nums[i];
7666                     break;
7667                 }
7668             }
7669             if (parno || flags & RXapif_ALL) {
7670                 return newSVhek(HeKEY_hek(temphe));
7671             }
7672         }
7673     }
7674     return NULL;
7675 }
7676
7677 SV*
7678 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7679 {
7680     SV *ret;
7681     AV *av;
7682     SSize_t length;
7683     struct regexp *const rx = ReANY(r);
7684
7685     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7686
7687     if (rx && RXp_PAREN_NAMES(rx)) {
7688         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7689             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7690         } else if (flags & RXapif_ONE) {
7691             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7692             av = MUTABLE_AV(SvRV(ret));
7693             length = av_tindex(av);
7694             SvREFCNT_dec_NN(ret);
7695             return newSViv(length + 1);
7696         } else {
7697             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7698                                                 (int)flags);
7699             return NULL;
7700         }
7701     }
7702     return &PL_sv_undef;
7703 }
7704
7705 SV*
7706 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7707 {
7708     struct regexp *const rx = ReANY(r);
7709     AV *av = newAV();
7710
7711     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7712
7713     if (rx && RXp_PAREN_NAMES(rx)) {
7714         HV *hv= RXp_PAREN_NAMES(rx);
7715         HE *temphe;
7716         (void)hv_iterinit(hv);
7717         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7718             IV i;
7719             IV parno = 0;
7720             SV* sv_dat = HeVAL(temphe);
7721             I32 *nums = (I32*)SvPVX(sv_dat);
7722             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7723                 if ((I32)(rx->lastparen) >= nums[i] &&
7724                     rx->offs[nums[i]].start != -1 &&
7725                     rx->offs[nums[i]].end != -1)
7726                 {
7727                     parno = nums[i];
7728                     break;
7729                 }
7730             }
7731             if (parno || flags & RXapif_ALL) {
7732                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7733             }
7734         }
7735     }
7736
7737     return newRV_noinc(MUTABLE_SV(av));
7738 }
7739
7740 void
7741 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7742                              SV * const sv)
7743 {
7744     struct regexp *const rx = ReANY(r);
7745     char *s = NULL;
7746     SSize_t i = 0;
7747     SSize_t s1, t1;
7748     I32 n = paren;
7749
7750     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7751
7752     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7753            || n == RX_BUFF_IDX_CARET_FULLMATCH
7754            || n == RX_BUFF_IDX_CARET_POSTMATCH
7755        )
7756     {
7757         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7758         if (!keepcopy) {
7759             /* on something like
7760              *    $r = qr/.../;
7761              *    /$qr/p;
7762              * the KEEPCOPY is set on the PMOP rather than the regex */
7763             if (PL_curpm && r == PM_GETRE(PL_curpm))
7764                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7765         }
7766         if (!keepcopy)
7767             goto ret_undef;
7768     }
7769
7770     if (!rx->subbeg)
7771         goto ret_undef;
7772
7773     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7774         /* no need to distinguish between them any more */
7775         n = RX_BUFF_IDX_FULLMATCH;
7776
7777     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7778         && rx->offs[0].start != -1)
7779     {
7780         /* $`, ${^PREMATCH} */
7781         i = rx->offs[0].start;
7782         s = rx->subbeg;
7783     }
7784     else
7785     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7786         && rx->offs[0].end != -1)
7787     {
7788         /* $', ${^POSTMATCH} */
7789         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7790         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7791     }
7792     else
7793     if ( 0 <= n && n <= (I32)rx->nparens &&
7794         (s1 = rx->offs[n].start) != -1 &&
7795         (t1 = rx->offs[n].end) != -1)
7796     {
7797         /* $&, ${^MATCH},  $1 ... */
7798         i = t1 - s1;
7799         s = rx->subbeg + s1 - rx->suboffset;
7800     } else {
7801         goto ret_undef;
7802     }
7803
7804     assert(s >= rx->subbeg);
7805     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7806     if (i >= 0) {
7807 #ifdef NO_TAINT_SUPPORT
7808         sv_setpvn(sv, s, i);
7809 #else
7810         const int oldtainted = TAINT_get;
7811         TAINT_NOT;
7812         sv_setpvn(sv, s, i);
7813         TAINT_set(oldtainted);
7814 #endif
7815         if ( (rx->intflags & PREGf_CANY_SEEN)
7816             ? (RXp_MATCH_UTF8(rx)
7817                         && (!i || is_utf8_string((U8*)s, i)))
7818             : (RXp_MATCH_UTF8(rx)) )
7819         {
7820             SvUTF8_on(sv);
7821         }
7822         else
7823             SvUTF8_off(sv);
7824         if (TAINTING_get) {
7825             if (RXp_MATCH_TAINTED(rx)) {
7826                 if (SvTYPE(sv) >= SVt_PVMG) {
7827                     MAGIC* const mg = SvMAGIC(sv);
7828                     MAGIC* mgt;
7829                     TAINT;
7830                     SvMAGIC_set(sv, mg->mg_moremagic);
7831                     SvTAINT(sv);
7832                     if ((mgt = SvMAGIC(sv))) {
7833                         mg->mg_moremagic = mgt;
7834                         SvMAGIC_set(sv, mg);
7835                     }
7836                 } else {
7837                     TAINT;
7838                     SvTAINT(sv);
7839                 }
7840             } else
7841                 SvTAINTED_off(sv);
7842         }
7843     } else {
7844       ret_undef:
7845         sv_setsv(sv,&PL_sv_undef);
7846         return;
7847     }
7848 }
7849
7850 void
7851 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7852                                                          SV const * const value)
7853 {
7854     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7855
7856     PERL_UNUSED_ARG(rx);
7857     PERL_UNUSED_ARG(paren);
7858     PERL_UNUSED_ARG(value);
7859
7860     if (!PL_localizing)
7861         Perl_croak_no_modify();
7862 }
7863
7864 I32
7865 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7866                               const I32 paren)
7867 {
7868     struct regexp *const rx = ReANY(r);
7869     I32 i;
7870     I32 s1, t1;
7871
7872     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7873
7874     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7875         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7876         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7877     )
7878     {
7879         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7880         if (!keepcopy) {
7881             /* on something like
7882              *    $r = qr/.../;
7883              *    /$qr/p;
7884              * the KEEPCOPY is set on the PMOP rather than the regex */
7885             if (PL_curpm && r == PM_GETRE(PL_curpm))
7886                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7887         }
7888         if (!keepcopy)
7889             goto warn_undef;
7890     }
7891
7892     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7893     switch (paren) {
7894       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7895       case RX_BUFF_IDX_PREMATCH:       /* $` */
7896         if (rx->offs[0].start != -1) {
7897                         i = rx->offs[0].start;
7898                         if (i > 0) {
7899                                 s1 = 0;
7900                                 t1 = i;
7901                                 goto getlen;
7902                         }
7903             }
7904         return 0;
7905
7906       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7907       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7908             if (rx->offs[0].end != -1) {
7909                         i = rx->sublen - rx->offs[0].end;
7910                         if (i > 0) {
7911                                 s1 = rx->offs[0].end;
7912                                 t1 = rx->sublen;
7913                                 goto getlen;
7914                         }
7915             }
7916         return 0;
7917
7918       default: /* $& / ${^MATCH}, $1, $2, ... */
7919             if (paren <= (I32)rx->nparens &&
7920             (s1 = rx->offs[paren].start) != -1 &&
7921             (t1 = rx->offs[paren].end) != -1)
7922             {
7923             i = t1 - s1;
7924             goto getlen;
7925         } else {
7926           warn_undef:
7927             if (ckWARN(WARN_UNINITIALIZED))
7928                 report_uninit((const SV *)sv);
7929             return 0;
7930         }
7931     }
7932   getlen:
7933     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7934         const char * const s = rx->subbeg - rx->suboffset + s1;
7935         const U8 *ep;
7936         STRLEN el;
7937
7938         i = t1 - s1;
7939         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7940                         i = el;
7941     }
7942     return i;
7943 }
7944
7945 SV*
7946 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7947 {
7948     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7949         PERL_UNUSED_ARG(rx);
7950         if (0)
7951             return NULL;
7952         else
7953             return newSVpvs("Regexp");
7954 }
7955
7956 /* Scans the name of a named buffer from the pattern.
7957  * If flags is REG_RSN_RETURN_NULL returns null.
7958  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7959  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7960  * to the parsed name as looked up in the RExC_paren_names hash.
7961  * If there is an error throws a vFAIL().. type exception.
7962  */
7963
7964 #define REG_RSN_RETURN_NULL    0
7965 #define REG_RSN_RETURN_NAME    1
7966 #define REG_RSN_RETURN_DATA    2
7967
7968 STATIC SV*
7969 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7970 {
7971     char *name_start = RExC_parse;
7972
7973     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7974
7975     assert (RExC_parse <= RExC_end);
7976     if (RExC_parse == RExC_end) NOOP;
7977     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7978          /* skip IDFIRST by using do...while */
7979         if (UTF)
7980             do {
7981                 RExC_parse += UTF8SKIP(RExC_parse);
7982             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7983         else
7984             do {
7985                 RExC_parse++;
7986             } while (isWORDCHAR(*RExC_parse));
7987     } else {
7988         RExC_parse++; /* so the <- from the vFAIL is after the offending
7989                          character */
7990         vFAIL("Group name must start with a non-digit word character");
7991     }
7992     if ( flags ) {
7993         SV* sv_name
7994             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7995                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7996         if ( flags == REG_RSN_RETURN_NAME)
7997             return sv_name;
7998         else if (flags==REG_RSN_RETURN_DATA) {
7999             HE *he_str = NULL;
8000             SV *sv_dat = NULL;
8001             if ( ! sv_name )      /* should not happen*/
8002                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8003             if (RExC_paren_names)
8004                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8005             if ( he_str )
8006                 sv_dat = HeVAL(he_str);
8007             if ( ! sv_dat )
8008                 vFAIL("Reference to nonexistent named group");
8009             return sv_dat;
8010         }
8011         else {
8012             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8013                        (unsigned long) flags);
8014         }
8015         assert(0); /* NOT REACHED */
8016     }
8017     return NULL;
8018 }
8019
8020 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8021     int num;                                                    \
8022     if (RExC_lastparse!=RExC_parse) {                           \
8023         PerlIO_printf(Perl_debug_log, "%s",                     \
8024             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8025                 RExC_end - RExC_parse, 16,                      \
8026                 "", "",                                         \
8027                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8028                 PERL_PV_PRETTY_ELLIPSES   |                     \
8029                 PERL_PV_PRETTY_LTGT       |                     \
8030                 PERL_PV_ESCAPE_RE         |                     \
8031                 PERL_PV_PRETTY_EXACTSIZE                        \
8032             )                                                   \
8033         );                                                      \
8034     } else                                                      \
8035         PerlIO_printf(Perl_debug_log,"%16s","");                \
8036                                                                 \
8037     if (SIZE_ONLY)                                              \
8038        num = RExC_size + 1;                                     \
8039     else                                                        \
8040        num=REG_NODE_NUM(RExC_emit);                             \
8041     if (RExC_lastnum!=num)                                      \
8042        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
8043     else                                                        \
8044        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
8045     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
8046         (int)((depth*2)), "",                                   \
8047         (funcname)                                              \
8048     );                                                          \
8049     RExC_lastnum=num;                                           \
8050     RExC_lastparse=RExC_parse;                                  \
8051 })
8052
8053
8054
8055 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8056     DEBUG_PARSE_MSG((funcname));                            \
8057     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
8058 })
8059 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
8060     DEBUG_PARSE_MSG((funcname));                            \
8061     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
8062 })
8063
8064 /* This section of code defines the inversion list object and its methods.  The
8065  * interfaces are highly subject to change, so as much as possible is static to
8066  * this file.  An inversion list is here implemented as a malloc'd C UV array
8067  * as an SVt_INVLIST scalar.
8068  *
8069  * An inversion list for Unicode is an array of code points, sorted by ordinal
8070  * number.  The zeroth element is the first code point in the list.  The 1th
8071  * element is the first element beyond that not in the list.  In other words,
8072  * the first range is
8073  *  invlist[0]..(invlist[1]-1)
8074  * The other ranges follow.  Thus every element whose index is divisible by two
8075  * marks the beginning of a range that is in the list, and every element not
8076  * divisible by two marks the beginning of a range not in the list.  A single
8077  * element inversion list that contains the single code point N generally
8078  * consists of two elements
8079  *  invlist[0] == N
8080  *  invlist[1] == N+1
8081  * (The exception is when N is the highest representable value on the
8082  * machine, in which case the list containing just it would be a single
8083  * element, itself.  By extension, if the last range in the list extends to
8084  * infinity, then the first element of that range will be in the inversion list
8085  * at a position that is divisible by two, and is the final element in the
8086  * list.)
8087  * Taking the complement (inverting) an inversion list is quite simple, if the
8088  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8089  * This implementation reserves an element at the beginning of each inversion
8090  * list to always contain 0; there is an additional flag in the header which
8091  * indicates if the list begins at the 0, or is offset to begin at the next
8092  * element.
8093  *
8094  * More about inversion lists can be found in "Unicode Demystified"
8095  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8096  * More will be coming when functionality is added later.
8097  *
8098  * The inversion list data structure is currently implemented as an SV pointing
8099  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8100  * array of UV whose memory management is automatically handled by the existing
8101  * facilities for SV's.
8102  *
8103  * Some of the methods should always be private to the implementation, and some
8104  * should eventually be made public */
8105
8106 /* The header definitions are in F<inline_invlist.c> */
8107
8108 PERL_STATIC_INLINE UV*
8109 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8110 {
8111     /* Returns a pointer to the first element in the inversion list's array.
8112      * This is called upon initialization of an inversion list.  Where the
8113      * array begins depends on whether the list has the code point U+0000 in it
8114      * or not.  The other parameter tells it whether the code that follows this
8115      * call is about to put a 0 in the inversion list or not.  The first
8116      * element is either the element reserved for 0, if TRUE, or the element
8117      * after it, if FALSE */
8118
8119     bool* offset = get_invlist_offset_addr(invlist);
8120     UV* zero_addr = (UV *) SvPVX(invlist);
8121
8122     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8123
8124     /* Must be empty */
8125     assert(! _invlist_len(invlist));
8126
8127     *zero_addr = 0;
8128
8129     /* 1^1 = 0; 1^0 = 1 */
8130     *offset = 1 ^ will_have_0;
8131     return zero_addr + *offset;
8132 }
8133
8134 PERL_STATIC_INLINE UV*
8135 S_invlist_array(SV* const invlist)
8136 {
8137     /* Returns the pointer to the inversion list's array.  Every time the
8138      * length changes, this needs to be called in case malloc or realloc moved
8139      * it */
8140
8141     PERL_ARGS_ASSERT_INVLIST_ARRAY;
8142
8143     /* Must not be empty.  If these fail, you probably didn't check for <len>
8144      * being non-zero before trying to get the array */
8145     assert(_invlist_len(invlist));
8146
8147     /* The very first element always contains zero, The array begins either
8148      * there, or if the inversion list is offset, at the element after it.
8149      * The offset header field determines which; it contains 0 or 1 to indicate
8150      * how much additionally to add */
8151     assert(0 == *(SvPVX(invlist)));
8152     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8153 }
8154
8155 PERL_STATIC_INLINE void
8156 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8157 {
8158     /* Sets the current number of elements stored in the inversion list.
8159      * Updates SvCUR correspondingly */
8160     PERL_UNUSED_CONTEXT;
8161     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8162
8163     assert(SvTYPE(invlist) == SVt_INVLIST);
8164
8165     SvCUR_set(invlist,
8166               (len == 0)
8167                ? 0
8168                : TO_INTERNAL_SIZE(len + offset));
8169     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8170 }
8171
8172 PERL_STATIC_INLINE IV*
8173 S_get_invlist_previous_index_addr(SV* invlist)
8174 {
8175     /* Return the address of the IV that is reserved to hold the cached index
8176      * */
8177     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8178
8179     assert(SvTYPE(invlist) == SVt_INVLIST);
8180
8181     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8182 }
8183
8184 PERL_STATIC_INLINE IV
8185 S_invlist_previous_index(SV* const invlist)
8186 {
8187     /* Returns cached index of previous search */
8188
8189     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8190
8191     return *get_invlist_previous_index_addr(invlist);
8192 }
8193
8194 PERL_STATIC_INLINE void
8195 S_invlist_set_previous_index(SV* const invlist, const IV index)
8196 {
8197     /* Caches <index> for later retrieval */
8198
8199     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8200
8201     assert(index == 0 || index < (int) _invlist_len(invlist));
8202
8203     *get_invlist_previous_index_addr(invlist) = index;
8204 }
8205
8206 PERL_STATIC_INLINE UV
8207 S_invlist_max(SV* const invlist)
8208 {
8209     /* Returns the maximum number of elements storable in the inversion list's
8210      * array, without having to realloc() */
8211
8212     PERL_ARGS_ASSERT_INVLIST_MAX;
8213
8214     assert(SvTYPE(invlist) == SVt_INVLIST);
8215
8216     /* Assumes worst case, in which the 0 element is not counted in the
8217      * inversion list, so subtracts 1 for that */
8218     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8219            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8220            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8221 }
8222
8223 #ifndef PERL_IN_XSUB_RE
8224 SV*
8225 Perl__new_invlist(pTHX_ IV initial_size)
8226 {
8227
8228     /* Return a pointer to a newly constructed inversion list, with enough
8229      * space to store 'initial_size' elements.  If that number is negative, a
8230      * system default is used instead */
8231
8232     SV* new_list;
8233
8234     if (initial_size < 0) {
8235         initial_size = 10;
8236     }
8237
8238     /* Allocate the initial space */
8239     new_list = newSV_type(SVt_INVLIST);
8240
8241     /* First 1 is in case the zero element isn't in the list; second 1 is for
8242      * trailing NUL */
8243     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8244     invlist_set_len(new_list, 0, 0);
8245
8246     /* Force iterinit() to be used to get iteration to work */
8247     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8248
8249     *get_invlist_previous_index_addr(new_list) = 0;
8250
8251     return new_list;
8252 }
8253
8254 SV*
8255 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8256 {
8257     /* Return a pointer to a newly constructed inversion list, initialized to
8258      * point to <list>, which has to be in the exact correct inversion list
8259      * form, including internal fields.  Thus this is a dangerous routine that
8260      * should not be used in the wrong hands.  The passed in 'list' contains
8261      * several header fields at the beginning that are not part of the
8262      * inversion list body proper */
8263
8264     const STRLEN length = (STRLEN) list[0];
8265     const UV version_id =          list[1];
8266     const bool offset   =    cBOOL(list[2]);
8267 #define HEADER_LENGTH 3
8268     /* If any of the above changes in any way, you must change HEADER_LENGTH
8269      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8270      *      perl -E 'say int(rand 2**31-1)'
8271      */
8272 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8273                                         data structure type, so that one being
8274                                         passed in can be validated to be an
8275                                         inversion list of the correct vintage.
8276                                        */
8277
8278     SV* invlist = newSV_type(SVt_INVLIST);
8279
8280     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8281
8282     if (version_id != INVLIST_VERSION_ID) {
8283         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8284     }
8285
8286     /* The generated array passed in includes header elements that aren't part
8287      * of the list proper, so start it just after them */
8288     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8289
8290     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8291                                shouldn't touch it */
8292
8293     *(get_invlist_offset_addr(invlist)) = offset;
8294
8295     /* The 'length' passed to us is the physical number of elements in the
8296      * inversion list.  But if there is an offset the logical number is one
8297      * less than that */
8298     invlist_set_len(invlist, length  - offset, offset);
8299
8300     invlist_set_previous_index(invlist, 0);
8301
8302     /* Initialize the iteration pointer. */
8303     invlist_iterfinish(invlist);
8304
8305     SvREADONLY_on(invlist);
8306
8307     return invlist;
8308 }
8309 #endif /* ifndef PERL_IN_XSUB_RE */
8310
8311 STATIC void
8312 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8313 {
8314     /* Grow the maximum size of an inversion list */
8315
8316     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8317
8318     assert(SvTYPE(invlist) == SVt_INVLIST);
8319
8320     /* Add one to account for the zero element at the beginning which may not
8321      * be counted by the calling parameters */
8322     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8323 }
8324
8325 PERL_STATIC_INLINE void
8326 S_invlist_trim(SV* const invlist)
8327 {
8328     PERL_ARGS_ASSERT_INVLIST_TRIM;
8329
8330     assert(SvTYPE(invlist) == SVt_INVLIST);
8331
8332     /* Change the length of the inversion list to how many entries it currently
8333      * has */
8334     SvPV_shrink_to_cur((SV *) invlist);
8335 }
8336
8337 STATIC void
8338 S__append_range_to_invlist(pTHX_ SV* const invlist,
8339                                  const UV start, const UV end)
8340 {
8341    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8342     * the end of the inversion list.  The range must be above any existing
8343     * ones. */
8344
8345     UV* array;
8346     UV max = invlist_max(invlist);
8347     UV len = _invlist_len(invlist);
8348     bool offset;
8349
8350     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8351
8352     if (len == 0) { /* Empty lists must be initialized */
8353         offset = start != 0;
8354         array = _invlist_array_init(invlist, ! offset);
8355     }
8356     else {
8357         /* Here, the existing list is non-empty. The current max entry in the
8358          * list is generally the first value not in the set, except when the
8359          * set extends to the end of permissible values, in which case it is
8360          * the first entry in that final set, and so this call is an attempt to
8361          * append out-of-order */
8362
8363         UV final_element = len - 1;
8364         array = invlist_array(invlist);
8365         if (array[final_element] > start
8366             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8367         {
8368             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8369                      array[final_element], start,
8370                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8371         }
8372
8373         /* Here, it is a legal append.  If the new range begins with the first
8374          * value not in the set, it is extending the set, so the new first
8375          * value not in the set is one greater than the newly extended range.
8376          * */
8377         offset = *get_invlist_offset_addr(invlist);
8378         if (array[final_element] == start) {
8379             if (end != UV_MAX) {
8380                 array[final_element] = end + 1;
8381             }
8382             else {
8383                 /* But if the end is the maximum representable on the machine,
8384                  * just let the range that this would extend to have no end */
8385                 invlist_set_len(invlist, len - 1, offset);
8386             }
8387             return;
8388         }
8389     }
8390
8391     /* Here the new range doesn't extend any existing set.  Add it */
8392
8393     len += 2;   /* Includes an element each for the start and end of range */
8394
8395     /* If wll overflow the existing space, extend, which may cause the array to
8396      * be moved */
8397     if (max < len) {
8398         invlist_extend(invlist, len);
8399
8400         /* Have to set len here to avoid assert failure in invlist_array() */
8401         invlist_set_len(invlist, len, offset);
8402
8403         array = invlist_array(invlist);
8404     }
8405     else {
8406         invlist_set_len(invlist, len, offset);
8407     }
8408
8409     /* The next item on the list starts the range, the one after that is
8410      * one past the new range.  */
8411     array[len - 2] = start;
8412     if (end != UV_MAX) {
8413         array[len - 1] = end + 1;
8414     }
8415     else {
8416         /* But if the end is the maximum representable on the machine, just let
8417          * the range have no end */
8418         invlist_set_len(invlist, len - 1, offset);
8419     }
8420 }
8421
8422 #ifndef PERL_IN_XSUB_RE
8423
8424 IV
8425 Perl__invlist_search(SV* const invlist, const UV cp)
8426 {
8427     /* Searches the inversion list for the entry that contains the input code
8428      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8429      * return value is the index into the list's array of the range that
8430      * contains <cp> */
8431
8432     IV low = 0;
8433     IV mid;
8434     IV high = _invlist_len(invlist);
8435     const IV highest_element = high - 1;
8436     const UV* array;
8437
8438     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8439
8440     /* If list is empty, return failure. */
8441     if (high == 0) {
8442         return -1;
8443     }
8444
8445     /* (We can't get the array unless we know the list is non-empty) */
8446     array = invlist_array(invlist);
8447
8448     mid = invlist_previous_index(invlist);
8449     assert(mid >=0 && mid <= highest_element);
8450
8451     /* <mid> contains the cache of the result of the previous call to this
8452      * function (0 the first time).  See if this call is for the same result,
8453      * or if it is for mid-1.  This is under the theory that calls to this
8454      * function will often be for related code points that are near each other.
8455      * And benchmarks show that caching gives better results.  We also test
8456      * here if the code point is within the bounds of the list.  These tests
8457      * replace others that would have had to be made anyway to make sure that
8458      * the array bounds were not exceeded, and these give us extra information
8459      * at the same time */
8460     if (cp >= array[mid]) {
8461         if (cp >= array[highest_element]) {
8462             return highest_element;
8463         }
8464
8465         /* Here, array[mid] <= cp < array[highest_element].  This means that
8466          * the final element is not the answer, so can exclude it; it also
8467          * means that <mid> is not the final element, so can refer to 'mid + 1'
8468          * safely */
8469         if (cp < array[mid + 1]) {
8470             return mid;
8471         }
8472         high--;
8473         low = mid + 1;
8474     }
8475     else { /* cp < aray[mid] */
8476         if (cp < array[0]) { /* Fail if outside the array */
8477             return -1;
8478         }
8479         high = mid;
8480         if (cp >= array[mid - 1]) {
8481             goto found_entry;
8482         }
8483     }
8484
8485     /* Binary search.  What we are looking for is <i> such that
8486      *  array[i] <= cp < array[i+1]
8487      * The loop below converges on the i+1.  Note that there may not be an
8488      * (i+1)th element in the array, and things work nonetheless */
8489     while (low < high) {
8490         mid = (low + high) / 2;
8491         assert(mid <= highest_element);
8492         if (array[mid] <= cp) { /* cp >= array[mid] */
8493             low = mid + 1;
8494
8495             /* We could do this extra test to exit the loop early.
8496             if (cp < array[low]) {
8497                 return mid;
8498             }
8499             */
8500         }
8501         else { /* cp < array[mid] */
8502             high = mid;
8503         }
8504     }
8505
8506   found_entry:
8507     high--;
8508     invlist_set_previous_index(invlist, high);
8509     return high;
8510 }
8511
8512 void
8513 Perl__invlist_populate_swatch(SV* const invlist,
8514                               const UV start, const UV end, U8* swatch)
8515 {
8516     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8517      * but is used when the swash has an inversion list.  This makes this much
8518      * faster, as it uses a binary search instead of a linear one.  This is
8519      * intimately tied to that function, and perhaps should be in utf8.c,
8520      * except it is intimately tied to inversion lists as well.  It assumes
8521      * that <swatch> is all 0's on input */
8522
8523     UV current = start;
8524     const IV len = _invlist_len(invlist);
8525     IV i;
8526     const UV * array;
8527
8528     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8529
8530     if (len == 0) { /* Empty inversion list */
8531         return;
8532     }
8533
8534     array = invlist_array(invlist);
8535
8536     /* Find which element it is */
8537     i = _invlist_search(invlist, start);
8538
8539     /* We populate from <start> to <end> */
8540     while (current < end) {
8541         UV upper;
8542
8543         /* The inversion list gives the results for every possible code point
8544          * after the first one in the list.  Only those ranges whose index is
8545          * even are ones that the inversion list matches.  For the odd ones,
8546          * and if the initial code point is not in the list, we have to skip
8547          * forward to the next element */
8548         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8549             i++;
8550             if (i >= len) { /* Finished if beyond the end of the array */
8551                 return;
8552             }
8553             current = array[i];
8554             if (current >= end) {   /* Finished if beyond the end of what we
8555                                        are populating */
8556                 if (LIKELY(end < UV_MAX)) {
8557                     return;
8558                 }
8559
8560                 /* We get here when the upper bound is the maximum
8561                  * representable on the machine, and we are looking for just
8562                  * that code point.  Have to special case it */
8563                 i = len;
8564                 goto join_end_of_list;
8565             }
8566         }
8567         assert(current >= start);
8568
8569         /* The current range ends one below the next one, except don't go past
8570          * <end> */
8571         i++;
8572         upper = (i < len && array[i] < end) ? array[i] : end;
8573
8574         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8575          * for each code point in it */
8576         for (; current < upper; current++) {
8577             const STRLEN offset = (STRLEN)(current - start);
8578             swatch[offset >> 3] |= 1 << (offset & 7);
8579         }
8580
8581     join_end_of_list:
8582
8583         /* Quit if at the end of the list */
8584         if (i >= len) {
8585
8586             /* But first, have to deal with the highest possible code point on
8587              * the platform.  The previous code assumes that <end> is one
8588              * beyond where we want to populate, but that is impossible at the
8589              * platform's infinity, so have to handle it specially */
8590             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8591             {
8592                 const STRLEN offset = (STRLEN)(end - start);
8593                 swatch[offset >> 3] |= 1 << (offset & 7);
8594             }
8595             return;
8596         }
8597
8598         /* Advance to the next range, which will be for code points not in the
8599          * inversion list */
8600         current = array[i];
8601     }
8602
8603     return;
8604 }
8605
8606 void
8607 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8608                                          const bool complement_b, SV** output)
8609 {
8610     /* Take the union of two inversion lists and point <output> to it.  *output
8611      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8612      * the reference count to that list will be decremented if not already a
8613      * temporary (mortal); otherwise *output will be made correspondingly
8614      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8615      * second list is returned.  If <complement_b> is TRUE, the union is taken
8616      * of the complement (inversion) of <b> instead of b itself.
8617      *
8618      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8619      * Richard Gillam, published by Addison-Wesley, and explained at some
8620      * length there.  The preface says to incorporate its examples into your
8621      * code at your own risk.
8622      *
8623      * The algorithm is like a merge sort.
8624      *
8625      * XXX A potential performance improvement is to keep track as we go along
8626      * if only one of the inputs contributes to the result, meaning the other
8627      * is a subset of that one.  In that case, we can skip the final copy and
8628      * return the larger of the input lists, but then outside code might need
8629      * to keep track of whether to free the input list or not */
8630
8631     const UV* array_a;    /* a's array */
8632     const UV* array_b;
8633     UV len_a;       /* length of a's array */
8634     UV len_b;
8635
8636     SV* u;                      /* the resulting union */
8637     UV* array_u;
8638     UV len_u;
8639
8640     UV i_a = 0;             /* current index into a's array */
8641     UV i_b = 0;
8642     UV i_u = 0;
8643
8644     /* running count, as explained in the algorithm source book; items are
8645      * stopped accumulating and are output when the count changes to/from 0.
8646      * The count is incremented when we start a range that's in the set, and
8647      * decremented when we start a range that's not in the set.  So its range
8648      * is 0 to 2.  Only when the count is zero is something not in the set.
8649      */
8650     UV count = 0;
8651
8652     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8653     assert(a != b);
8654
8655     /* If either one is empty, the union is the other one */
8656     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8657         bool make_temp = FALSE; /* Should we mortalize the result? */
8658
8659         if (*output == a) {
8660             if (a != NULL) {
8661                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8662                     SvREFCNT_dec_NN(a);
8663                 }
8664             }
8665         }
8666         if (*output != b) {
8667             *output = invlist_clone(b);
8668             if (complement_b) {
8669                 _invlist_invert(*output);
8670             }
8671         } /* else *output already = b; */
8672
8673         if (make_temp) {
8674             sv_2mortal(*output);
8675         }
8676         return;
8677     }
8678     else if ((len_b = _invlist_len(b)) == 0) {
8679         bool make_temp = FALSE;
8680         if (*output == b) {
8681             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8682                 SvREFCNT_dec_NN(b);
8683             }
8684         }
8685
8686         /* The complement of an empty list is a list that has everything in it,
8687          * so the union with <a> includes everything too */
8688         if (complement_b) {
8689             if (a == *output) {
8690                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8691                     SvREFCNT_dec_NN(a);
8692                 }
8693             }
8694             *output = _new_invlist(1);
8695             _append_range_to_invlist(*output, 0, UV_MAX);
8696         }
8697         else if (*output != a) {
8698             *output = invlist_clone(a);
8699         }
8700         /* else *output already = a; */
8701
8702         if (make_temp) {
8703             sv_2mortal(*output);
8704         }
8705         return;
8706     }
8707
8708     /* Here both lists exist and are non-empty */
8709     array_a = invlist_array(a);
8710     array_b = invlist_array(b);
8711
8712     /* If are to take the union of 'a' with the complement of b, set it
8713      * up so are looking at b's complement. */
8714     if (complement_b) {
8715
8716         /* To complement, we invert: if the first element is 0, remove it.  To
8717          * do this, we just pretend the array starts one later */
8718         if (array_b[0] == 0) {
8719             array_b++;
8720             len_b--;
8721         }
8722         else {
8723
8724             /* But if the first element is not zero, we pretend the list starts
8725              * at the 0 that is always stored immediately before the array. */
8726             array_b--;
8727             len_b++;
8728         }
8729     }
8730
8731     /* Size the union for the worst case: that the sets are completely
8732      * disjoint */
8733     u = _new_invlist(len_a + len_b);
8734
8735     /* Will contain U+0000 if either component does */
8736     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8737                                       || (len_b > 0 && array_b[0] == 0));
8738
8739     /* Go through each list item by item, stopping when exhausted one of
8740      * them */
8741     while (i_a < len_a && i_b < len_b) {
8742         UV cp;      /* The element to potentially add to the union's array */
8743         bool cp_in_set;   /* is it in the the input list's set or not */
8744
8745         /* We need to take one or the other of the two inputs for the union.
8746          * Since we are merging two sorted lists, we take the smaller of the
8747          * next items.  In case of a tie, we take the one that is in its set
8748          * first.  If we took one not in the set first, it would decrement the
8749          * count, possibly to 0 which would cause it to be output as ending the
8750          * range, and the next time through we would take the same number, and
8751          * output it again as beginning the next range.  By doing it the
8752          * opposite way, there is no possibility that the count will be
8753          * momentarily decremented to 0, and thus the two adjoining ranges will
8754          * be seamlessly merged.  (In a tie and both are in the set or both not
8755          * in the set, it doesn't matter which we take first.) */
8756         if (array_a[i_a] < array_b[i_b]
8757             || (array_a[i_a] == array_b[i_b]
8758                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8759         {
8760             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8761             cp= array_a[i_a++];
8762         }
8763         else {
8764             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8765             cp = array_b[i_b++];
8766         }
8767
8768         /* Here, have chosen which of the two inputs to look at.  Only output
8769          * if the running count changes to/from 0, which marks the
8770          * beginning/end of a range in that's in the set */
8771         if (cp_in_set) {
8772             if (count == 0) {
8773                 array_u[i_u++] = cp;
8774             }
8775             count++;
8776         }
8777         else {
8778             count--;
8779             if (count == 0) {
8780                 array_u[i_u++] = cp;
8781             }
8782         }
8783     }
8784
8785     /* Here, we are finished going through at least one of the lists, which
8786      * means there is something remaining in at most one.  We check if the list
8787      * that hasn't been exhausted is positioned such that we are in the middle
8788      * of a range in its set or not.  (i_a and i_b point to the element beyond
8789      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8790      * is potentially more to output.
8791      * There are four cases:
8792      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8793      *     in the union is entirely from the non-exhausted set.
8794      *  2) Both were in their sets, count is 2.  Nothing further should
8795      *     be output, as everything that remains will be in the exhausted
8796      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8797      *     that
8798      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8799      *     Nothing further should be output because the union includes
8800      *     everything from the exhausted set.  Not decrementing ensures that.
8801      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8802      *     decrementing to 0 insures that we look at the remainder of the
8803      *     non-exhausted set */
8804     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8805         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8806     {
8807         count--;
8808     }
8809
8810     /* The final length is what we've output so far, plus what else is about to
8811      * be output.  (If 'count' is non-zero, then the input list we exhausted
8812      * has everything remaining up to the machine's limit in its set, and hence
8813      * in the union, so there will be no further output. */
8814     len_u = i_u;
8815     if (count == 0) {
8816         /* At most one of the subexpressions will be non-zero */
8817         len_u += (len_a - i_a) + (len_b - i_b);
8818     }
8819
8820     /* Set result to final length, which can change the pointer to array_u, so
8821      * re-find it */
8822     if (len_u != _invlist_len(u)) {
8823         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8824         invlist_trim(u);
8825         array_u = invlist_array(u);
8826     }
8827
8828     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8829      * the other) ended with everything above it not in its set.  That means
8830      * that the remaining part of the union is precisely the same as the
8831      * non-exhausted list, so can just copy it unchanged.  (If both list were
8832      * exhausted at the same time, then the operations below will be both 0.)
8833      */
8834     if (count == 0) {
8835         IV copy_count; /* At most one will have a non-zero copy count */
8836         if ((copy_count = len_a - i_a) > 0) {
8837             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8838         }
8839         else if ((copy_count = len_b - i_b) > 0) {
8840             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8841         }
8842     }
8843
8844     /*  We may be removing a reference to one of the inputs.  If so, the output
8845      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8846      *  count decremented) */
8847     if (a == *output || b == *output) {
8848         assert(! invlist_is_iterating(*output));
8849         if ((SvTEMP(*output))) {
8850             sv_2mortal(u);
8851         }
8852         else {
8853             SvREFCNT_dec_NN(*output);
8854         }
8855     }
8856
8857     *output = u;
8858
8859     return;
8860 }
8861
8862 void
8863 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8864                                                const bool complement_b, SV** i)
8865 {
8866     /* Take the intersection of two inversion lists and point <i> to it.  *i
8867      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8868      * the reference count to that list will be decremented if not already a
8869      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8870      * The first list, <a>, may be NULL, in which case an empty list is
8871      * returned.  If <complement_b> is TRUE, the result will be the
8872      * intersection of <a> and the complement (or inversion) of <b> instead of
8873      * <b> directly.
8874      *
8875      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8876      * Richard Gillam, published by Addison-Wesley, and explained at some
8877      * length there.  The preface says to incorporate its examples into your
8878      * code at your own risk.  In fact, it had bugs
8879      *
8880      * The algorithm is like a merge sort, and is essentially the same as the
8881      * union above
8882      */
8883
8884     const UV* array_a;          /* a's array */
8885     const UV* array_b;
8886     UV len_a;   /* length of a's array */
8887     UV len_b;
8888
8889     SV* r;                   /* the resulting intersection */
8890     UV* array_r;
8891     UV len_r;
8892
8893     UV i_a = 0;             /* current index into a's array */
8894     UV i_b = 0;
8895     UV i_r = 0;
8896
8897     /* running count, as explained in the algorithm source book; items are
8898      * stopped accumulating and are output when the count changes to/from 2.
8899      * The count is incremented when we start a range that's in the set, and
8900      * decremented when we start a range that's not in the set.  So its range
8901      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8902      */
8903     UV count = 0;
8904
8905     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8906     assert(a != b);
8907
8908     /* Special case if either one is empty */
8909     len_a = (a == NULL) ? 0 : _invlist_len(a);
8910     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8911         bool make_temp = FALSE;
8912
8913         if (len_a != 0 && complement_b) {
8914
8915             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8916              * be empty.  Here, also we are using 'b's complement, which hence
8917              * must be every possible code point.  Thus the intersection is
8918              * simply 'a'. */
8919             if (*i != a) {
8920                 if (*i == b) {
8921                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8922                         SvREFCNT_dec_NN(b);
8923                     }
8924                 }
8925
8926                 *i = invlist_clone(a);
8927             }
8928             /* else *i is already 'a' */
8929
8930             if (make_temp) {
8931                 sv_2mortal(*i);
8932             }
8933             return;
8934         }
8935
8936         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8937          * intersection must be empty */
8938         if (*i == a) {
8939             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8940                 SvREFCNT_dec_NN(a);
8941             }
8942         }
8943         else if (*i == b) {
8944             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8945                 SvREFCNT_dec_NN(b);
8946             }
8947         }
8948         *i = _new_invlist(0);
8949         if (make_temp) {
8950             sv_2mortal(*i);
8951         }
8952
8953         return;
8954     }
8955
8956     /* Here both lists exist and are non-empty */
8957     array_a = invlist_array(a);
8958     array_b = invlist_array(b);
8959
8960     /* If are to take the intersection of 'a' with the complement of b, set it
8961      * up so are looking at b's complement. */
8962     if (complement_b) {
8963
8964         /* To complement, we invert: if the first element is 0, remove it.  To
8965          * do this, we just pretend the array starts one later */
8966         if (array_b[0] == 0) {
8967             array_b++;
8968             len_b--;
8969         }
8970         else {
8971
8972             /* But if the first element is not zero, we pretend the list starts
8973              * at the 0 that is always stored immediately before the array. */
8974             array_b--;
8975             len_b++;
8976         }
8977     }
8978
8979     /* Size the intersection for the worst case: that the intersection ends up
8980      * fragmenting everything to be completely disjoint */
8981     r= _new_invlist(len_a + len_b);
8982
8983     /* Will contain U+0000 iff both components do */
8984     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8985                                      && len_b > 0 && array_b[0] == 0);
8986
8987     /* Go through each list item by item, stopping when exhausted one of
8988      * them */
8989     while (i_a < len_a && i_b < len_b) {
8990         UV cp;      /* The element to potentially add to the intersection's
8991                        array */
8992         bool cp_in_set; /* Is it in the input list's set or not */
8993
8994         /* We need to take one or the other of the two inputs for the
8995          * intersection.  Since we are merging two sorted lists, we take the
8996          * smaller of the next items.  In case of a tie, we take the one that
8997          * is not in its set first (a difference from the union algorithm).  If
8998          * we took one in the set first, it would increment the count, possibly
8999          * to 2 which would cause it to be output as starting a range in the
9000          * intersection, and the next time through we would take that same
9001          * number, and output it again as ending the set.  By doing it the
9002          * opposite of this, there is no possibility that the count will be
9003          * momentarily incremented to 2.  (In a tie and both are in the set or
9004          * both not in the set, it doesn't matter which we take first.) */
9005         if (array_a[i_a] < array_b[i_b]
9006             || (array_a[i_a] == array_b[i_b]
9007                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9008         {
9009             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9010             cp= array_a[i_a++];
9011         }
9012         else {
9013             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9014             cp= array_b[i_b++];
9015         }
9016
9017         /* Here, have chosen which of the two inputs to look at.  Only output
9018          * if the running count changes to/from 2, which marks the
9019          * beginning/end of a range that's in the intersection */
9020         if (cp_in_set) {
9021             count++;
9022             if (count == 2) {
9023                 array_r[i_r++] = cp;
9024             }
9025         }
9026         else {
9027             if (count == 2) {
9028                 array_r[i_r++] = cp;
9029             }
9030             count--;
9031         }
9032     }
9033
9034     /* Here, we are finished going through at least one of the lists, which
9035      * means there is something remaining in at most one.  We check if the list
9036      * that has been exhausted is positioned such that we are in the middle
9037      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
9038      * the ones we care about.)  There are four cases:
9039      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
9040      *     nothing left in the intersection.
9041      *  2) Both were in their sets, count is 2 and perhaps is incremented to
9042      *     above 2.  What should be output is exactly that which is in the
9043      *     non-exhausted set, as everything it has is also in the intersection
9044      *     set, and everything it doesn't have can't be in the intersection
9045      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9046      *     gets incremented to 2.  Like the previous case, the intersection is
9047      *     everything that remains in the non-exhausted set.
9048      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9049      *     remains 1.  And the intersection has nothing more. */
9050     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9051         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9052     {
9053         count++;
9054     }
9055
9056     /* The final length is what we've output so far plus what else is in the
9057      * intersection.  At most one of the subexpressions below will be non-zero
9058      * */
9059     len_r = i_r;
9060     if (count >= 2) {
9061         len_r += (len_a - i_a) + (len_b - i_b);
9062     }
9063
9064     /* Set result to final length, which can change the pointer to array_r, so
9065      * re-find it */
9066     if (len_r != _invlist_len(r)) {
9067         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9068         invlist_trim(r);
9069         array_r = invlist_array(r);
9070     }
9071
9072     /* Finish outputting any remaining */
9073     if (count >= 2) { /* At most one will have a non-zero copy count */
9074         IV copy_count;
9075         if ((copy_count = len_a - i_a) > 0) {
9076             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9077         }
9078         else if ((copy_count = len_b - i_b) > 0) {
9079             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9080         }
9081     }
9082
9083     /*  We may be removing a reference to one of the inputs.  If so, the output
9084      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
9085      *  count decremented) */
9086     if (a == *i || b == *i) {
9087         assert(! invlist_is_iterating(*i));
9088         if (SvTEMP(*i)) {
9089             sv_2mortal(r);
9090         }
9091         else {
9092             SvREFCNT_dec_NN(*i);
9093         }
9094     }
9095
9096     *i = r;
9097
9098     return;
9099 }
9100
9101 SV*
9102 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9103 {
9104     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9105      * set.  A pointer to the inversion list is returned.  This may actually be
9106      * a new list, in which case the passed in one has been destroyed.  The
9107      * passed-in inversion list can be NULL, in which case a new one is created
9108      * with just the one range in it */
9109
9110     SV* range_invlist;
9111     UV len;
9112
9113     if (invlist == NULL) {
9114         invlist = _new_invlist(2);
9115         len = 0;
9116     }
9117     else {
9118         len = _invlist_len(invlist);
9119     }
9120
9121     /* If comes after the final entry actually in the list, can just append it
9122      * to the end, */
9123     if (len == 0
9124         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9125             && start >= invlist_array(invlist)[len - 1]))
9126     {
9127         _append_range_to_invlist(invlist, start, end);
9128         return invlist;
9129     }
9130
9131     /* Here, can't just append things, create and return a new inversion list
9132      * which is the union of this range and the existing inversion list */
9133     range_invlist = _new_invlist(2);
9134     _append_range_to_invlist(range_invlist, start, end);
9135
9136     _invlist_union(invlist, range_invlist, &invlist);
9137
9138     /* The temporary can be freed */
9139     SvREFCNT_dec_NN(range_invlist);
9140
9141     return invlist;
9142 }
9143
9144 SV*
9145 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9146                                  UV** other_elements_ptr)
9147 {
9148     /* Create and return an inversion list whose contents are to be populated
9149      * by the caller.  The caller gives the number of elements (in 'size') and
9150      * the very first element ('element0').  This function will set
9151      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9152      * are to be placed.
9153      *
9154      * Obviously there is some trust involved that the caller will properly
9155      * fill in the other elements of the array.
9156      *
9157      * (The first element needs to be passed in, as the underlying code does
9158      * things differently depending on whether it is zero or non-zero) */
9159
9160     SV* invlist = _new_invlist(size);
9161     bool offset;
9162
9163     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9164
9165     _append_range_to_invlist(invlist, element0, element0);
9166     offset = *get_invlist_offset_addr(invlist);
9167
9168     invlist_set_len(invlist, size, offset);
9169     *other_elements_ptr = invlist_array(invlist) + 1;
9170     return invlist;
9171 }
9172
9173 #endif
9174
9175 PERL_STATIC_INLINE SV*
9176 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9177     return _add_range_to_invlist(invlist, cp, cp);
9178 }
9179
9180 #ifndef PERL_IN_XSUB_RE
9181 void
9182 Perl__invlist_invert(pTHX_ SV* const invlist)
9183 {
9184     /* Complement the input inversion list.  This adds a 0 if the list didn't
9185      * have a zero; removes it otherwise.  As described above, the data
9186      * structure is set up so that this is very efficient */
9187
9188     PERL_ARGS_ASSERT__INVLIST_INVERT;
9189
9190     assert(! invlist_is_iterating(invlist));
9191
9192     /* The inverse of matching nothing is matching everything */
9193     if (_invlist_len(invlist) == 0) {
9194         _append_range_to_invlist(invlist, 0, UV_MAX);
9195         return;
9196     }
9197
9198     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9199 }
9200
9201 #endif
9202
9203 PERL_STATIC_INLINE SV*
9204 S_invlist_clone(pTHX_ SV* const invlist)
9205 {
9206
9207     /* Return a new inversion list that is a copy of the input one, which is
9208      * unchanged.  The new list will not be mortal even if the old one was. */
9209
9210     /* Need to allocate extra space to accommodate Perl's addition of a
9211      * trailing NUL to SvPV's, since it thinks they are always strings */
9212     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9213     STRLEN physical_length = SvCUR(invlist);
9214     bool offset = *(get_invlist_offset_addr(invlist));
9215
9216     PERL_ARGS_ASSERT_INVLIST_CLONE;
9217
9218     *(get_invlist_offset_addr(new_invlist)) = offset;
9219     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9220     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9221
9222     return new_invlist;
9223 }
9224
9225 PERL_STATIC_INLINE STRLEN*
9226 S_get_invlist_iter_addr(SV* invlist)
9227 {
9228     /* Return the address of the UV that contains the current iteration
9229      * position */
9230
9231     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9232
9233     assert(SvTYPE(invlist) == SVt_INVLIST);
9234
9235     return &(((XINVLIST*) SvANY(invlist))->iterator);
9236 }
9237
9238 PERL_STATIC_INLINE void
9239 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9240 {
9241     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9242
9243     *get_invlist_iter_addr(invlist) = 0;
9244 }
9245
9246 PERL_STATIC_INLINE void
9247 S_invlist_iterfinish(SV* invlist)
9248 {
9249     /* Terminate iterator for invlist.  This is to catch development errors.
9250      * Any iteration that is interrupted before completed should call this
9251      * function.  Functions that add code points anywhere else but to the end
9252      * of an inversion list assert that they are not in the middle of an
9253      * iteration.  If they were, the addition would make the iteration
9254      * problematical: if the iteration hadn't reached the place where things
9255      * were being added, it would be ok */
9256
9257     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9258
9259     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9260 }
9261
9262 STATIC bool
9263 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9264 {
9265     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9266      * This call sets in <*start> and <*end>, the next range in <invlist>.
9267      * Returns <TRUE> if successful and the next call will return the next
9268      * range; <FALSE> if was already at the end of the list.  If the latter,
9269      * <*start> and <*end> are unchanged, and the next call to this function
9270      * will start over at the beginning of the list */
9271
9272     STRLEN* pos = get_invlist_iter_addr(invlist);
9273     UV len = _invlist_len(invlist);
9274     UV *array;
9275
9276     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9277
9278     if (*pos >= len) {
9279         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9280         return FALSE;
9281     }
9282
9283     array = invlist_array(invlist);
9284
9285     *start = array[(*pos)++];
9286
9287     if (*pos >= len) {
9288         *end = UV_MAX;
9289     }
9290     else {
9291         *end = array[(*pos)++] - 1;
9292     }
9293
9294     return TRUE;
9295 }
9296
9297 PERL_STATIC_INLINE bool
9298 S_invlist_is_iterating(SV* const invlist)
9299 {
9300     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9301
9302     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9303 }
9304
9305 PERL_STATIC_INLINE UV
9306 S_invlist_highest(SV* const invlist)
9307 {
9308     /* Returns the highest code point that matches an inversion list.  This API
9309      * has an ambiguity, as it returns 0 under either the highest is actually
9310      * 0, or if the list is empty.  If this distinction matters to you, check
9311      * for emptiness before calling this function */
9312
9313     UV len = _invlist_len(invlist);
9314     UV *array;
9315
9316     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9317
9318     if (len == 0) {
9319         return 0;
9320     }
9321
9322     array = invlist_array(invlist);
9323
9324     /* The last element in the array in the inversion list always starts a
9325      * range that goes to infinity.  That range may be for code points that are
9326      * matched in the inversion list, or it may be for ones that aren't
9327      * matched.  In the latter case, the highest code point in the set is one
9328      * less than the beginning of this range; otherwise it is the final element
9329      * of this range: infinity */
9330     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9331            ? UV_MAX
9332            : array[len - 1] - 1;
9333 }
9334
9335 #ifndef PERL_IN_XSUB_RE
9336 SV *
9337 Perl__invlist_contents(pTHX_ SV* const invlist)
9338 {
9339     /* Get the contents of an inversion list into a string SV so that they can
9340      * be printed out.  It uses the format traditionally done for debug tracing
9341      */
9342
9343     UV start, end;
9344     SV* output = newSVpvs("\n");
9345
9346     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9347
9348     assert(! invlist_is_iterating(invlist));
9349
9350     invlist_iterinit(invlist);
9351     while (invlist_iternext(invlist, &start, &end)) {
9352         if (end == UV_MAX) {
9353             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9354         }
9355         else if (end != start) {
9356             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9357                     start,       end);
9358         }
9359         else {
9360             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9361         }
9362     }
9363
9364     return output;
9365 }
9366 #endif
9367
9368 #ifndef PERL_IN_XSUB_RE
9369 void
9370 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9371                          const char * const indent, SV* const invlist)
9372 {
9373     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9374      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9375      * the string 'indent'.  The output looks like this:
9376          [0] 0x000A .. 0x000D
9377          [2] 0x0085
9378          [4] 0x2028 .. 0x2029
9379          [6] 0x3104 .. INFINITY
9380      * This means that the first range of code points matched by the list are
9381      * 0xA through 0xD; the second range contains only the single code point
9382      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9383      * are used to define each range (except if the final range extends to
9384      * infinity, only a single element is needed).  The array index of the
9385      * first element for the corresponding range is given in brackets. */
9386
9387     UV start, end;
9388     STRLEN count = 0;
9389
9390     PERL_ARGS_ASSERT__INVLIST_DUMP;
9391
9392     if (invlist_is_iterating(invlist)) {
9393         Perl_dump_indent(aTHX_ level, file,
9394              "%sCan't dump inversion list because is in middle of iterating\n",
9395              indent);
9396         return;
9397     }
9398
9399     invlist_iterinit(invlist);
9400     while (invlist_iternext(invlist, &start, &end)) {
9401         if (end == UV_MAX) {
9402             Perl_dump_indent(aTHX_ level, file,
9403                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9404                                    indent, (UV)count, start);
9405         }
9406         else if (end != start) {
9407             Perl_dump_indent(aTHX_ level, file,
9408                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9409                                 indent, (UV)count, start,         end);
9410         }
9411         else {
9412             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9413                                             indent, (UV)count, start);
9414         }
9415         count += 2;
9416     }
9417 }
9418
9419 void
9420 Perl__load_PL_utf8_foldclosures (pTHX)
9421 {
9422     assert(! PL_utf8_foldclosures);
9423
9424     /* If the folds haven't been read in, call a fold function
9425      * to force that */
9426     if (! PL_utf8_tofold) {
9427         U8 dummy[UTF8_MAXBYTES_CASE+1];
9428
9429         /* This string is just a short named one above \xff */
9430         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9431         assert(PL_utf8_tofold); /* Verify that worked */
9432     }
9433     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9434 }
9435 #endif
9436
9437 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9438 bool
9439 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9440 {
9441     /* Return a boolean as to if the two passed in inversion lists are
9442      * identical.  The final argument, if TRUE, says to take the complement of
9443      * the second inversion list before doing the comparison */
9444
9445     const UV* array_a = invlist_array(a);
9446     const UV* array_b = invlist_array(b);
9447     UV len_a = _invlist_len(a);
9448     UV len_b = _invlist_len(b);
9449
9450     UV i = 0;               /* current index into the arrays */
9451     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9452
9453     PERL_ARGS_ASSERT__INVLISTEQ;
9454
9455     /* If are to compare 'a' with the complement of b, set it
9456      * up so are looking at b's complement. */
9457     if (complement_b) {
9458
9459         /* The complement of nothing is everything, so <a> would have to have
9460          * just one element, starting at zero (ending at infinity) */
9461         if (len_b == 0) {
9462             return (len_a == 1 && array_a[0] == 0);
9463         }
9464         else if (array_b[0] == 0) {
9465
9466             /* Otherwise, to complement, we invert.  Here, the first element is
9467              * 0, just remove it.  To do this, we just pretend the array starts
9468              * one later */
9469
9470             array_b++;
9471             len_b--;
9472         }
9473         else {
9474
9475             /* But if the first element is not zero, we pretend the list starts
9476              * at the 0 that is always stored immediately before the array. */
9477             array_b--;
9478             len_b++;
9479         }
9480     }
9481
9482     /* Make sure that the lengths are the same, as well as the final element
9483      * before looping through the remainder.  (Thus we test the length, final,
9484      * and first elements right off the bat) */
9485     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9486         retval = FALSE;
9487     }
9488     else for (i = 0; i < len_a - 1; i++) {
9489         if (array_a[i] != array_b[i]) {
9490             retval = FALSE;
9491             break;
9492         }
9493     }
9494
9495     return retval;
9496 }
9497 #endif
9498
9499 #undef HEADER_LENGTH
9500 #undef TO_INTERNAL_SIZE
9501 #undef FROM_INTERNAL_SIZE
9502 #undef INVLIST_VERSION_ID
9503
9504 /* End of inversion list object */
9505
9506 STATIC void
9507 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9508 {
9509     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9510      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9511      * should point to the first flag; it is updated on output to point to the
9512      * final ')' or ':'.  There needs to be at least one flag, or this will
9513      * abort */
9514
9515     /* for (?g), (?gc), and (?o) warnings; warning
9516        about (?c) will warn about (?g) -- japhy    */
9517
9518 #define WASTED_O  0x01
9519 #define WASTED_G  0x02
9520 #define WASTED_C  0x04
9521 #define WASTED_GC (WASTED_G|WASTED_C)
9522     I32 wastedflags = 0x00;
9523     U32 posflags = 0, negflags = 0;
9524     U32 *flagsp = &posflags;
9525     char has_charset_modifier = '\0';
9526     regex_charset cs;
9527     bool has_use_defaults = FALSE;
9528     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9529     int x_mod_count = 0;
9530
9531     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9532
9533     /* '^' as an initial flag sets certain defaults */
9534     if (UCHARAT(RExC_parse) == '^') {
9535         RExC_parse++;
9536         has_use_defaults = TRUE;
9537         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9538         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9539                                         ? REGEX_UNICODE_CHARSET
9540                                         : REGEX_DEPENDS_CHARSET);
9541     }
9542
9543     cs = get_regex_charset(RExC_flags);
9544     if (cs == REGEX_DEPENDS_CHARSET
9545         && (RExC_utf8 || RExC_uni_semantics))
9546     {
9547         cs = REGEX_UNICODE_CHARSET;
9548     }
9549
9550     while (*RExC_parse) {
9551         /* && strchr("iogcmsx", *RExC_parse) */
9552         /* (?g), (?gc) and (?o) are useless here
9553            and must be globally applied -- japhy */
9554         switch (*RExC_parse) {
9555
9556             /* Code for the imsx flags */
9557             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9558
9559             case LOCALE_PAT_MOD:
9560                 if (has_charset_modifier) {
9561                     goto excess_modifier;
9562                 }
9563                 else if (flagsp == &negflags) {
9564                     goto neg_modifier;
9565                 }
9566                 cs = REGEX_LOCALE_CHARSET;
9567                 has_charset_modifier = LOCALE_PAT_MOD;
9568                 break;
9569             case UNICODE_PAT_MOD:
9570                 if (has_charset_modifier) {
9571                     goto excess_modifier;
9572                 }
9573                 else if (flagsp == &negflags) {
9574                     goto neg_modifier;
9575                 }
9576                 cs = REGEX_UNICODE_CHARSET;
9577                 has_charset_modifier = UNICODE_PAT_MOD;
9578                 break;
9579             case ASCII_RESTRICT_PAT_MOD:
9580                 if (flagsp == &negflags) {
9581                     goto neg_modifier;
9582                 }
9583                 if (has_charset_modifier) {
9584                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9585                         goto excess_modifier;
9586                     }
9587                     /* Doubled modifier implies more restricted */
9588                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9589                 }
9590                 else {
9591                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9592                 }
9593                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9594                 break;
9595             case DEPENDS_PAT_MOD:
9596                 if (has_use_defaults) {
9597                     goto fail_modifiers;
9598                 }
9599                 else if (flagsp == &negflags) {
9600                     goto neg_modifier;
9601                 }
9602                 else if (has_charset_modifier) {
9603                     goto excess_modifier;
9604                 }
9605
9606                 /* The dual charset means unicode semantics if the
9607                  * pattern (or target, not known until runtime) are
9608                  * utf8, or something in the pattern indicates unicode
9609                  * semantics */
9610                 cs = (RExC_utf8 || RExC_uni_semantics)
9611                      ? REGEX_UNICODE_CHARSET
9612                      : REGEX_DEPENDS_CHARSET;
9613                 has_charset_modifier = DEPENDS_PAT_MOD;
9614                 break;
9615             excess_modifier:
9616                 RExC_parse++;
9617                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9618                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9619                 }
9620                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9621                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9622                                         *(RExC_parse - 1));
9623                 }
9624                 else {
9625                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9626                 }
9627                 /*NOTREACHED*/
9628             neg_modifier:
9629                 RExC_parse++;
9630                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9631                                     *(RExC_parse - 1));
9632                 /*NOTREACHED*/
9633             case ONCE_PAT_MOD: /* 'o' */
9634             case GLOBAL_PAT_MOD: /* 'g' */
9635                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9636                     const I32 wflagbit = *RExC_parse == 'o'
9637                                          ? WASTED_O
9638                                          : WASTED_G;
9639                     if (! (wastedflags & wflagbit) ) {
9640                         wastedflags |= wflagbit;
9641                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9642                         vWARN5(
9643                             RExC_parse + 1,
9644                             "Useless (%s%c) - %suse /%c modifier",
9645                             flagsp == &negflags ? "?-" : "?",
9646                             *RExC_parse,
9647                             flagsp == &negflags ? "don't " : "",
9648                             *RExC_parse
9649                         );
9650                     }
9651                 }
9652                 break;
9653
9654             case CONTINUE_PAT_MOD: /* 'c' */
9655                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9656                     if (! (wastedflags & WASTED_C) ) {
9657                         wastedflags |= WASTED_GC;
9658                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9659                         vWARN3(
9660                             RExC_parse + 1,
9661                             "Useless (%sc) - %suse /gc modifier",
9662                             flagsp == &negflags ? "?-" : "?",
9663                             flagsp == &negflags ? "don't " : ""
9664                         );
9665                     }
9666                 }
9667                 break;
9668             case KEEPCOPY_PAT_MOD: /* 'p' */
9669                 if (flagsp == &negflags) {
9670                     if (PASS2)
9671                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9672                 } else {
9673                     *flagsp |= RXf_PMf_KEEPCOPY;
9674                 }
9675                 break;
9676             case '-':
9677                 /* A flag is a default iff it is following a minus, so
9678                  * if there is a minus, it means will be trying to
9679                  * re-specify a default which is an error */
9680                 if (has_use_defaults || flagsp == &negflags) {
9681                     goto fail_modifiers;
9682                 }
9683                 flagsp = &negflags;
9684                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9685                 break;
9686             case ':':
9687             case ')':
9688                 RExC_flags |= posflags;
9689                 RExC_flags &= ~negflags;
9690                 set_regex_charset(&RExC_flags, cs);
9691                 if (RExC_flags & RXf_PMf_FOLD) {
9692                     RExC_contains_i = 1;
9693                 }
9694                 if (PASS2) {
9695                     STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9696                 }
9697                 return;
9698                 /*NOTREACHED*/
9699             default:
9700             fail_modifiers:
9701                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9702                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9703                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9704                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9705                 /*NOTREACHED*/
9706         }
9707
9708         ++RExC_parse;
9709     }
9710
9711     if (PASS2) {
9712         STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9713     }
9714 }
9715
9716 /*
9717  - reg - regular expression, i.e. main body or parenthesized thing
9718  *
9719  * Caller must absorb opening parenthesis.
9720  *
9721  * Combining parenthesis handling with the base level of regular expression
9722  * is a trifle forced, but the need to tie the tails of the branches to what
9723  * follows makes it hard to avoid.
9724  */
9725 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9726 #ifdef DEBUGGING
9727 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9728 #else
9729 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9730 #endif
9731
9732 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9733    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9734    needs to be restarted.
9735    Otherwise would only return NULL if regbranch() returns NULL, which
9736    cannot happen.  */
9737 STATIC regnode *
9738 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9739     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9740      * 2 is like 1, but indicates that nextchar() has been called to advance
9741      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9742      * this flag alerts us to the need to check for that */
9743 {
9744     regnode *ret;               /* Will be the head of the group. */
9745     regnode *br;
9746     regnode *lastbr;
9747     regnode *ender = NULL;
9748     I32 parno = 0;
9749     I32 flags;
9750     U32 oregflags = RExC_flags;
9751     bool have_branch = 0;
9752     bool is_open = 0;
9753     I32 freeze_paren = 0;
9754     I32 after_freeze = 0;
9755     I32 num; /* numeric backreferences */
9756
9757     char * parse_start = RExC_parse; /* MJD */
9758     char * const oregcomp_parse = RExC_parse;
9759
9760     GET_RE_DEBUG_FLAGS_DECL;
9761
9762     PERL_ARGS_ASSERT_REG;
9763     DEBUG_PARSE("reg ");
9764
9765     *flagp = 0;                         /* Tentatively. */
9766
9767
9768     /* Make an OPEN node, if parenthesized. */
9769     if (paren) {
9770
9771         /* Under /x, space and comments can be gobbled up between the '(' and
9772          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9773          * intervening space, as the sequence is a token, and a token should be
9774          * indivisible */
9775         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9776
9777         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9778             char *start_verb = RExC_parse;
9779             STRLEN verb_len = 0;
9780             char *start_arg = NULL;
9781             unsigned char op = 0;
9782             int argok = 1;
9783             int internal_argval = 0; /* internal_argval is only useful if
9784                                         !argok */
9785
9786             if (has_intervening_patws) {
9787                 RExC_parse++;
9788                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9789             }
9790             while ( *RExC_parse && *RExC_parse != ')' ) {
9791                 if ( *RExC_parse == ':' ) {
9792                     start_arg = RExC_parse + 1;
9793                     break;
9794                 }
9795                 RExC_parse++;
9796             }
9797             ++start_verb;
9798             verb_len = RExC_parse - start_verb;
9799             if ( start_arg ) {
9800                 RExC_parse++;
9801                 while ( *RExC_parse && *RExC_parse != ')' )
9802                     RExC_parse++;
9803                 if ( *RExC_parse != ')' )
9804                     vFAIL("Unterminated verb pattern argument");
9805                 if ( RExC_parse == start_arg )
9806                     start_arg = NULL;
9807             } else {
9808                 if ( *RExC_parse != ')' )
9809                     vFAIL("Unterminated verb pattern");
9810             }
9811
9812             switch ( *start_verb ) {
9813             case 'A':  /* (*ACCEPT) */
9814                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9815                     op = ACCEPT;
9816                     internal_argval = RExC_nestroot;
9817                 }
9818                 break;
9819             case 'C':  /* (*COMMIT) */
9820                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9821                     op = COMMIT;
9822                 break;
9823             case 'F':  /* (*FAIL) */
9824                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9825                     op = OPFAIL;
9826                     argok = 0;
9827                 }
9828                 break;
9829             case ':':  /* (*:NAME) */
9830             case 'M':  /* (*MARK:NAME) */
9831                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9832                     op = MARKPOINT;
9833                     argok = -1;
9834                 }
9835                 break;
9836             case 'P':  /* (*PRUNE) */
9837                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9838                     op = PRUNE;
9839                 break;
9840             case 'S':   /* (*SKIP) */
9841                 if ( memEQs(start_verb,verb_len,"SKIP") )
9842                     op = SKIP;
9843                 break;
9844             case 'T':  /* (*THEN) */
9845                 /* [19:06] <TimToady> :: is then */
9846                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9847                     op = CUTGROUP;
9848                     RExC_seen |= REG_CUTGROUP_SEEN;
9849                 }
9850                 break;
9851             }
9852             if ( ! op ) {
9853                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9854                 vFAIL2utf8f(
9855                     "Unknown verb pattern '%"UTF8f"'",
9856                     UTF8fARG(UTF, verb_len, start_verb));
9857             }
9858             if ( argok ) {
9859                 if ( start_arg && internal_argval ) {
9860                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9861                         verb_len, start_verb);
9862                 } else if ( argok < 0 && !start_arg ) {
9863                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9864                         verb_len, start_verb);
9865                 } else {
9866                     ret = reganode(pRExC_state, op, internal_argval);
9867                     if ( ! internal_argval && ! SIZE_ONLY ) {
9868                         if (start_arg) {
9869                             SV *sv = newSVpvn( start_arg,
9870                                                RExC_parse - start_arg);
9871                             ARG(ret) = add_data( pRExC_state,
9872                                                  STR_WITH_LEN("S"));
9873                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9874                             ret->flags = 0;
9875                         } else {
9876                             ret->flags = 1;
9877                         }
9878                     }
9879                 }
9880                 if (!internal_argval)
9881                     RExC_seen |= REG_VERBARG_SEEN;
9882             } else if ( start_arg ) {
9883                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9884                         verb_len, start_verb);
9885             } else {
9886                 ret = reg_node(pRExC_state, op);
9887             }
9888             nextchar(pRExC_state);
9889             return ret;
9890         }
9891         else if (*RExC_parse == '?') { /* (?...) */
9892             bool is_logical = 0;
9893             const char * const seqstart = RExC_parse;
9894             const char * endptr;
9895             if (has_intervening_patws) {
9896                 RExC_parse++;
9897                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9898             }
9899
9900             RExC_parse++;
9901             paren = *RExC_parse++;
9902             ret = NULL;                 /* For look-ahead/behind. */
9903             switch (paren) {
9904
9905             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9906                 paren = *RExC_parse++;
9907                 if ( paren == '<')         /* (?P<...>) named capture */
9908                     goto named_capture;
9909                 else if (paren == '>') {   /* (?P>name) named recursion */
9910                     goto named_recursion;
9911                 }
9912                 else if (paren == '=') {   /* (?P=...)  named backref */
9913                     /* this pretty much dupes the code for \k<NAME> in
9914                      * regatom(), if you change this make sure you change that
9915                      * */
9916                     char* name_start = RExC_parse;
9917                     U32 num = 0;
9918                     SV *sv_dat = reg_scan_name(pRExC_state,
9919                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9920                     if (RExC_parse == name_start || *RExC_parse != ')')
9921                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9922                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9923
9924                     if (!SIZE_ONLY) {
9925                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9926                         RExC_rxi->data->data[num]=(void*)sv_dat;
9927                         SvREFCNT_inc_simple_void(sv_dat);
9928                     }
9929                     RExC_sawback = 1;
9930                     ret = reganode(pRExC_state,
9931                                    ((! FOLD)
9932                                      ? NREF
9933                                      : (ASCII_FOLD_RESTRICTED)
9934                                        ? NREFFA
9935                                        : (AT_LEAST_UNI_SEMANTICS)
9936                                          ? NREFFU
9937                                          : (LOC)
9938                                            ? NREFFL
9939                                            : NREFF),
9940                                     num);
9941                     *flagp |= HASWIDTH;
9942
9943                     Set_Node_Offset(ret, parse_start+1);
9944                     Set_Node_Cur_Length(ret, parse_start);
9945
9946                     nextchar(pRExC_state);
9947                     return ret;
9948                 }
9949                 RExC_parse++;
9950                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9951                 vFAIL3("Sequence (%.*s...) not recognized",
9952                                 RExC_parse-seqstart, seqstart);
9953                 /*NOTREACHED*/
9954             case '<':           /* (?<...) */
9955                 if (*RExC_parse == '!')
9956                     paren = ',';
9957                 else if (*RExC_parse != '=')
9958               named_capture:
9959                 {               /* (?<...>) */
9960                     char *name_start;
9961                     SV *svname;
9962                     paren= '>';
9963             case '\'':          /* (?'...') */
9964                     name_start= RExC_parse;
9965                     svname = reg_scan_name(pRExC_state,
9966                         SIZE_ONLY    /* reverse test from the others */
9967                         ? REG_RSN_RETURN_NAME
9968                         : REG_RSN_RETURN_NULL);
9969                     if (RExC_parse == name_start || *RExC_parse != paren)
9970                         vFAIL2("Sequence (?%c... not terminated",
9971                             paren=='>' ? '<' : paren);
9972                     if (SIZE_ONLY) {
9973                         HE *he_str;
9974                         SV *sv_dat = NULL;
9975                         if (!svname) /* shouldn't happen */
9976                             Perl_croak(aTHX_
9977                                 "panic: reg_scan_name returned NULL");
9978                         if (!RExC_paren_names) {
9979                             RExC_paren_names= newHV();
9980                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9981 #ifdef DEBUGGING
9982                             RExC_paren_name_list= newAV();
9983                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9984 #endif
9985                         }
9986                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9987                         if ( he_str )
9988                             sv_dat = HeVAL(he_str);
9989                         if ( ! sv_dat ) {
9990                             /* croak baby croak */
9991                             Perl_croak(aTHX_
9992                                 "panic: paren_name hash element allocation failed");
9993                         } else if ( SvPOK(sv_dat) ) {
9994                             /* (?|...) can mean we have dupes so scan to check
9995                                its already been stored. Maybe a flag indicating
9996                                we are inside such a construct would be useful,
9997                                but the arrays are likely to be quite small, so
9998                                for now we punt -- dmq */
9999                             IV count = SvIV(sv_dat);
10000                             I32 *pv = (I32*)SvPVX(sv_dat);
10001                             IV i;
10002                             for ( i = 0 ; i < count ; i++ ) {
10003                                 if ( pv[i] == RExC_npar ) {
10004                                     count = 0;
10005                                     break;
10006                                 }
10007                             }
10008                             if ( count ) {
10009                                 pv = (I32*)SvGROW(sv_dat,
10010                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10011                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10012                                 pv[count] = RExC_npar;
10013                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10014                             }
10015                         } else {
10016                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10017                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10018                                                                 sizeof(I32));
10019                             SvIOK_on(sv_dat);
10020                             SvIV_set(sv_dat, 1);
10021                         }
10022 #ifdef DEBUGGING
10023                         /* Yes this does cause a memory leak in debugging Perls
10024                          * */
10025                         if (!av_store(RExC_paren_name_list,
10026                                       RExC_npar, SvREFCNT_inc(svname)))
10027                             SvREFCNT_dec_NN(svname);
10028 #endif
10029
10030                         /*sv_dump(sv_dat);*/
10031                     }
10032                     nextchar(pRExC_state);
10033                     paren = 1;
10034                     goto capturing_parens;
10035                 }
10036                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10037                 RExC_in_lookbehind++;
10038                 RExC_parse++;
10039                 /* FALLTHROUGH */
10040             case '=':           /* (?=...) */
10041                 RExC_seen_zerolen++;
10042                 break;
10043             case '!':           /* (?!...) */
10044                 RExC_seen_zerolen++;
10045                 if (*RExC_parse == ')') {
10046                     ret=reg_node(pRExC_state, OPFAIL);
10047                     nextchar(pRExC_state);
10048                     return ret;
10049                 }
10050                 break;
10051             case '|':           /* (?|...) */
10052                 /* branch reset, behave like a (?:...) except that
10053                    buffers in alternations share the same numbers */
10054                 paren = ':';
10055                 after_freeze = freeze_paren = RExC_npar;
10056                 break;
10057             case ':':           /* (?:...) */
10058             case '>':           /* (?>...) */
10059                 break;
10060             case '$':           /* (?$...) */
10061             case '@':           /* (?@...) */
10062                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10063                 break;
10064             case '0' :           /* (?0) */
10065             case 'R' :           /* (?R) */
10066                 if (*RExC_parse != ')')
10067                     FAIL("Sequence (?R) not terminated");
10068                 ret = reg_node(pRExC_state, GOSTART);
10069                     RExC_seen |= REG_GOSTART_SEEN;
10070                 *flagp |= POSTPONED;
10071                 nextchar(pRExC_state);
10072                 return ret;
10073                 /*notreached*/
10074             /* named and numeric backreferences */
10075             case '&':            /* (?&NAME) */
10076                 parse_start = RExC_parse - 1;
10077               named_recursion:
10078                 {
10079                     SV *sv_dat = reg_scan_name(pRExC_state,
10080                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10081                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10082                 }
10083                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10084                     vFAIL("Sequence (?&... not terminated");
10085                 goto gen_recurse_regop;
10086                 assert(0); /* NOT REACHED */
10087             case '+':
10088                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10089                     RExC_parse++;
10090                     vFAIL("Illegal pattern");
10091                 }
10092                 goto parse_recursion;
10093                 /* NOT REACHED*/
10094             case '-': /* (?-1) */
10095                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10096                     RExC_parse--; /* rewind to let it be handled later */
10097                     goto parse_flags;
10098                 }
10099                 /* FALLTHROUGH */
10100             case '1': case '2': case '3': case '4': /* (?1) */
10101             case '5': case '6': case '7': case '8': case '9':
10102                 RExC_parse--;
10103               parse_recursion:
10104                 {
10105                     bool is_neg = FALSE;
10106                     parse_start = RExC_parse - 1; /* MJD */
10107                     if (*RExC_parse == '-') {
10108                         RExC_parse++;
10109                         is_neg = TRUE;
10110                     }
10111                     num = grok_atou(RExC_parse, &endptr);
10112                     if (endptr)
10113                         RExC_parse = (char*)endptr;
10114                     if (is_neg) {
10115                         /* Some limit for num? */
10116                         num = -num;
10117                     }
10118                 }
10119                 if (*RExC_parse!=')')
10120                     vFAIL("Expecting close bracket");
10121
10122               gen_recurse_regop:
10123                 if ( paren == '-' ) {
10124                     /*
10125                     Diagram of capture buffer numbering.
10126                     Top line is the normal capture buffer numbers
10127                     Bottom line is the negative indexing as from
10128                     the X (the (?-2))
10129
10130                     +   1 2    3 4 5 X          6 7
10131                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10132                     -   5 4    3 2 1 X          x x
10133
10134                     */
10135                     num = RExC_npar + num;
10136                     if (num < 1)  {
10137                         RExC_parse++;
10138                         vFAIL("Reference to nonexistent group");
10139                     }
10140                 } else if ( paren == '+' ) {
10141                     num = RExC_npar + num - 1;
10142                 }
10143
10144                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10145                 if (!SIZE_ONLY) {
10146                     if (num > (I32)RExC_rx->nparens) {
10147                         RExC_parse++;
10148                         vFAIL("Reference to nonexistent group");
10149                     }
10150                     RExC_recurse_count++;
10151                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10152                         "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10153                               22, "|    |", (int)(depth * 2 + 1), "",
10154                               (UV)ARG(ret), (IV)ARG2L(ret)));
10155                 }
10156                 RExC_seen |= REG_RECURSE_SEEN;
10157                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10158                 Set_Node_Offset(ret, parse_start); /* MJD */
10159
10160                 *flagp |= POSTPONED;
10161                 nextchar(pRExC_state);
10162                 return ret;
10163
10164             assert(0); /* NOT REACHED */
10165
10166             case '?':           /* (??...) */
10167                 is_logical = 1;
10168                 if (*RExC_parse != '{') {
10169                     RExC_parse++;
10170                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10171                     vFAIL2utf8f(
10172                         "Sequence (%"UTF8f"...) not recognized",
10173                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10174                     /*NOTREACHED*/
10175                 }
10176                 *flagp |= POSTPONED;
10177                 paren = *RExC_parse++;
10178                 /* FALLTHROUGH */
10179             case '{':           /* (?{...}) */
10180             {
10181                 U32 n = 0;
10182                 struct reg_code_block *cb;
10183
10184                 RExC_seen_zerolen++;
10185
10186                 if (   !pRExC_state->num_code_blocks
10187                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
10188                     || pRExC_state->code_blocks[pRExC_state->code_index].start
10189                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10190                             - RExC_start)
10191                 ) {
10192                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
10193                         FAIL("panic: Sequence (?{...}): no code block found\n");
10194                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
10195                 }
10196                 /* this is a pre-compiled code block (?{...}) */
10197                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10198                 RExC_parse = RExC_start + cb->end;
10199                 if (!SIZE_ONLY) {
10200                     OP *o = cb->block;
10201                     if (cb->src_regex) {
10202                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10203                         RExC_rxi->data->data[n] =
10204                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
10205                         RExC_rxi->data->data[n+1] = (void*)o;
10206                     }
10207                     else {
10208                         n = add_data(pRExC_state,
10209                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10210                         RExC_rxi->data->data[n] = (void*)o;
10211                     }
10212                 }
10213                 pRExC_state->code_index++;
10214                 nextchar(pRExC_state);
10215
10216                 if (is_logical) {
10217                     regnode *eval;
10218                     ret = reg_node(pRExC_state, LOGICAL);
10219
10220                     eval = reg2Lanode(pRExC_state, EVAL,
10221                                        n,
10222
10223                                        /* for later propagation into (??{})
10224                                         * return value */
10225                                        RExC_flags & RXf_PMf_COMPILETIME
10226                                       );
10227                     if (!SIZE_ONLY) {
10228                         ret->flags = 2;
10229                     }
10230                     REGTAIL(pRExC_state, ret, eval);
10231                     /* deal with the length of this later - MJD */
10232                     return ret;
10233                 }
10234                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10235                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10236                 Set_Node_Offset(ret, parse_start);
10237                 return ret;
10238             }
10239             case '(':           /* (?(?{...})...) and (?(?=...)...) */
10240             {
10241                 int is_define= 0;
10242                 const int DEFINE_len = sizeof("DEFINE") - 1;
10243                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
10244                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10245                         || RExC_parse[1] == '<'
10246                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
10247                         I32 flag;
10248                         regnode *tail;
10249
10250                         ret = reg_node(pRExC_state, LOGICAL);
10251                         if (!SIZE_ONLY)
10252                             ret->flags = 1;
10253
10254                         tail = reg(pRExC_state, 1, &flag, depth+1);
10255                         if (flag & RESTART_UTF8) {
10256                             *flagp = RESTART_UTF8;
10257                             return NULL;
10258                         }
10259                         REGTAIL(pRExC_state, ret, tail);
10260                         goto insert_if;
10261                     }
10262                     /* Fall through to ‘Unknown switch condition’ at the
10263                        end of the if/else chain. */
10264                 }
10265                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10266                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10267                 {
10268                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
10269                     char *name_start= RExC_parse++;
10270                     U32 num = 0;
10271                     SV *sv_dat=reg_scan_name(pRExC_state,
10272                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10273                     if (RExC_parse == name_start || *RExC_parse != ch)
10274                         vFAIL2("Sequence (?(%c... not terminated",
10275                             (ch == '>' ? '<' : ch));
10276                     RExC_parse++;
10277                     if (!SIZE_ONLY) {
10278                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10279                         RExC_rxi->data->data[num]=(void*)sv_dat;
10280                         SvREFCNT_inc_simple_void(sv_dat);
10281                     }
10282                     ret = reganode(pRExC_state,NGROUPP,num);
10283                     goto insert_if_check_paren;
10284                 }
10285                 else if (strnEQ(RExC_parse, "DEFINE",
10286                                        MIN(DEFINE_len, RExC_end - RExC_parse)))
10287                 {
10288                     ret = reganode(pRExC_state,DEFINEP,0);
10289                     RExC_parse += DEFINE_len;
10290                     is_define = 1;
10291                     goto insert_if_check_paren;
10292                 }
10293                 else if (RExC_parse[0] == 'R') {
10294                     RExC_parse++;
10295                     parno = 0;
10296                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10297                         parno = grok_atou(RExC_parse, &endptr);
10298                         if (endptr)
10299                             RExC_parse = (char*)endptr;
10300                     } else if (RExC_parse[0] == '&') {
10301                         SV *sv_dat;
10302                         RExC_parse++;
10303                         sv_dat = reg_scan_name(pRExC_state,
10304                             SIZE_ONLY
10305                             ? REG_RSN_RETURN_NULL
10306                             : REG_RSN_RETURN_DATA);
10307                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10308                     }
10309                     ret = reganode(pRExC_state,INSUBP,parno);
10310                     goto insert_if_check_paren;
10311                 }
10312                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10313                     /* (?(1)...) */
10314                     char c;
10315                     char *tmp;
10316                     parno = grok_atou(RExC_parse, &endptr);
10317                     if (endptr)
10318                         RExC_parse = (char*)endptr;
10319                     ret = reganode(pRExC_state, GROUPP, parno);
10320
10321                  insert_if_check_paren:
10322                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10323                         /* nextchar also skips comments, so undo its work
10324                          * and skip over the the next character.
10325                          */
10326                         RExC_parse = tmp;
10327                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10328                         vFAIL("Switch condition not recognized");
10329                     }
10330                   insert_if:
10331                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10332                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10333                     if (br == NULL) {
10334                         if (flags & RESTART_UTF8) {
10335                             *flagp = RESTART_UTF8;
10336                             return NULL;
10337                         }
10338                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10339                               (UV) flags);
10340                     } else
10341                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10342                                                           LONGJMP, 0));
10343                     c = *nextchar(pRExC_state);
10344                     if (flags&HASWIDTH)
10345                         *flagp |= HASWIDTH;
10346                     if (c == '|') {
10347                         if (is_define)
10348                             vFAIL("(?(DEFINE)....) does not allow branches");
10349
10350                         /* Fake one for optimizer.  */
10351                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10352
10353                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10354                             if (flags & RESTART_UTF8) {
10355                                 *flagp = RESTART_UTF8;
10356                                 return NULL;
10357                             }
10358                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10359                                   (UV) flags);
10360                         }
10361                         REGTAIL(pRExC_state, ret, lastbr);
10362                         if (flags&HASWIDTH)
10363                             *flagp |= HASWIDTH;
10364                         c = *nextchar(pRExC_state);
10365                     }
10366                     else
10367                         lastbr = NULL;
10368                     if (c != ')') {
10369                         if (RExC_parse>RExC_end)
10370                             vFAIL("Switch (?(condition)... not terminated");
10371                         else
10372                             vFAIL("Switch (?(condition)... contains too many branches");
10373                     }
10374                     ender = reg_node(pRExC_state, TAIL);
10375                     REGTAIL(pRExC_state, br, ender);
10376                     if (lastbr) {
10377                         REGTAIL(pRExC_state, lastbr, ender);
10378                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10379                     }
10380                     else
10381                         REGTAIL(pRExC_state, ret, ender);
10382                     RExC_size++; /* XXX WHY do we need this?!!
10383                                     For large programs it seems to be required
10384                                     but I can't figure out why. -- dmq*/
10385                     return ret;
10386                 }
10387                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10388                 vFAIL("Unknown switch condition (?(...))");
10389             }
10390             case '[':           /* (?[ ... ]) */
10391                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10392                                          oregcomp_parse);
10393             case 0:
10394                 RExC_parse--; /* for vFAIL to print correctly */
10395                 vFAIL("Sequence (? incomplete");
10396                 break;
10397             default: /* e.g., (?i) */
10398                 --RExC_parse;
10399               parse_flags:
10400                 parse_lparen_question_flags(pRExC_state);
10401                 if (UCHARAT(RExC_parse) != ':') {
10402                     nextchar(pRExC_state);
10403                     *flagp = TRYAGAIN;
10404                     return NULL;
10405                 }
10406                 paren = ':';
10407                 nextchar(pRExC_state);
10408                 ret = NULL;
10409                 goto parse_rest;
10410             } /* end switch */
10411         }
10412         else {                  /* (...) */
10413           capturing_parens:
10414             parno = RExC_npar;
10415             RExC_npar++;
10416
10417             ret = reganode(pRExC_state, OPEN, parno);
10418             if (!SIZE_ONLY ){
10419                 if (!RExC_nestroot)
10420                     RExC_nestroot = parno;
10421                 if (RExC_seen & REG_RECURSE_SEEN
10422                     && !RExC_open_parens[parno-1])
10423                 {
10424                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10425                         "%*s%*s Setting open paren #%"IVdf" to %d\n",
10426                         22, "|    |", (int)(depth * 2 + 1), "",
10427                         (IV)parno, REG_NODE_NUM(ret)));
10428                     RExC_open_parens[parno-1]= ret;
10429                 }
10430             }
10431             Set_Node_Length(ret, 1); /* MJD */
10432             Set_Node_Offset(ret, RExC_parse); /* MJD */
10433             is_open = 1;
10434         }
10435     }
10436     else                        /* ! paren */
10437         ret = NULL;
10438
10439    parse_rest:
10440     /* Pick up the branches, linking them together. */
10441     parse_start = RExC_parse;   /* MJD */
10442     br = regbranch(pRExC_state, &flags, 1,depth+1);
10443
10444     /*     branch_len = (paren != 0); */
10445
10446     if (br == NULL) {
10447         if (flags & RESTART_UTF8) {
10448             *flagp = RESTART_UTF8;
10449             return NULL;
10450         }
10451         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10452     }
10453     if (*RExC_parse == '|') {
10454         if (!SIZE_ONLY && RExC_extralen) {
10455             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10456         }
10457         else {                  /* MJD */
10458             reginsert(pRExC_state, BRANCH, br, depth+1);
10459             Set_Node_Length(br, paren != 0);
10460             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10461         }
10462         have_branch = 1;
10463         if (SIZE_ONLY)
10464             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10465     }
10466     else if (paren == ':') {
10467         *flagp |= flags&SIMPLE;
10468     }
10469     if (is_open) {                              /* Starts with OPEN. */
10470         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10471     }
10472     else if (paren != '?')              /* Not Conditional */
10473         ret = br;
10474     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10475     lastbr = br;
10476     while (*RExC_parse == '|') {
10477         if (!SIZE_ONLY && RExC_extralen) {
10478             ender = reganode(pRExC_state, LONGJMP,0);
10479
10480             /* Append to the previous. */
10481             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10482         }
10483         if (SIZE_ONLY)
10484             RExC_extralen += 2;         /* Account for LONGJMP. */
10485         nextchar(pRExC_state);
10486         if (freeze_paren) {
10487             if (RExC_npar > after_freeze)
10488                 after_freeze = RExC_npar;
10489             RExC_npar = freeze_paren;
10490         }
10491         br = regbranch(pRExC_state, &flags, 0, depth+1);
10492
10493         if (br == NULL) {
10494             if (flags & RESTART_UTF8) {
10495                 *flagp = RESTART_UTF8;
10496                 return NULL;
10497             }
10498             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10499         }
10500         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10501         lastbr = br;
10502         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10503     }
10504
10505     if (have_branch || paren != ':') {
10506         /* Make a closing node, and hook it on the end. */
10507         switch (paren) {
10508         case ':':
10509             ender = reg_node(pRExC_state, TAIL);
10510             break;
10511         case 1: case 2:
10512             ender = reganode(pRExC_state, CLOSE, parno);
10513             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10514                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10515                         "%*s%*s Setting close paren #%"IVdf" to %d\n",
10516                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10517                 RExC_close_parens[parno-1]= ender;
10518                 if (RExC_nestroot == parno)
10519                     RExC_nestroot = 0;
10520             }
10521             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10522             Set_Node_Length(ender,1); /* MJD */
10523             break;
10524         case '<':
10525         case ',':
10526         case '=':
10527         case '!':
10528             *flagp &= ~HASWIDTH;
10529             /* FALLTHROUGH */
10530         case '>':
10531             ender = reg_node(pRExC_state, SUCCEED);
10532             break;
10533         case 0:
10534             ender = reg_node(pRExC_state, END);
10535             if (!SIZE_ONLY) {
10536                 assert(!RExC_opend); /* there can only be one! */
10537                 RExC_opend = ender;
10538             }
10539             break;
10540         }
10541         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10542             DEBUG_PARSE_MSG("lsbr");
10543             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10544             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10545             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10546                           SvPV_nolen_const(RExC_mysv1),
10547                           (IV)REG_NODE_NUM(lastbr),
10548                           SvPV_nolen_const(RExC_mysv2),
10549                           (IV)REG_NODE_NUM(ender),
10550                           (IV)(ender - lastbr)
10551             );
10552         });
10553         REGTAIL(pRExC_state, lastbr, ender);
10554
10555         if (have_branch && !SIZE_ONLY) {
10556             char is_nothing= 1;
10557             if (depth==1)
10558                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10559
10560             /* Hook the tails of the branches to the closing node. */
10561             for (br = ret; br; br = regnext(br)) {
10562                 const U8 op = PL_regkind[OP(br)];
10563                 if (op == BRANCH) {
10564                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10565                     if ( OP(NEXTOPER(br)) != NOTHING
10566                          || regnext(NEXTOPER(br)) != ender)
10567                         is_nothing= 0;
10568                 }
10569                 else if (op == BRANCHJ) {
10570                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10571                     /* for now we always disable this optimisation * /
10572                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10573                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10574                     */
10575                         is_nothing= 0;
10576                 }
10577             }
10578             if (is_nothing) {
10579                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10580                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10581                     DEBUG_PARSE_MSG("NADA");
10582                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10583                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10584                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10585                                   SvPV_nolen_const(RExC_mysv1),
10586                                   (IV)REG_NODE_NUM(ret),
10587                                   SvPV_nolen_const(RExC_mysv2),
10588                                   (IV)REG_NODE_NUM(ender),
10589                                   (IV)(ender - ret)
10590                     );
10591                 });
10592                 OP(br)= NOTHING;
10593                 if (OP(ender) == TAIL) {
10594                     NEXT_OFF(br)= 0;
10595                     RExC_emit= br + 1;
10596                 } else {
10597                     regnode *opt;
10598                     for ( opt= br + 1; opt < ender ; opt++ )
10599                         OP(opt)= OPTIMIZED;
10600                     NEXT_OFF(br)= ender - br;
10601                 }
10602             }
10603         }
10604     }
10605
10606     {
10607         const char *p;
10608         static const char parens[] = "=!<,>";
10609
10610         if (paren && (p = strchr(parens, paren))) {
10611             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10612             int flag = (p - parens) > 1;
10613
10614             if (paren == '>')
10615                 node = SUSPEND, flag = 0;
10616             reginsert(pRExC_state, node,ret, depth+1);
10617             Set_Node_Cur_Length(ret, parse_start);
10618             Set_Node_Offset(ret, parse_start + 1);
10619             ret->flags = flag;
10620             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10621         }
10622     }
10623
10624     /* Check for proper termination. */
10625     if (paren) {
10626         /* restore original flags, but keep (?p) */
10627         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10628         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10629             RExC_parse = oregcomp_parse;
10630             vFAIL("Unmatched (");
10631         }
10632     }
10633     else if (!paren && RExC_parse < RExC_end) {
10634         if (*RExC_parse == ')') {
10635             RExC_parse++;
10636             vFAIL("Unmatched )");
10637         }
10638         else
10639             FAIL("Junk on end of regexp");      /* "Can't happen". */
10640         assert(0); /* NOTREACHED */
10641     }
10642
10643     if (RExC_in_lookbehind) {
10644         RExC_in_lookbehind--;
10645     }
10646     if (after_freeze > RExC_npar)
10647         RExC_npar = after_freeze;
10648     return(ret);
10649 }
10650
10651 /*
10652  - regbranch - one alternative of an | operator
10653  *
10654  * Implements the concatenation operator.
10655  *
10656  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10657  * restarted.
10658  */
10659 STATIC regnode *
10660 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10661 {
10662     regnode *ret;
10663     regnode *chain = NULL;
10664     regnode *latest;
10665     I32 flags = 0, c = 0;
10666     GET_RE_DEBUG_FLAGS_DECL;
10667
10668     PERL_ARGS_ASSERT_REGBRANCH;
10669
10670     DEBUG_PARSE("brnc");
10671
10672     if (first)
10673         ret = NULL;
10674     else {
10675         if (!SIZE_ONLY && RExC_extralen)
10676             ret = reganode(pRExC_state, BRANCHJ,0);
10677         else {
10678             ret = reg_node(pRExC_state, BRANCH);
10679             Set_Node_Length(ret, 1);
10680         }
10681     }
10682
10683     if (!first && SIZE_ONLY)
10684         RExC_extralen += 1;                     /* BRANCHJ */
10685
10686     *flagp = WORST;                     /* Tentatively. */
10687
10688     RExC_parse--;
10689     nextchar(pRExC_state);
10690     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10691         flags &= ~TRYAGAIN;
10692         latest = regpiece(pRExC_state, &flags,depth+1);
10693         if (latest == NULL) {
10694             if (flags & TRYAGAIN)
10695                 continue;
10696             if (flags & RESTART_UTF8) {
10697                 *flagp = RESTART_UTF8;
10698                 return NULL;
10699             }
10700             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10701         }
10702         else if (ret == NULL)
10703             ret = latest;
10704         *flagp |= flags&(HASWIDTH|POSTPONED);
10705         if (chain == NULL)      /* First piece. */
10706             *flagp |= flags&SPSTART;
10707         else {
10708             RExC_naughty++;
10709             REGTAIL(pRExC_state, chain, latest);
10710         }
10711         chain = latest;
10712         c++;
10713     }
10714     if (chain == NULL) {        /* Loop ran zero times. */
10715         chain = reg_node(pRExC_state, NOTHING);
10716         if (ret == NULL)
10717             ret = chain;
10718     }
10719     if (c == 1) {
10720         *flagp |= flags&SIMPLE;
10721     }
10722
10723     return ret;
10724 }
10725
10726 /*
10727  - regpiece - something followed by possible [*+?]
10728  *
10729  * Note that the branching code sequences used for ? and the general cases
10730  * of * and + are somewhat optimized:  they use the same NOTHING node as
10731  * both the endmarker for their branch list and the body of the last branch.
10732  * It might seem that this node could be dispensed with entirely, but the
10733  * endmarker role is not redundant.
10734  *
10735  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10736  * TRYAGAIN.
10737  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10738  * restarted.
10739  */
10740 STATIC regnode *
10741 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10742 {
10743     regnode *ret;
10744     char op;
10745     char *next;
10746     I32 flags;
10747     const char * const origparse = RExC_parse;
10748     I32 min;
10749     I32 max = REG_INFTY;
10750 #ifdef RE_TRACK_PATTERN_OFFSETS
10751     char *parse_start;
10752 #endif
10753     const char *maxpos = NULL;
10754
10755     /* Save the original in case we change the emitted regop to a FAIL. */
10756     regnode * const orig_emit = RExC_emit;
10757
10758     GET_RE_DEBUG_FLAGS_DECL;
10759
10760     PERL_ARGS_ASSERT_REGPIECE;
10761
10762     DEBUG_PARSE("piec");
10763
10764     ret = regatom(pRExC_state, &flags,depth+1);
10765     if (ret == NULL) {
10766         if (flags & (TRYAGAIN|RESTART_UTF8))
10767             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10768         else
10769             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10770         return(NULL);
10771     }
10772
10773     op = *RExC_parse;
10774
10775     if (op == '{' && regcurly(RExC_parse)) {
10776         maxpos = NULL;
10777 #ifdef RE_TRACK_PATTERN_OFFSETS
10778         parse_start = RExC_parse; /* MJD */
10779 #endif
10780         next = RExC_parse + 1;
10781         while (isDIGIT(*next) || *next == ',') {
10782             if (*next == ',') {
10783                 if (maxpos)
10784                     break;
10785                 else
10786                     maxpos = next;
10787             }
10788             next++;
10789         }
10790         if (*next == '}') {             /* got one */
10791             const char* endptr;
10792             if (!maxpos)
10793                 maxpos = next;
10794             RExC_parse++;
10795             min = grok_atou(RExC_parse, &endptr);
10796             if (*maxpos == ',')
10797                 maxpos++;
10798             else
10799                 maxpos = RExC_parse;
10800             max = grok_atou(maxpos, &endptr);
10801             if (!max && *maxpos != '0')
10802                 max = REG_INFTY;                /* meaning "infinity" */
10803             else if (max >= REG_INFTY)
10804                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10805             RExC_parse = next;
10806             nextchar(pRExC_state);
10807             if (max < min) {    /* If can't match, warn and optimize to fail
10808                                    unconditionally */
10809                 if (SIZE_ONLY) {
10810
10811                     /* We can't back off the size because we have to reserve
10812                      * enough space for all the things we are about to throw
10813                      * away, but we can shrink it by the ammount we are about
10814                      * to re-use here */
10815                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10816                 }
10817                 else {
10818                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10819                     RExC_emit = orig_emit;
10820                 }
10821                 ret = reg_node(pRExC_state, OPFAIL);
10822                 return ret;
10823             }
10824             else if (min == max
10825                      && RExC_parse < RExC_end
10826                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10827             {
10828                 if (PASS2) {
10829                     ckWARN2reg(RExC_parse + 1,
10830                                "Useless use of greediness modifier '%c'",
10831                                *RExC_parse);
10832                 }
10833                 /* Absorb the modifier, so later code doesn't see nor use
10834                     * it */
10835                 nextchar(pRExC_state);
10836             }
10837
10838         do_curly:
10839             if ((flags&SIMPLE)) {
10840                 RExC_naughty += 2 + RExC_naughty / 2;
10841                 reginsert(pRExC_state, CURLY, ret, depth+1);
10842                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10843                 Set_Node_Cur_Length(ret, parse_start);
10844             }
10845             else {
10846                 regnode * const w = reg_node(pRExC_state, WHILEM);
10847
10848                 w->flags = 0;
10849                 REGTAIL(pRExC_state, ret, w);
10850                 if (!SIZE_ONLY && RExC_extralen) {
10851                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10852                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10853                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10854                 }
10855                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10856                                 /* MJD hk */
10857                 Set_Node_Offset(ret, parse_start+1);
10858                 Set_Node_Length(ret,
10859                                 op == '{' ? (RExC_parse - parse_start) : 1);
10860
10861                 if (!SIZE_ONLY && RExC_extralen)
10862                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10863                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10864                 if (SIZE_ONLY)
10865                     RExC_whilem_seen++, RExC_extralen += 3;
10866                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10867             }
10868             ret->flags = 0;
10869
10870             if (min > 0)
10871                 *flagp = WORST;
10872             if (max > 0)
10873                 *flagp |= HASWIDTH;
10874             if (!SIZE_ONLY) {
10875                 ARG1_SET(ret, (U16)min);
10876                 ARG2_SET(ret, (U16)max);
10877             }
10878             if (max == REG_INFTY)
10879                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10880
10881             goto nest_check;
10882         }
10883     }
10884
10885     if (!ISMULT1(op)) {
10886         *flagp = flags;
10887         return(ret);
10888     }
10889
10890 #if 0                           /* Now runtime fix should be reliable. */
10891
10892     /* if this is reinstated, don't forget to put this back into perldiag:
10893
10894             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10895
10896            (F) The part of the regexp subject to either the * or + quantifier
10897            could match an empty string. The {#} shows in the regular
10898            expression about where the problem was discovered.
10899
10900     */
10901
10902     if (!(flags&HASWIDTH) && op != '?')
10903       vFAIL("Regexp *+ operand could be empty");
10904 #endif
10905
10906 #ifdef RE_TRACK_PATTERN_OFFSETS
10907     parse_start = RExC_parse;
10908 #endif
10909     nextchar(pRExC_state);
10910
10911     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10912
10913     if (op == '*' && (flags&SIMPLE)) {
10914         reginsert(pRExC_state, STAR, ret, depth+1);
10915         ret->flags = 0;
10916         RExC_naughty += 4;
10917         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10918     }
10919     else if (op == '*') {
10920         min = 0;
10921         goto do_curly;
10922     }
10923     else if (op == '+' && (flags&SIMPLE)) {
10924         reginsert(pRExC_state, PLUS, ret, depth+1);
10925         ret->flags = 0;
10926         RExC_naughty += 3;
10927         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10928     }
10929     else if (op == '+') {
10930         min = 1;
10931         goto do_curly;
10932     }
10933     else if (op == '?') {
10934         min = 0; max = 1;
10935         goto do_curly;
10936     }
10937   nest_check:
10938     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10939         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10940         ckWARN2reg(RExC_parse,
10941                    "%"UTF8f" matches null string many times",
10942                    UTF8fARG(UTF, (RExC_parse >= origparse
10943                                  ? RExC_parse - origparse
10944                                  : 0),
10945                    origparse));
10946         (void)ReREFCNT_inc(RExC_rx_sv);
10947     }
10948
10949     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10950         nextchar(pRExC_state);
10951         reginsert(pRExC_state, MINMOD, ret, depth+1);
10952         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10953     }
10954     else
10955     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10956         regnode *ender;
10957         nextchar(pRExC_state);
10958         ender = reg_node(pRExC_state, SUCCEED);
10959         REGTAIL(pRExC_state, ret, ender);
10960         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10961         ret->flags = 0;
10962         ender = reg_node(pRExC_state, TAIL);
10963         REGTAIL(pRExC_state, ret, ender);
10964     }
10965
10966     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10967         RExC_parse++;
10968         vFAIL("Nested quantifiers");
10969     }
10970
10971     return(ret);
10972 }
10973
10974 STATIC STRLEN
10975 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10976                       UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10977     )
10978 {
10979
10980  /* This is expected to be called by a parser routine that has recognized '\N'
10981    and needs to handle the rest. RExC_parse is expected to point at the first
10982    char following the N at the time of the call.  On successful return,
10983    RExC_parse has been updated to point to just after the sequence identified
10984    by this routine, <*flagp> has been updated, and the non-NULL input pointers
10985    have been set appropriately.
10986
10987    The typical case for this is \N{some character name}.  This is usually
10988    called while parsing the input, filling in or ready to fill in an EXACTish
10989    node, and the code point for the character should be returned, so that it
10990    can be added to the node, and parsing continued with the next input
10991    character.  But it may be that instead of a single character the \N{}
10992    expands to more than one, a named sequence.  In this case any following
10993    quantifier applies to the whole sequence, and it is easier, given the code
10994    structure that calls this, to handle it from a different area of the code.
10995    For this reason, the input parameters can be set so that it returns valid
10996    only on one or the other of these cases.
10997
10998    Another possibility is for the input to be an empty \N{}, which for
10999    backwards compatibility we accept, but generate a NOTHING node which should
11000    later get optimized out.  This is handled from the area of code which can
11001    handle a named sequence, so if called with the parameters for the other, it
11002    fails.
11003
11004    Still another possibility is for the \N to mean [^\n], and not a single
11005    character or explicit sequence at all.  This is determined by context.
11006    Again, this is handled from the area of code which can handle a named
11007    sequence, so if called with the parameters for the other, it also fails.
11008
11009    And the final possibility is for the \N to be called from within a bracketed
11010    character class.  In this case the [^\n] meaning makes no sense, and so is
11011    an error.  Other anomalous situations are left to the calling code to handle.
11012
11013    For non-single-quoted regexes, the tokenizer has attempted to decide which
11014    of the above applies, and in the case of a named sequence, has converted it
11015    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11016    where c1... are the characters in the sequence.  For single-quoted regexes,
11017    the tokenizer passes the \N sequence through unchanged; this code will not
11018    attempt to determine this nor expand those, instead raising a syntax error.
11019    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11020    or there is no '}', it signals that this \N occurrence means to match a
11021    non-newline. (This mostly was done because of [perl #56444].)
11022
11023    The API is somewhat convoluted due to historical and the above reasons.
11024
11025    The function raises an error (via vFAIL), and doesn't return for various
11026    syntax errors.  For other failures, it returns (STRLEN) -1.  For successes,
11027    it returns a count of how many characters were accounted for by it.  (This
11028    can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11029    points in the sequence.  It sets <node_p>, <valuep>, and/or
11030    <substitute_parse> on success.
11031
11032    If <valuep> is non-null, it means the caller can accept an input sequence
11033    consisting of a just a single code point; <*valuep> is set to the value
11034    of the only or first code point in the input.
11035
11036    If <substitute_parse> is non-null, it means the caller can accept an input
11037    sequence consisting of one or more code points; <*substitute_parse> is a
11038    newly created mortal SV* in this case, containing \x{} escapes representing
11039    those code points.
11040
11041    Both <valuep> and <substitute_parse> can be non-NULL.
11042
11043    If <node_p> is non-null, <substitute_parse> must be NULL.  This signifies
11044    that the caller can accept any legal sequence other than a single code
11045    point.  To wit, <*node_p> is set as follows:
11046     1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11047     2) \N{}:              points to a new NOTHING node; return is 0
11048     3) otherwise:         points to a new EXACT node containing the resolved
11049                           string; return is the number of code points in the
11050                           string.  This will never be 1.
11051    Note that failure is returned for single code point sequences if <valuep> is
11052    null and <node_p> is not.
11053  */
11054
11055     char * endbrace;    /* '}' following the name */
11056     char* p;
11057     char *endchar;      /* Points to '.' or '}' ending cur char in the input
11058                            stream */
11059     bool has_multiple_chars; /* true if the input stream contains a sequence of
11060                                 more than one character */
11061     bool in_char_class = substitute_parse != NULL;
11062     STRLEN count = 0;   /* Number of characters in this sequence */
11063
11064     GET_RE_DEBUG_FLAGS_DECL;
11065
11066     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11067
11068     GET_RE_DEBUG_FLAGS;
11069
11070     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
11071     assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11072
11073     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11074      * modifier.  The other meaning does not, so use a temporary until we find
11075      * out which we are being called with */
11076     p = (RExC_flags & RXf_PMf_EXTENDED)
11077         ? regpatws(pRExC_state, RExC_parse,
11078                                 TRUE) /* means recognize comments */
11079         : RExC_parse;
11080
11081     /* Disambiguate between \N meaning a named character versus \N meaning
11082      * [^\n].  The former is assumed when it can't be the latter. */
11083     if (*p != '{' || regcurly(p)) {
11084         RExC_parse = p;
11085         if (! node_p) {
11086             /* no bare \N allowed in a charclass */
11087             if (in_char_class) {
11088                 vFAIL("\\N in a character class must be a named character: \\N{...}");
11089             }
11090             return (STRLEN) -1;
11091         }
11092         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
11093                            current char */
11094         nextchar(pRExC_state);
11095         *node_p = reg_node(pRExC_state, REG_ANY);
11096         *flagp |= HASWIDTH|SIMPLE;
11097         RExC_naughty++;
11098         Set_Node_Length(*node_p, 1); /* MJD */
11099         return 1;
11100     }
11101
11102     /* Here, we have decided it should be a named character or sequence */
11103
11104     /* The test above made sure that the next real character is a '{', but
11105      * under the /x modifier, it could be separated by space (or a comment and
11106      * \n) and this is not allowed (for consistency with \x{...} and the
11107      * tokenizer handling of \N{NAME}). */
11108     if (*RExC_parse != '{') {
11109         vFAIL("Missing braces on \\N{}");
11110     }
11111
11112     RExC_parse++;       /* Skip past the '{' */
11113
11114     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11115         || ! (endbrace == RExC_parse            /* nothing between the {} */
11116               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
11117                                                  */
11118                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
11119                                                      */
11120     {
11121         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
11122         vFAIL("\\N{NAME} must be resolved by the lexer");
11123     }
11124
11125     if (endbrace == RExC_parse) {   /* empty: \N{} */
11126         if (node_p) {
11127             *node_p = reg_node(pRExC_state,NOTHING);
11128         }
11129         else if (! in_char_class) {
11130             return (STRLEN) -1;
11131         }
11132         nextchar(pRExC_state);
11133         return 0;
11134     }
11135
11136     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11137     RExC_parse += 2;    /* Skip past the 'U+' */
11138
11139     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11140
11141     /* Code points are separated by dots.  If none, there is only one code
11142      * point, and is terminated by the brace */
11143     has_multiple_chars = (endchar < endbrace);
11144
11145     /* We get the first code point if we want it, and either there is only one,
11146      * or we can accept both cases of one and more than one */
11147     if (valuep && (substitute_parse || ! has_multiple_chars)) {
11148         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11149         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11150                            | PERL_SCAN_DISALLOW_PREFIX
11151
11152                              /* No errors in the first pass (See [perl
11153                               * #122671].)  We let the code below find the
11154                               * errors when there are multiple chars. */
11155                            | ((SIZE_ONLY || has_multiple_chars)
11156                               ? PERL_SCAN_SILENT_ILLDIGIT
11157                               : 0);
11158
11159         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11160
11161         /* The tokenizer should have guaranteed validity, but it's possible to
11162          * bypass it by using single quoting, so check.  Don't do the check
11163          * here when there are multiple chars; we do it below anyway. */
11164         if (! has_multiple_chars) {
11165             if (length_of_hex == 0
11166                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11167             {
11168                 RExC_parse += length_of_hex;    /* Includes all the valid */
11169                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
11170                                 ? UTF8SKIP(RExC_parse)
11171                                 : 1;
11172                 /* Guard against malformed utf8 */
11173                 if (RExC_parse >= endchar) {
11174                     RExC_parse = endchar;
11175                 }
11176                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11177             }
11178
11179             RExC_parse = endbrace + 1;
11180             return 1;
11181         }
11182     }
11183
11184     /* Here, we should have already handled the case where a single character
11185      * is expected and found.  So it is a failure if we aren't expecting
11186      * multiple chars and got them; or didn't get them but wanted them.  We
11187      * fail without advancing the parse, so that the caller can try again with
11188      * different acceptance criteria */
11189     if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11190         RExC_parse = p;
11191         return (STRLEN) -1;
11192     }
11193
11194     {
11195
11196         /* What is done here is to convert this to a sub-pattern of the form
11197          * \x{char1}\x{char2}...
11198          * and then either return it in <*substitute_parse> if non-null; or
11199          * call reg recursively to parse it (enclosing in "(?: ... )" ).  That
11200          * way, it retains its atomicness, while not having to worry about
11201          * special handling that some code points may have.  toke.c has
11202          * converted the original Unicode values to native, so that we can just
11203          * pass on the hex values unchanged.  We do have to set a flag to keep
11204          * recoding from happening in the recursion */
11205
11206         SV * dummy = NULL;
11207         STRLEN len;
11208         char *orig_end = RExC_end;
11209         I32 flags;
11210
11211         if (substitute_parse) {
11212             *substitute_parse = newSVpvs("");
11213         }
11214         else {
11215             substitute_parse = &dummy;
11216             *substitute_parse = newSVpvs("?:");
11217         }
11218         *substitute_parse = sv_2mortal(*substitute_parse);
11219
11220         while (RExC_parse < endbrace) {
11221
11222             /* Convert to notation the rest of the code understands */
11223             sv_catpv(*substitute_parse, "\\x{");
11224             sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11225             sv_catpv(*substitute_parse, "}");
11226
11227             /* Point to the beginning of the next character in the sequence. */
11228             RExC_parse = endchar + 1;
11229             endchar = RExC_parse + strcspn(RExC_parse, ".}");
11230
11231             count++;
11232         }
11233         if (! in_char_class) {
11234             sv_catpv(*substitute_parse, ")");
11235         }
11236
11237         RExC_parse = SvPV(*substitute_parse, len);
11238
11239         /* Don't allow empty number */
11240         if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11241             RExC_parse = endbrace;
11242             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11243         }
11244         RExC_end = RExC_parse + len;
11245
11246         /* The values are Unicode, and therefore not subject to recoding */
11247         RExC_override_recoding = 1;
11248
11249         if (node_p) {
11250             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11251                 if (flags & RESTART_UTF8) {
11252                     *flagp = RESTART_UTF8;
11253                     return (STRLEN) -1;
11254                 }
11255                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11256                     (UV) flags);
11257             }
11258             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11259         }
11260
11261         RExC_parse = endbrace;
11262         RExC_end = orig_end;
11263         RExC_override_recoding = 0;
11264
11265         nextchar(pRExC_state);
11266     }
11267
11268     return count;
11269 }
11270
11271
11272 /*
11273  * reg_recode
11274  *
11275  * It returns the code point in utf8 for the value in *encp.
11276  *    value: a code value in the source encoding
11277  *    encp:  a pointer to an Encode object
11278  *
11279  * If the result from Encode is not a single character,
11280  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11281  */
11282 STATIC UV
11283 S_reg_recode(pTHX_ const char value, SV **encp)
11284 {
11285     STRLEN numlen = 1;
11286     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11287     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11288     const STRLEN newlen = SvCUR(sv);
11289     UV uv = UNICODE_REPLACEMENT;
11290
11291     PERL_ARGS_ASSERT_REG_RECODE;
11292
11293     if (newlen)
11294         uv = SvUTF8(sv)
11295              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11296              : *(U8*)s;
11297
11298     if (!newlen || numlen != newlen) {
11299         uv = UNICODE_REPLACEMENT;
11300         *encp = NULL;
11301     }
11302     return uv;
11303 }
11304
11305 PERL_STATIC_INLINE U8
11306 S_compute_EXACTish(RExC_state_t *pRExC_state)
11307 {
11308     U8 op;
11309
11310     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11311
11312     if (! FOLD) {
11313         return EXACT;
11314     }
11315
11316     op = get_regex_charset(RExC_flags);
11317     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11318         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11319                  been, so there is no hole */
11320     }
11321
11322     return op + EXACTF;
11323 }
11324
11325 PERL_STATIC_INLINE void
11326 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11327                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11328                          bool downgradable)
11329 {
11330     /* This knows the details about sizing an EXACTish node, setting flags for
11331      * it (by setting <*flagp>, and potentially populating it with a single
11332      * character.
11333      *
11334      * If <len> (the length in bytes) is non-zero, this function assumes that
11335      * the node has already been populated, and just does the sizing.  In this
11336      * case <code_point> should be the final code point that has already been
11337      * placed into the node.  This value will be ignored except that under some
11338      * circumstances <*flagp> is set based on it.
11339      *
11340      * If <len> is zero, the function assumes that the node is to contain only
11341      * the single character given by <code_point> and calculates what <len>
11342      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11343      * additionally will populate the node's STRING with <code_point> or its
11344      * fold if folding.
11345      *
11346      * In both cases <*flagp> is appropriately set
11347      *
11348      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11349      * 255, must be folded (the former only when the rules indicate it can
11350      * match 'ss')
11351      *
11352      * When it does the populating, it looks at the flag 'downgradable'.  If
11353      * true with a node that folds, it checks if the single code point
11354      * participates in a fold, and if not downgrades the node to an EXACT.
11355      * This helps the optimizer */
11356
11357     bool len_passed_in = cBOOL(len != 0);
11358     U8 character[UTF8_MAXBYTES_CASE+1];
11359
11360     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11361
11362     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11363      * sizing difference, and is extra work that is thrown away */
11364     if (downgradable && ! PASS2) {
11365         downgradable = FALSE;
11366     }
11367
11368     if (! len_passed_in) {
11369         if (UTF) {
11370             if (UVCHR_IS_INVARIANT(code_point)) {
11371                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11372                     *character = (U8) code_point;
11373                 }
11374                 else { /* Here is /i and not /l. (toFOLD() is defined on just
11375                           ASCII, which isn't the same thing as INVARIANT on
11376                           EBCDIC, but it works there, as the extra invariants
11377                           fold to themselves) */
11378                     *character = toFOLD((U8) code_point);
11379
11380                     /* We can downgrade to an EXACT node if this character
11381                      * isn't a folding one.  Note that this assumes that
11382                      * nothing above Latin1 folds to some other invariant than
11383                      * one of these alphabetics; otherwise we would also have
11384                      * to check:
11385                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11386                      *      || ASCII_FOLD_RESTRICTED))
11387                      */
11388                     if (downgradable && PL_fold[code_point] == code_point) {
11389                         OP(node) = EXACT;
11390                     }
11391                 }
11392                 len = 1;
11393             }
11394             else if (FOLD && (! LOC
11395                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11396             {   /* Folding, and ok to do so now */
11397                 UV folded = _to_uni_fold_flags(
11398                                    code_point,
11399                                    character,
11400                                    &len,
11401                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11402                                                       ? FOLD_FLAGS_NOMIX_ASCII
11403                                                       : 0));
11404                 if (downgradable
11405                     && folded == code_point /* This quickly rules out many
11406                                                cases, avoiding the
11407                                                _invlist_contains_cp() overhead
11408                                                for those.  */
11409                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11410                 {
11411                     OP(node) = EXACT;
11412                 }
11413             }
11414             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11415
11416                 /* Not folding this cp, and can output it directly */
11417                 *character = UTF8_TWO_BYTE_HI(code_point);
11418                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11419                 len = 2;
11420             }
11421             else {
11422                 uvchr_to_utf8( character, code_point);
11423                 len = UTF8SKIP(character);
11424             }
11425         } /* Else pattern isn't UTF8.  */
11426         else if (! FOLD) {
11427             *character = (U8) code_point;
11428             len = 1;
11429         } /* Else is folded non-UTF8 */
11430         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11431
11432             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11433              * comments at join_exact()); */
11434             *character = (U8) code_point;
11435             len = 1;
11436
11437             /* Can turn into an EXACT node if we know the fold at compile time,
11438              * and it folds to itself and doesn't particpate in other folds */
11439             if (downgradable
11440                 && ! LOC
11441                 && PL_fold_latin1[code_point] == code_point
11442                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11443                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11444             {
11445                 OP(node) = EXACT;
11446             }
11447         } /* else is Sharp s.  May need to fold it */
11448         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11449             *character = 's';
11450             *(character + 1) = 's';
11451             len = 2;
11452         }
11453         else {
11454             *character = LATIN_SMALL_LETTER_SHARP_S;
11455             len = 1;
11456         }
11457     }
11458
11459     if (SIZE_ONLY) {
11460         RExC_size += STR_SZ(len);
11461     }
11462     else {
11463         RExC_emit += STR_SZ(len);
11464         STR_LEN(node) = len;
11465         if (! len_passed_in) {
11466             Copy((char *) character, STRING(node), len, char);
11467         }
11468     }
11469
11470     *flagp |= HASWIDTH;
11471
11472     /* A single character node is SIMPLE, except for the special-cased SHARP S
11473      * under /di. */
11474     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11475         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11476             || ! FOLD || ! DEPENDS_SEMANTICS))
11477     {
11478         *flagp |= SIMPLE;
11479     }
11480
11481     /* The OP may not be well defined in PASS1 */
11482     if (PASS2 && OP(node) == EXACTFL) {
11483         RExC_contains_locale = 1;
11484     }
11485 }
11486
11487
11488 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11489  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11490
11491 static I32
11492 S_backref_value(char *p)
11493 {
11494     const char* endptr;
11495     UV val = grok_atou(p, &endptr);
11496     if (endptr == p || endptr == NULL || val > I32_MAX)
11497         return I32_MAX;
11498     return (I32)val;
11499 }
11500
11501
11502 /*
11503  - regatom - the lowest level
11504
11505    Try to identify anything special at the start of the pattern. If there
11506    is, then handle it as required. This may involve generating a single regop,
11507    such as for an assertion; or it may involve recursing, such as to
11508    handle a () structure.
11509
11510    If the string doesn't start with something special then we gobble up
11511    as much literal text as we can.
11512
11513    Once we have been able to handle whatever type of thing started the
11514    sequence, we return.
11515
11516    Note: we have to be careful with escapes, as they can be both literal
11517    and special, and in the case of \10 and friends, context determines which.
11518
11519    A summary of the code structure is:
11520
11521    switch (first_byte) {
11522         cases for each special:
11523             handle this special;
11524             break;
11525         case '\\':
11526             switch (2nd byte) {
11527                 cases for each unambiguous special:
11528                     handle this special;
11529                     break;
11530                 cases for each ambigous special/literal:
11531                     disambiguate;
11532                     if (special)  handle here
11533                     else goto defchar;
11534                 default: // unambiguously literal:
11535                     goto defchar;
11536             }
11537         default:  // is a literal char
11538             // FALL THROUGH
11539         defchar:
11540             create EXACTish node for literal;
11541             while (more input and node isn't full) {
11542                 switch (input_byte) {
11543                    cases for each special;
11544                        make sure parse pointer is set so that the next call to
11545                            regatom will see this special first
11546                        goto loopdone; // EXACTish node terminated by prev. char
11547                    default:
11548                        append char to EXACTISH node;
11549                 }
11550                 get next input byte;
11551             }
11552         loopdone:
11553    }
11554    return the generated node;
11555
11556    Specifically there are two separate switches for handling
11557    escape sequences, with the one for handling literal escapes requiring
11558    a dummy entry for all of the special escapes that are actually handled
11559    by the other.
11560
11561    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11562    TRYAGAIN.
11563    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11564    restarted.
11565    Otherwise does not return NULL.
11566 */
11567
11568 STATIC regnode *
11569 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11570 {
11571     regnode *ret = NULL;
11572     I32 flags = 0;
11573     char *parse_start = RExC_parse;
11574     U8 op;
11575     int invert = 0;
11576     U8 arg;
11577
11578     GET_RE_DEBUG_FLAGS_DECL;
11579
11580     *flagp = WORST;             /* Tentatively. */
11581
11582     DEBUG_PARSE("atom");
11583
11584     PERL_ARGS_ASSERT_REGATOM;
11585
11586 tryagain:
11587     switch ((U8)*RExC_parse) {
11588     case '^':
11589         RExC_seen_zerolen++;
11590         nextchar(pRExC_state);
11591         if (RExC_flags & RXf_PMf_MULTILINE)
11592             ret = reg_node(pRExC_state, MBOL);
11593         else
11594             ret = reg_node(pRExC_state, SBOL);
11595         Set_Node_Length(ret, 1); /* MJD */
11596         break;
11597     case '$':
11598         nextchar(pRExC_state);
11599         if (*RExC_parse)
11600             RExC_seen_zerolen++;
11601         if (RExC_flags & RXf_PMf_MULTILINE)
11602             ret = reg_node(pRExC_state, MEOL);
11603         else
11604             ret = reg_node(pRExC_state, SEOL);
11605         Set_Node_Length(ret, 1); /* MJD */
11606         break;
11607     case '.':
11608         nextchar(pRExC_state);
11609         if (RExC_flags & RXf_PMf_SINGLELINE)
11610             ret = reg_node(pRExC_state, SANY);
11611         else
11612             ret = reg_node(pRExC_state, REG_ANY);
11613         *flagp |= HASWIDTH|SIMPLE;
11614         RExC_naughty++;
11615         Set_Node_Length(ret, 1); /* MJD */
11616         break;
11617     case '[':
11618     {
11619         char * const oregcomp_parse = ++RExC_parse;
11620         ret = regclass(pRExC_state, flagp,depth+1,
11621                        FALSE, /* means parse the whole char class */
11622                        TRUE, /* allow multi-char folds */
11623                        FALSE, /* don't silence non-portable warnings. */
11624                        NULL);
11625         if (*RExC_parse != ']') {
11626             RExC_parse = oregcomp_parse;
11627             vFAIL("Unmatched [");
11628         }
11629         if (ret == NULL) {
11630             if (*flagp & RESTART_UTF8)
11631                 return NULL;
11632             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11633                   (UV) *flagp);
11634         }
11635         nextchar(pRExC_state);
11636         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11637         break;
11638     }
11639     case '(':
11640         nextchar(pRExC_state);
11641         ret = reg(pRExC_state, 2, &flags,depth+1);
11642         if (ret == NULL) {
11643                 if (flags & TRYAGAIN) {
11644                     if (RExC_parse == RExC_end) {
11645                          /* Make parent create an empty node if needed. */
11646                         *flagp |= TRYAGAIN;
11647                         return(NULL);
11648                     }
11649                     goto tryagain;
11650                 }
11651                 if (flags & RESTART_UTF8) {
11652                     *flagp = RESTART_UTF8;
11653                     return NULL;
11654                 }
11655                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11656                                                                  (UV) flags);
11657         }
11658         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11659         break;
11660     case '|':
11661     case ')':
11662         if (flags & TRYAGAIN) {
11663             *flagp |= TRYAGAIN;
11664             return NULL;
11665         }
11666         vFAIL("Internal urp");
11667                                 /* Supposed to be caught earlier. */
11668         break;
11669     case '?':
11670     case '+':
11671     case '*':
11672         RExC_parse++;
11673         vFAIL("Quantifier follows nothing");
11674         break;
11675     case '\\':
11676         /* Special Escapes
11677
11678            This switch handles escape sequences that resolve to some kind
11679            of special regop and not to literal text. Escape sequnces that
11680            resolve to literal text are handled below in the switch marked
11681            "Literal Escapes".
11682
11683            Every entry in this switch *must* have a corresponding entry
11684            in the literal escape switch. However, the opposite is not
11685            required, as the default for this switch is to jump to the
11686            literal text handling code.
11687         */
11688         switch ((U8)*++RExC_parse) {
11689         /* Special Escapes */
11690         case 'A':
11691             RExC_seen_zerolen++;
11692             ret = reg_node(pRExC_state, SBOL);
11693             /* SBOL is shared with /^/ so we set the flags so we can tell
11694              * /\A/ from /^/ in split. We check ret because first pass we
11695              * have no regop struct to set the flags on. */
11696             if (PASS2)
11697                 ret->flags = 1;
11698             *flagp |= SIMPLE;
11699             goto finish_meta_pat;
11700         case 'G':
11701             ret = reg_node(pRExC_state, GPOS);
11702             RExC_seen |= REG_GPOS_SEEN;
11703             *flagp |= SIMPLE;
11704             goto finish_meta_pat;
11705         case 'K':
11706             RExC_seen_zerolen++;
11707             ret = reg_node(pRExC_state, KEEPS);
11708             *flagp |= SIMPLE;
11709             /* XXX:dmq : disabling in-place substitution seems to
11710              * be necessary here to avoid cases of memory corruption, as
11711              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11712              */
11713             RExC_seen |= REG_LOOKBEHIND_SEEN;
11714             goto finish_meta_pat;
11715         case 'Z':
11716             ret = reg_node(pRExC_state, SEOL);
11717             *flagp |= SIMPLE;
11718             RExC_seen_zerolen++;                /* Do not optimize RE away */
11719             goto finish_meta_pat;
11720         case 'z':
11721             ret = reg_node(pRExC_state, EOS);
11722             *flagp |= SIMPLE;
11723             RExC_seen_zerolen++;                /* Do not optimize RE away */
11724             goto finish_meta_pat;
11725         case 'C':
11726             ret = reg_node(pRExC_state, CANY);
11727             RExC_seen |= REG_CANY_SEEN;
11728             *flagp |= HASWIDTH|SIMPLE;
11729             if (PASS2) {
11730                 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11731             }
11732             goto finish_meta_pat;
11733         case 'X':
11734             ret = reg_node(pRExC_state, CLUMP);
11735             *flagp |= HASWIDTH;
11736             goto finish_meta_pat;
11737
11738         case 'W':
11739             invert = 1;
11740             /* FALLTHROUGH */
11741         case 'w':
11742             arg = ANYOF_WORDCHAR;
11743             goto join_posix;
11744
11745         case 'b':
11746             RExC_seen_zerolen++;
11747             RExC_seen |= REG_LOOKBEHIND_SEEN;
11748             op = BOUND + get_regex_charset(RExC_flags);
11749             if (op > BOUNDA) {  /* /aa is same as /a */
11750                 op = BOUNDA;
11751             }
11752             else if (op == BOUNDL) {
11753                 RExC_contains_locale = 1;
11754             }
11755             ret = reg_node(pRExC_state, op);
11756             FLAGS(ret) = get_regex_charset(RExC_flags);
11757             *flagp |= SIMPLE;
11758             if ((U8) *(RExC_parse + 1) == '{') {
11759                 /* diag_listed_as: Use "%s" instead of "%s" */
11760                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11761             }
11762             goto finish_meta_pat;
11763         case 'B':
11764             RExC_seen_zerolen++;
11765             RExC_seen |= REG_LOOKBEHIND_SEEN;
11766             op = NBOUND + get_regex_charset(RExC_flags);
11767             if (op > NBOUNDA) { /* /aa is same as /a */
11768                 op = NBOUNDA;
11769             }
11770             else if (op == NBOUNDL) {
11771                 RExC_contains_locale = 1;
11772             }
11773             ret = reg_node(pRExC_state, op);
11774             FLAGS(ret) = get_regex_charset(RExC_flags);
11775             *flagp |= SIMPLE;
11776             if ((U8) *(RExC_parse + 1) == '{') {
11777                 /* diag_listed_as: Use "%s" instead of "%s" */
11778                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11779             }
11780             goto finish_meta_pat;
11781
11782         case 'D':
11783             invert = 1;
11784             /* FALLTHROUGH */
11785         case 'd':
11786             arg = ANYOF_DIGIT;
11787             goto join_posix;
11788
11789         case 'R':
11790             ret = reg_node(pRExC_state, LNBREAK);
11791             *flagp |= HASWIDTH|SIMPLE;
11792             goto finish_meta_pat;
11793
11794         case 'H':
11795             invert = 1;
11796             /* FALLTHROUGH */
11797         case 'h':
11798             arg = ANYOF_BLANK;
11799             op = POSIXU;
11800             goto join_posix_op_known;
11801
11802         case 'V':
11803             invert = 1;
11804             /* FALLTHROUGH */
11805         case 'v':
11806             arg = ANYOF_VERTWS;
11807             op = POSIXU;
11808             goto join_posix_op_known;
11809
11810         case 'S':
11811             invert = 1;
11812             /* FALLTHROUGH */
11813         case 's':
11814             arg = ANYOF_SPACE;
11815
11816         join_posix:
11817
11818             op = POSIXD + get_regex_charset(RExC_flags);
11819             if (op > POSIXA) {  /* /aa is same as /a */
11820                 op = POSIXA;
11821             }
11822             else if (op == POSIXL) {
11823                 RExC_contains_locale = 1;
11824             }
11825
11826         join_posix_op_known:
11827
11828             if (invert) {
11829                 op += NPOSIXD - POSIXD;
11830             }
11831
11832             ret = reg_node(pRExC_state, op);
11833             if (! SIZE_ONLY) {
11834                 FLAGS(ret) = namedclass_to_classnum(arg);
11835             }
11836
11837             *flagp |= HASWIDTH|SIMPLE;
11838             /* FALLTHROUGH */
11839
11840          finish_meta_pat:
11841             nextchar(pRExC_state);
11842             Set_Node_Length(ret, 2); /* MJD */
11843             break;
11844         case 'p':
11845         case 'P':
11846             {
11847 #ifdef DEBUGGING
11848                 char* parse_start = RExC_parse - 2;
11849 #endif
11850
11851                 RExC_parse--;
11852
11853                 ret = regclass(pRExC_state, flagp,depth+1,
11854                                TRUE, /* means just parse this element */
11855                                FALSE, /* don't allow multi-char folds */
11856                                FALSE, /* don't silence non-portable warnings.
11857                                          It would be a bug if these returned
11858                                          non-portables */
11859                                NULL);
11860                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11861                    are allowed.  */
11862                 if (!ret)
11863                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11864                           (UV) *flagp);
11865
11866                 RExC_parse--;
11867
11868                 Set_Node_Offset(ret, parse_start + 2);
11869                 Set_Node_Cur_Length(ret, parse_start);
11870                 nextchar(pRExC_state);
11871             }
11872             break;
11873         case 'N':
11874             /* Handle \N and \N{NAME} with multiple code points here and not
11875              * below because it can be multicharacter. join_exact() will join
11876              * them up later on.  Also this makes sure that things like
11877              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11878              * The options to the grok function call causes it to fail if the
11879              * sequence is just a single code point.  We then go treat it as
11880              * just another character in the current EXACT node, and hence it
11881              * gets uniform treatment with all the other characters.  The
11882              * special treatment for quantifiers is not needed for such single
11883              * character sequences */
11884             ++RExC_parse;
11885             if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11886                                              depth, FALSE))
11887             {
11888                 if (*flagp & RESTART_UTF8)
11889                     return NULL;
11890                 RExC_parse--;
11891                 goto defchar;
11892             }
11893             break;
11894         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11895         parse_named_seq:
11896         {
11897             char ch= RExC_parse[1];
11898             if (ch != '<' && ch != '\'' && ch != '{') {
11899                 RExC_parse++;
11900                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11901                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11902             } else {
11903                 /* this pretty much dupes the code for (?P=...) in reg(), if
11904                    you change this make sure you change that */
11905                 char* name_start = (RExC_parse += 2);
11906                 U32 num = 0;
11907                 SV *sv_dat = reg_scan_name(pRExC_state,
11908                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11909                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11910                 if (RExC_parse == name_start || *RExC_parse != ch)
11911                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11912                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11913
11914                 if (!SIZE_ONLY) {
11915                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11916                     RExC_rxi->data->data[num]=(void*)sv_dat;
11917                     SvREFCNT_inc_simple_void(sv_dat);
11918                 }
11919
11920                 RExC_sawback = 1;
11921                 ret = reganode(pRExC_state,
11922                                ((! FOLD)
11923                                  ? NREF
11924                                  : (ASCII_FOLD_RESTRICTED)
11925                                    ? NREFFA
11926                                    : (AT_LEAST_UNI_SEMANTICS)
11927                                      ? NREFFU
11928                                      : (LOC)
11929                                        ? NREFFL
11930                                        : NREFF),
11931                                 num);
11932                 *flagp |= HASWIDTH;
11933
11934                 /* override incorrect value set in reganode MJD */
11935                 Set_Node_Offset(ret, parse_start+1);
11936                 Set_Node_Cur_Length(ret, parse_start);
11937                 nextchar(pRExC_state);
11938
11939             }
11940             break;
11941         }
11942         case 'g':
11943         case '1': case '2': case '3': case '4':
11944         case '5': case '6': case '7': case '8': case '9':
11945             {
11946                 I32 num;
11947                 bool hasbrace = 0;
11948
11949                 if (*RExC_parse == 'g') {
11950                     bool isrel = 0;
11951
11952                     RExC_parse++;
11953                     if (*RExC_parse == '{') {
11954                         RExC_parse++;
11955                         hasbrace = 1;
11956                     }
11957                     if (*RExC_parse == '-') {
11958                         RExC_parse++;
11959                         isrel = 1;
11960                     }
11961                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11962                         if (isrel) RExC_parse--;
11963                         RExC_parse -= 2;
11964                         goto parse_named_seq;
11965                     }
11966
11967                     num = S_backref_value(RExC_parse);
11968                     if (num == 0)
11969                         vFAIL("Reference to invalid group 0");
11970                     else if (num == I32_MAX) {
11971                          if (isDIGIT(*RExC_parse))
11972                             vFAIL("Reference to nonexistent group");
11973                         else
11974                             vFAIL("Unterminated \\g... pattern");
11975                     }
11976
11977                     if (isrel) {
11978                         num = RExC_npar - num;
11979                         if (num < 1)
11980                             vFAIL("Reference to nonexistent or unclosed group");
11981                     }
11982                 }
11983                 else {
11984                     num = S_backref_value(RExC_parse);
11985                     /* bare \NNN might be backref or octal - if it is larger than or equal
11986                      * RExC_npar then it is assumed to be and octal escape.
11987                      * Note RExC_npar is +1 from the actual number of parens*/
11988                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11989                             && *RExC_parse != '8' && *RExC_parse != '9'))
11990                     {
11991                         /* Probably a character specified in octal, e.g. \35 */
11992                         goto defchar;
11993                     }
11994                 }
11995
11996                 /* at this point RExC_parse definitely points to a backref
11997                  * number */
11998                 {
11999 #ifdef RE_TRACK_PATTERN_OFFSETS
12000                     char * const parse_start = RExC_parse - 1; /* MJD */
12001 #endif
12002                     while (isDIGIT(*RExC_parse))
12003                         RExC_parse++;
12004                     if (hasbrace) {
12005                         if (*RExC_parse != '}')
12006                             vFAIL("Unterminated \\g{...} pattern");
12007                         RExC_parse++;
12008                     }
12009                     if (!SIZE_ONLY) {
12010                         if (num > (I32)RExC_rx->nparens)
12011                             vFAIL("Reference to nonexistent group");
12012                     }
12013                     RExC_sawback = 1;
12014                     ret = reganode(pRExC_state,
12015                                    ((! FOLD)
12016                                      ? REF
12017                                      : (ASCII_FOLD_RESTRICTED)
12018                                        ? REFFA
12019                                        : (AT_LEAST_UNI_SEMANTICS)
12020                                          ? REFFU
12021                                          : (LOC)
12022                                            ? REFFL
12023                                            : REFF),
12024                                     num);
12025                     *flagp |= HASWIDTH;
12026
12027                     /* override incorrect value set in reganode MJD */
12028                     Set_Node_Offset(ret, parse_start+1);
12029                     Set_Node_Cur_Length(ret, parse_start);
12030                     RExC_parse--;
12031                     nextchar(pRExC_state);
12032                 }
12033             }
12034             break;
12035         case '\0':
12036             if (RExC_parse >= RExC_end)
12037                 FAIL("Trailing \\");
12038             /* FALLTHROUGH */
12039         default:
12040             /* Do not generate "unrecognized" warnings here, we fall
12041                back into the quick-grab loop below */
12042             parse_start--;
12043             goto defchar;
12044         }
12045         break;
12046
12047     case '#':
12048         if (RExC_flags & RXf_PMf_EXTENDED) {
12049             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12050             if (RExC_parse < RExC_end)
12051                 goto tryagain;
12052         }
12053         /* FALLTHROUGH */
12054
12055     default:
12056
12057             parse_start = RExC_parse - 1;
12058
12059             RExC_parse++;
12060
12061         defchar: {
12062             STRLEN len = 0;
12063             UV ender = 0;
12064             char *p;
12065             char *s;
12066 #define MAX_NODE_STRING_SIZE 127
12067             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12068             char *s0;
12069             U8 upper_parse = MAX_NODE_STRING_SIZE;
12070             U8 node_type = compute_EXACTish(pRExC_state);
12071             bool next_is_quantifier;
12072             char * oldp = NULL;
12073
12074             /* We can convert EXACTF nodes to EXACTFU if they contain only
12075              * characters that match identically regardless of the target
12076              * string's UTF8ness.  The reason to do this is that EXACTF is not
12077              * trie-able, EXACTFU is.
12078              *
12079              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12080              * contain only above-Latin1 characters (hence must be in UTF8),
12081              * which don't participate in folds with Latin1-range characters,
12082              * as the latter's folds aren't known until runtime.  (We don't
12083              * need to figure this out until pass 2) */
12084             bool maybe_exactfu = PASS2
12085                                && (node_type == EXACTF || node_type == EXACTFL);
12086
12087             /* If a folding node contains only code points that don't
12088              * participate in folds, it can be changed into an EXACT node,
12089              * which allows the optimizer more things to look for */
12090             bool maybe_exact;
12091
12092             ret = reg_node(pRExC_state, node_type);
12093
12094             /* In pass1, folded, we use a temporary buffer instead of the
12095              * actual node, as the node doesn't exist yet */
12096             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12097
12098             s0 = s;
12099
12100         reparse:
12101
12102             /* We do the EXACTFish to EXACT node only if folding.  (And we
12103              * don't need to figure this out until pass 2) */
12104             maybe_exact = FOLD && PASS2;
12105
12106             /* XXX The node can hold up to 255 bytes, yet this only goes to
12107              * 127.  I (khw) do not know why.  Keeping it somewhat less than
12108              * 255 allows us to not have to worry about overflow due to
12109              * converting to utf8 and fold expansion, but that value is
12110              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12111              * split up by this limit into a single one using the real max of
12112              * 255.  Even at 127, this breaks under rare circumstances.  If
12113              * folding, we do not want to split a node at a character that is a
12114              * non-final in a multi-char fold, as an input string could just
12115              * happen to want to match across the node boundary.  The join
12116              * would solve that problem if the join actually happens.  But a
12117              * series of more than two nodes in a row each of 127 would cause
12118              * the first join to succeed to get to 254, but then there wouldn't
12119              * be room for the next one, which could at be one of those split
12120              * multi-char folds.  I don't know of any fool-proof solution.  One
12121              * could back off to end with only a code point that isn't such a
12122              * non-final, but it is possible for there not to be any in the
12123              * entire node. */
12124             for (p = RExC_parse - 1;
12125                  len < upper_parse && p < RExC_end;
12126                  len++)
12127             {
12128                 oldp = p;
12129
12130                 if (RExC_flags & RXf_PMf_EXTENDED)
12131                     p = regpatws(pRExC_state, p,
12132                                           TRUE); /* means recognize comments */
12133                 switch ((U8)*p) {
12134                 case '^':
12135                 case '$':
12136                 case '.':
12137                 case '[':
12138                 case '(':
12139                 case ')':
12140                 case '|':
12141                     goto loopdone;
12142                 case '\\':
12143                     /* Literal Escapes Switch
12144
12145                        This switch is meant to handle escape sequences that
12146                        resolve to a literal character.
12147
12148                        Every escape sequence that represents something
12149                        else, like an assertion or a char class, is handled
12150                        in the switch marked 'Special Escapes' above in this
12151                        routine, but also has an entry here as anything that
12152                        isn't explicitly mentioned here will be treated as
12153                        an unescaped equivalent literal.
12154                     */
12155
12156                     switch ((U8)*++p) {
12157                     /* These are all the special escapes. */
12158                     case 'A':             /* Start assertion */
12159                     case 'b': case 'B':   /* Word-boundary assertion*/
12160                     case 'C':             /* Single char !DANGEROUS! */
12161                     case 'd': case 'D':   /* digit class */
12162                     case 'g': case 'G':   /* generic-backref, pos assertion */
12163                     case 'h': case 'H':   /* HORIZWS */
12164                     case 'k': case 'K':   /* named backref, keep marker */
12165                     case 'p': case 'P':   /* Unicode property */
12166                               case 'R':   /* LNBREAK */
12167                     case 's': case 'S':   /* space class */
12168                     case 'v': case 'V':   /* VERTWS */
12169                     case 'w': case 'W':   /* word class */
12170                     case 'X':             /* eXtended Unicode "combining
12171                                              character sequence" */
12172                     case 'z': case 'Z':   /* End of line/string assertion */
12173                         --p;
12174                         goto loopdone;
12175
12176                     /* Anything after here is an escape that resolves to a
12177                        literal. (Except digits, which may or may not)
12178                      */
12179                     case 'n':
12180                         ender = '\n';
12181                         p++;
12182                         break;
12183                     case 'N': /* Handle a single-code point named character. */
12184                         /* The options cause it to fail if a multiple code
12185                          * point sequence.  Handle those in the switch() above
12186                          * */
12187                         RExC_parse = p + 1;
12188                         if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12189                                                          &ender,
12190                                                          flagp,
12191                                                          depth,
12192                                                          FALSE
12193                         )) {
12194                             if (*flagp & RESTART_UTF8)
12195                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12196                             RExC_parse = p = oldp;
12197                             goto loopdone;
12198                         }
12199                         p = RExC_parse;
12200                         if (ender > 0xff) {
12201                             REQUIRE_UTF8;
12202                         }
12203                         break;
12204                     case 'r':
12205                         ender = '\r';
12206                         p++;
12207                         break;
12208                     case 't':
12209                         ender = '\t';
12210                         p++;
12211                         break;
12212                     case 'f':
12213                         ender = '\f';
12214                         p++;
12215                         break;
12216                     case 'e':
12217                         ender = ESC_NATIVE;
12218                         p++;
12219                         break;
12220                     case 'a':
12221                         ender = '\a';
12222                         p++;
12223                         break;
12224                     case 'o':
12225                         {
12226                             UV result;
12227                             const char* error_msg;
12228
12229                             bool valid = grok_bslash_o(&p,
12230                                                        &result,
12231                                                        &error_msg,
12232                                                        PASS2, /* out warnings */
12233                                                        FALSE, /* not strict */
12234                                                        TRUE, /* Output warnings
12235                                                                 for non-
12236                                                                 portables */
12237                                                        UTF);
12238                             if (! valid) {
12239                                 RExC_parse = p; /* going to die anyway; point
12240                                                    to exact spot of failure */
12241                                 vFAIL(error_msg);
12242                             }
12243                             ender = result;
12244                             if (PL_encoding && ender < 0x100) {
12245                                 goto recode_encoding;
12246                             }
12247                             if (ender > 0xff) {
12248                                 REQUIRE_UTF8;
12249                             }
12250                             break;
12251                         }
12252                     case 'x':
12253                         {
12254                             UV result = UV_MAX; /* initialize to erroneous
12255                                                    value */
12256                             const char* error_msg;
12257
12258                             bool valid = grok_bslash_x(&p,
12259                                                        &result,
12260                                                        &error_msg,
12261                                                        PASS2, /* out warnings */
12262                                                        FALSE, /* not strict */
12263                                                        TRUE, /* Output warnings
12264                                                                 for non-
12265                                                                 portables */
12266                                                        UTF);
12267                             if (! valid) {
12268                                 RExC_parse = p; /* going to die anyway; point
12269                                                    to exact spot of failure */
12270                                 vFAIL(error_msg);
12271                             }
12272                             ender = result;
12273
12274                             if (PL_encoding && ender < 0x100) {
12275                                 goto recode_encoding;
12276                             }
12277                             if (ender > 0xff) {
12278                                 REQUIRE_UTF8;
12279                             }
12280                             break;
12281                         }
12282                     case 'c':
12283                         p++;
12284                         ender = grok_bslash_c(*p++, PASS2);
12285                         break;
12286                     case '8': case '9': /* must be a backreference */
12287                         --p;
12288                         goto loopdone;
12289                     case '1': case '2': case '3':case '4':
12290                     case '5': case '6': case '7':
12291                         /* When we parse backslash escapes there is ambiguity
12292                          * between backreferences and octal escapes. Any escape
12293                          * from \1 - \9 is a backreference, any multi-digit
12294                          * escape which does not start with 0 and which when
12295                          * evaluated as decimal could refer to an already
12296                          * parsed capture buffer is a backslash. Anything else
12297                          * is octal.
12298                          *
12299                          * Note this implies that \118 could be interpreted as
12300                          * 118 OR as "\11" . "8" depending on whether there
12301                          * were 118 capture buffers defined already in the
12302                          * pattern.  */
12303
12304                         /* NOTE, RExC_npar is 1 more than the actual number of
12305                          * parens we have seen so far, hence the < RExC_npar below. */
12306
12307                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12308                         {  /* Not to be treated as an octal constant, go
12309                                    find backref */
12310                             --p;
12311                             goto loopdone;
12312                         }
12313                         /* FALLTHROUGH */
12314                     case '0':
12315                         {
12316                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12317                             STRLEN numlen = 3;
12318                             ender = grok_oct(p, &numlen, &flags, NULL);
12319                             if (ender > 0xff) {
12320                                 REQUIRE_UTF8;
12321                             }
12322                             p += numlen;
12323                             if (PASS2   /* like \08, \178 */
12324                                 && numlen < 3
12325                                 && p < RExC_end
12326                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12327                             {
12328                                 reg_warn_non_literal_string(
12329                                          p + 1,
12330                                          form_short_octal_warning(p, numlen));
12331                             }
12332                         }
12333                         if (PL_encoding && ender < 0x100)
12334                             goto recode_encoding;
12335                         break;
12336                     recode_encoding:
12337                         if (! RExC_override_recoding) {
12338                             SV* enc = PL_encoding;
12339                             ender = reg_recode((const char)(U8)ender, &enc);
12340                             if (!enc && PASS2)
12341                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12342                             REQUIRE_UTF8;
12343                         }
12344                         break;
12345                     case '\0':
12346                         if (p >= RExC_end)
12347                             FAIL("Trailing \\");
12348                         /* FALLTHROUGH */
12349                     default:
12350                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12351                             /* Include any { following the alpha to emphasize
12352                              * that it could be part of an escape at some point
12353                              * in the future */
12354                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12355                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12356                         }
12357                         goto normal_default;
12358                     } /* End of switch on '\' */
12359                     break;
12360                 case '{':
12361                     /* Currently we don't warn when the lbrace is at the start
12362                      * of a construct.  This catches it in the middle of a
12363                      * literal string, or when its the first thing after
12364                      * something like "\b" */
12365                     if (! SIZE_ONLY
12366                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12367                     {
12368                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12369                     }
12370                     /*FALLTHROUGH*/
12371                 default:    /* A literal character */
12372                   normal_default:
12373                     if (UTF8_IS_START(*p) && UTF) {
12374                         STRLEN numlen;
12375                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12376                                                &numlen, UTF8_ALLOW_DEFAULT);
12377                         p += numlen;
12378                     }
12379                     else
12380                         ender = (U8) *p++;
12381                     break;
12382                 } /* End of switch on the literal */
12383
12384                 /* Here, have looked at the literal character and <ender>
12385                  * contains its ordinal, <p> points to the character after it
12386                  */
12387
12388                 if ( RExC_flags & RXf_PMf_EXTENDED)
12389                     p = regpatws(pRExC_state, p,
12390                                           TRUE); /* means recognize comments */
12391
12392                 /* If the next thing is a quantifier, it applies to this
12393                  * character only, which means that this character has to be in
12394                  * its own node and can't just be appended to the string in an
12395                  * existing node, so if there are already other characters in
12396                  * the node, close the node with just them, and set up to do
12397                  * this character again next time through, when it will be the
12398                  * only thing in its new node */
12399                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12400                 {
12401                     p = oldp;
12402                     goto loopdone;
12403                 }
12404
12405                 if (! FOLD   /* The simple case, just append the literal */
12406                     || (LOC  /* Also don't fold for tricky chars under /l */
12407                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12408                 {
12409                     if (UTF) {
12410                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12411                         if (unilen > 0) {
12412                            s   += unilen;
12413                            len += unilen;
12414                         }
12415
12416                         /* The loop increments <len> each time, as all but this
12417                          * path (and one other) through it add a single byte to
12418                          * the EXACTish node.  But this one has changed len to
12419                          * be the correct final value, so subtract one to
12420                          * cancel out the increment that follows */
12421                         len--;
12422                     }
12423                     else {
12424                         REGC((char)ender, s++);
12425                     }
12426
12427                     /* Can get here if folding only if is one of the /l
12428                      * characters whose fold depends on the locale.  The
12429                      * occurrence of any of these indicate that we can't
12430                      * simplify things */
12431                     if (FOLD) {
12432                         maybe_exact = FALSE;
12433                         maybe_exactfu = FALSE;
12434                     }
12435                 }
12436                 else             /* FOLD */
12437                      if (! ( UTF
12438                         /* See comments for join_exact() as to why we fold this
12439                          * non-UTF at compile time */
12440                         || (node_type == EXACTFU
12441                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12442                 {
12443                     /* Here, are folding and are not UTF-8 encoded; therefore
12444                      * the character must be in the range 0-255, and is not /l
12445                      * (Not /l because we already handled these under /l in
12446                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12447                     if (IS_IN_SOME_FOLD_L1(ender)) {
12448                         maybe_exact = FALSE;
12449
12450                         /* See if the character's fold differs between /d and
12451                          * /u.  This includes the multi-char fold SHARP S to
12452                          * 'ss' */
12453                         if (maybe_exactfu
12454                             && (PL_fold[ender] != PL_fold_latin1[ender]
12455                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12456                                 || (len > 0
12457                                    && isALPHA_FOLD_EQ(ender, 's')
12458                                    && isALPHA_FOLD_EQ(*(s-1), 's'))))
12459                         {
12460                             maybe_exactfu = FALSE;
12461                         }
12462                     }
12463
12464                     /* Even when folding, we store just the input character, as
12465                      * we have an array that finds its fold quickly */
12466                     *(s++) = (char) ender;
12467                 }
12468                 else {  /* FOLD and UTF */
12469                     /* Unlike the non-fold case, we do actually have to
12470                      * calculate the results here in pass 1.  This is for two
12471                      * reasons, the folded length may be longer than the
12472                      * unfolded, and we have to calculate how many EXACTish
12473                      * nodes it will take; and we may run out of room in a node
12474                      * in the middle of a potential multi-char fold, and have
12475                      * to back off accordingly.  (Hence we can't use REGC for
12476                      * the simple case just below.) */
12477
12478                     UV folded;
12479                     if (isASCII_uni(ender)) {
12480                         folded = toFOLD(ender);
12481                         *(s)++ = (U8) folded;
12482                     }
12483                     else {
12484                         STRLEN foldlen;
12485
12486                         folded = _to_uni_fold_flags(
12487                                      ender,
12488                                      (U8 *) s,
12489                                      &foldlen,
12490                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12491                                                         ? FOLD_FLAGS_NOMIX_ASCII
12492                                                         : 0));
12493                         s += foldlen;
12494
12495                         /* The loop increments <len> each time, as all but this
12496                          * path (and one other) through it add a single byte to
12497                          * the EXACTish node.  But this one has changed len to
12498                          * be the correct final value, so subtract one to
12499                          * cancel out the increment that follows */
12500                         len += foldlen - 1;
12501                     }
12502                     /* If this node only contains non-folding code points so
12503                      * far, see if this new one is also non-folding */
12504                     if (maybe_exact) {
12505                         if (folded != ender) {
12506                             maybe_exact = FALSE;
12507                         }
12508                         else {
12509                             /* Here the fold is the original; we have to check
12510                              * further to see if anything folds to it */
12511                             if (_invlist_contains_cp(PL_utf8_foldable,
12512                                                         ender))
12513                             {
12514                                 maybe_exact = FALSE;
12515                             }
12516                         }
12517                     }
12518                     ender = folded;
12519                 }
12520
12521                 if (next_is_quantifier) {
12522
12523                     /* Here, the next input is a quantifier, and to get here,
12524                      * the current character is the only one in the node.
12525                      * Also, here <len> doesn't include the final byte for this
12526                      * character */
12527                     len++;
12528                     goto loopdone;
12529                 }
12530
12531             } /* End of loop through literal characters */
12532
12533             /* Here we have either exhausted the input or ran out of room in
12534              * the node.  (If we encountered a character that can't be in the
12535              * node, transfer is made directly to <loopdone>, and so we
12536              * wouldn't have fallen off the end of the loop.)  In the latter
12537              * case, we artificially have to split the node into two, because
12538              * we just don't have enough space to hold everything.  This
12539              * creates a problem if the final character participates in a
12540              * multi-character fold in the non-final position, as a match that
12541              * should have occurred won't, due to the way nodes are matched,
12542              * and our artificial boundary.  So back off until we find a non-
12543              * problematic character -- one that isn't at the beginning or
12544              * middle of such a fold.  (Either it doesn't participate in any
12545              * folds, or appears only in the final position of all the folds it
12546              * does participate in.)  A better solution with far fewer false
12547              * positives, and that would fill the nodes more completely, would
12548              * be to actually have available all the multi-character folds to
12549              * test against, and to back-off only far enough to be sure that
12550              * this node isn't ending with a partial one.  <upper_parse> is set
12551              * further below (if we need to reparse the node) to include just
12552              * up through that final non-problematic character that this code
12553              * identifies, so when it is set to less than the full node, we can
12554              * skip the rest of this */
12555             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12556
12557                 const STRLEN full_len = len;
12558
12559                 assert(len >= MAX_NODE_STRING_SIZE);
12560
12561                 /* Here, <s> points to the final byte of the final character.
12562                  * Look backwards through the string until find a non-
12563                  * problematic character */
12564
12565                 if (! UTF) {
12566
12567                     /* This has no multi-char folds to non-UTF characters */
12568                     if (ASCII_FOLD_RESTRICTED) {
12569                         goto loopdone;
12570                     }
12571
12572                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12573                     len = s - s0 + 1;
12574                 }
12575                 else {
12576                     if (!  PL_NonL1NonFinalFold) {
12577                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12578                                         NonL1_Perl_Non_Final_Folds_invlist);
12579                     }
12580
12581                     /* Point to the first byte of the final character */
12582                     s = (char *) utf8_hop((U8 *) s, -1);
12583
12584                     while (s >= s0) {   /* Search backwards until find
12585                                            non-problematic char */
12586                         if (UTF8_IS_INVARIANT(*s)) {
12587
12588                             /* There are no ascii characters that participate
12589                              * in multi-char folds under /aa.  In EBCDIC, the
12590                              * non-ascii invariants are all control characters,
12591                              * so don't ever participate in any folds. */
12592                             if (ASCII_FOLD_RESTRICTED
12593                                 || ! IS_NON_FINAL_FOLD(*s))
12594                             {
12595                                 break;
12596                             }
12597                         }
12598                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12599                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12600                                                                   *s, *(s+1))))
12601                             {
12602                                 break;
12603                             }
12604                         }
12605                         else if (! _invlist_contains_cp(
12606                                         PL_NonL1NonFinalFold,
12607                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12608                         {
12609                             break;
12610                         }
12611
12612                         /* Here, the current character is problematic in that
12613                          * it does occur in the non-final position of some
12614                          * fold, so try the character before it, but have to
12615                          * special case the very first byte in the string, so
12616                          * we don't read outside the string */
12617                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12618                     } /* End of loop backwards through the string */
12619
12620                     /* If there were only problematic characters in the string,
12621                      * <s> will point to before s0, in which case the length
12622                      * should be 0, otherwise include the length of the
12623                      * non-problematic character just found */
12624                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12625                 }
12626
12627                 /* Here, have found the final character, if any, that is
12628                  * non-problematic as far as ending the node without splitting
12629                  * it across a potential multi-char fold.  <len> contains the
12630                  * number of bytes in the node up-to and including that
12631                  * character, or is 0 if there is no such character, meaning
12632                  * the whole node contains only problematic characters.  In
12633                  * this case, give up and just take the node as-is.  We can't
12634                  * do any better */
12635                 if (len == 0) {
12636                     len = full_len;
12637
12638                     /* If the node ends in an 's' we make sure it stays EXACTF,
12639                      * as if it turns into an EXACTFU, it could later get
12640                      * joined with another 's' that would then wrongly match
12641                      * the sharp s */
12642                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12643                     {
12644                         maybe_exactfu = FALSE;
12645                     }
12646                 } else {
12647
12648                     /* Here, the node does contain some characters that aren't
12649                      * problematic.  If one such is the final character in the
12650                      * node, we are done */
12651                     if (len == full_len) {
12652                         goto loopdone;
12653                     }
12654                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12655
12656                         /* If the final character is problematic, but the
12657                          * penultimate is not, back-off that last character to
12658                          * later start a new node with it */
12659                         p = oldp;
12660                         goto loopdone;
12661                     }
12662
12663                     /* Here, the final non-problematic character is earlier
12664                      * in the input than the penultimate character.  What we do
12665                      * is reparse from the beginning, going up only as far as
12666                      * this final ok one, thus guaranteeing that the node ends
12667                      * in an acceptable character.  The reason we reparse is
12668                      * that we know how far in the character is, but we don't
12669                      * know how to correlate its position with the input parse.
12670                      * An alternate implementation would be to build that
12671                      * correlation as we go along during the original parse,
12672                      * but that would entail extra work for every node, whereas
12673                      * this code gets executed only when the string is too
12674                      * large for the node, and the final two characters are
12675                      * problematic, an infrequent occurrence.  Yet another
12676                      * possible strategy would be to save the tail of the
12677                      * string, and the next time regatom is called, initialize
12678                      * with that.  The problem with this is that unless you
12679                      * back off one more character, you won't be guaranteed
12680                      * regatom will get called again, unless regbranch,
12681                      * regpiece ... are also changed.  If you do back off that
12682                      * extra character, so that there is input guaranteed to
12683                      * force calling regatom, you can't handle the case where
12684                      * just the first character in the node is acceptable.  I
12685                      * (khw) decided to try this method which doesn't have that
12686                      * pitfall; if performance issues are found, we can do a
12687                      * combination of the current approach plus that one */
12688                     upper_parse = len;
12689                     len = 0;
12690                     s = s0;
12691                     goto reparse;
12692                 }
12693             }   /* End of verifying node ends with an appropriate char */
12694
12695         loopdone:   /* Jumped to when encounters something that shouldn't be in
12696                        the node */
12697
12698             /* I (khw) don't know if you can get here with zero length, but the
12699              * old code handled this situation by creating a zero-length EXACT
12700              * node.  Might as well be NOTHING instead */
12701             if (len == 0) {
12702                 OP(ret) = NOTHING;
12703             }
12704             else {
12705                 if (FOLD) {
12706                     /* If 'maybe_exact' is still set here, means there are no
12707                      * code points in the node that participate in folds;
12708                      * similarly for 'maybe_exactfu' and code points that match
12709                      * differently depending on UTF8ness of the target string
12710                      * (for /u), or depending on locale for /l */
12711                     if (maybe_exact) {
12712                         OP(ret) = EXACT;
12713                     }
12714                     else if (maybe_exactfu) {
12715                         OP(ret) = EXACTFU;
12716                     }
12717                 }
12718                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12719                                            FALSE /* Don't look to see if could
12720                                                     be turned into an EXACT
12721                                                     node, as we have already
12722                                                     computed that */
12723                                           );
12724             }
12725
12726             RExC_parse = p - 1;
12727             Set_Node_Cur_Length(ret, parse_start);
12728             nextchar(pRExC_state);
12729             {
12730                 /* len is STRLEN which is unsigned, need to copy to signed */
12731                 IV iv = len;
12732                 if (iv < 0)
12733                     vFAIL("Internal disaster");
12734             }
12735
12736         } /* End of label 'defchar:' */
12737         break;
12738     } /* End of giant switch on input character */
12739
12740     return(ret);
12741 }
12742
12743 STATIC char *
12744 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12745 {
12746     /* Returns the next non-pattern-white space, non-comment character (the
12747      * latter only if 'recognize_comment is true) in the string p, which is
12748      * ended by RExC_end.  See also reg_skipcomment */
12749     const char *e = RExC_end;
12750
12751     PERL_ARGS_ASSERT_REGPATWS;
12752
12753     while (p < e) {
12754         STRLEN len;
12755         if ((len = is_PATWS_safe(p, e, UTF))) {
12756             p += len;
12757         }
12758         else if (recognize_comment && *p == '#') {
12759             p = reg_skipcomment(pRExC_state, p);
12760         }
12761         else
12762             break;
12763     }
12764     return p;
12765 }
12766
12767 STATIC void
12768 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12769 {
12770     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12771      * sets up the bitmap and any flags, removing those code points from the
12772      * inversion list, setting it to NULL should it become completely empty */
12773
12774     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12775     assert(PL_regkind[OP(node)] == ANYOF);
12776
12777     ANYOF_BITMAP_ZERO(node);
12778     if (*invlist_ptr) {
12779
12780         /* This gets set if we actually need to modify things */
12781         bool change_invlist = FALSE;
12782
12783         UV start, end;
12784
12785         /* Start looking through *invlist_ptr */
12786         invlist_iterinit(*invlist_ptr);
12787         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12788             UV high;
12789             int i;
12790
12791             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12792                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12793             }
12794             else if (end >= NUM_ANYOF_CODE_POINTS) {
12795                 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12796             }
12797
12798             /* Quit if are above what we should change */
12799             if (start >= NUM_ANYOF_CODE_POINTS) {
12800                 break;
12801             }
12802
12803             change_invlist = TRUE;
12804
12805             /* Set all the bits in the range, up to the max that we are doing */
12806             high = (end < NUM_ANYOF_CODE_POINTS - 1)
12807                    ? end
12808                    : NUM_ANYOF_CODE_POINTS - 1;
12809             for (i = start; i <= (int) high; i++) {
12810                 if (! ANYOF_BITMAP_TEST(node, i)) {
12811                     ANYOF_BITMAP_SET(node, i);
12812                 }
12813             }
12814         }
12815         invlist_iterfinish(*invlist_ptr);
12816
12817         /* Done with loop; remove any code points that are in the bitmap from
12818          * *invlist_ptr; similarly for code points above the bitmap if we have
12819          * a flag to match all of them anyways */
12820         if (change_invlist) {
12821             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12822         }
12823         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12824             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12825         }
12826
12827         /* If have completely emptied it, remove it completely */
12828         if (_invlist_len(*invlist_ptr) == 0) {
12829             SvREFCNT_dec_NN(*invlist_ptr);
12830             *invlist_ptr = NULL;
12831         }
12832     }
12833 }
12834
12835 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12836    Character classes ([:foo:]) can also be negated ([:^foo:]).
12837    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12838    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12839    but trigger failures because they are currently unimplemented. */
12840
12841 #define POSIXCC_DONE(c)   ((c) == ':')
12842 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12843 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12844
12845 PERL_STATIC_INLINE I32
12846 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12847 {
12848     I32 namedclass = OOB_NAMEDCLASS;
12849
12850     PERL_ARGS_ASSERT_REGPPOSIXCC;
12851
12852     if (value == '[' && RExC_parse + 1 < RExC_end &&
12853         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12854         POSIXCC(UCHARAT(RExC_parse)))
12855     {
12856         const char c = UCHARAT(RExC_parse);
12857         char* const s = RExC_parse++;
12858
12859         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12860             RExC_parse++;
12861         if (RExC_parse == RExC_end) {
12862             if (strict) {
12863
12864                 /* Try to give a better location for the error (than the end of
12865                  * the string) by looking for the matching ']' */
12866                 RExC_parse = s;
12867                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12868                     RExC_parse++;
12869                 }
12870                 vFAIL2("Unmatched '%c' in POSIX class", c);
12871             }
12872             /* Grandfather lone [:, [=, [. */
12873             RExC_parse = s;
12874         }
12875         else {
12876             const char* const t = RExC_parse++; /* skip over the c */
12877             assert(*t == c);
12878
12879             if (UCHARAT(RExC_parse) == ']') {
12880                 const char *posixcc = s + 1;
12881                 RExC_parse++; /* skip over the ending ] */
12882
12883                 if (*s == ':') {
12884                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12885                     const I32 skip = t - posixcc;
12886
12887                     /* Initially switch on the length of the name.  */
12888                     switch (skip) {
12889                     case 4:
12890                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12891                                                           this is the Perl \w
12892                                                         */
12893                             namedclass = ANYOF_WORDCHAR;
12894                         break;
12895                     case 5:
12896                         /* Names all of length 5.  */
12897                         /* alnum alpha ascii blank cntrl digit graph lower
12898                            print punct space upper  */
12899                         /* Offset 4 gives the best switch position.  */
12900                         switch (posixcc[4]) {
12901                         case 'a':
12902                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12903                                 namedclass = ANYOF_ALPHA;
12904                             break;
12905                         case 'e':
12906                             if (memEQ(posixcc, "spac", 4)) /* space */
12907                                 namedclass = ANYOF_PSXSPC;
12908                             break;
12909                         case 'h':
12910                             if (memEQ(posixcc, "grap", 4)) /* graph */
12911                                 namedclass = ANYOF_GRAPH;
12912                             break;
12913                         case 'i':
12914                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12915                                 namedclass = ANYOF_ASCII;
12916                             break;
12917                         case 'k':
12918                             if (memEQ(posixcc, "blan", 4)) /* blank */
12919                                 namedclass = ANYOF_BLANK;
12920                             break;
12921                         case 'l':
12922                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12923                                 namedclass = ANYOF_CNTRL;
12924                             break;
12925                         case 'm':
12926                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12927                                 namedclass = ANYOF_ALPHANUMERIC;
12928                             break;
12929                         case 'r':
12930                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12931                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12932                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12933                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12934                             break;
12935                         case 't':
12936                             if (memEQ(posixcc, "digi", 4)) /* digit */
12937                                 namedclass = ANYOF_DIGIT;
12938                             else if (memEQ(posixcc, "prin", 4)) /* print */
12939                                 namedclass = ANYOF_PRINT;
12940                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12941                                 namedclass = ANYOF_PUNCT;
12942                             break;
12943                         }
12944                         break;
12945                     case 6:
12946                         if (memEQ(posixcc, "xdigit", 6))
12947                             namedclass = ANYOF_XDIGIT;
12948                         break;
12949                     }
12950
12951                     if (namedclass == OOB_NAMEDCLASS)
12952                         vFAIL2utf8f(
12953                             "POSIX class [:%"UTF8f":] unknown",
12954                             UTF8fARG(UTF, t - s - 1, s + 1));
12955
12956                     /* The #defines are structured so each complement is +1 to
12957                      * the normal one */
12958                     if (complement) {
12959                         namedclass++;
12960                     }
12961                     assert (posixcc[skip] == ':');
12962                     assert (posixcc[skip+1] == ']');
12963                 } else if (!SIZE_ONLY) {
12964                     /* [[=foo=]] and [[.foo.]] are still future. */
12965
12966                     /* adjust RExC_parse so the warning shows after
12967                        the class closes */
12968                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12969                         RExC_parse++;
12970                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12971                 }
12972             } else {
12973                 /* Maternal grandfather:
12974                  * "[:" ending in ":" but not in ":]" */
12975                 if (strict) {
12976                     vFAIL("Unmatched '[' in POSIX class");
12977                 }
12978
12979                 /* Grandfather lone [:, [=, [. */
12980                 RExC_parse = s;
12981             }
12982         }
12983     }
12984
12985     return namedclass;
12986 }
12987
12988 STATIC bool
12989 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12990 {
12991     /* This applies some heuristics at the current parse position (which should
12992      * be at a '[') to see if what follows might be intended to be a [:posix:]
12993      * class.  It returns true if it really is a posix class, of course, but it
12994      * also can return true if it thinks that what was intended was a posix
12995      * class that didn't quite make it.
12996      *
12997      * It will return true for
12998      *      [:alphanumerics:
12999      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
13000      *                         ')' indicating the end of the (?[
13001      *      [:any garbage including %^&$ punctuation:]
13002      *
13003      * This is designed to be called only from S_handle_regex_sets; it could be
13004      * easily adapted to be called from the spot at the beginning of regclass()
13005      * that checks to see in a normal bracketed class if the surrounding []
13006      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
13007      * change long-standing behavior, so I (khw) didn't do that */
13008     char* p = RExC_parse + 1;
13009     char first_char = *p;
13010
13011     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13012
13013     assert(*(p - 1) == '[');
13014
13015     if (! POSIXCC(first_char)) {
13016         return FALSE;
13017     }
13018
13019     p++;
13020     while (p < RExC_end && isWORDCHAR(*p)) p++;
13021
13022     if (p >= RExC_end) {
13023         return FALSE;
13024     }
13025
13026     if (p - RExC_parse > 2    /* Got at least 1 word character */
13027         && (*p == first_char
13028             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13029     {
13030         return TRUE;
13031     }
13032
13033     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13034
13035     return (p
13036             && p - RExC_parse > 2 /* [:] evaluates to colon;
13037                                       [::] is a bad posix class. */
13038             && first_char == *(p - 1));
13039 }
13040
13041 STATIC regnode *
13042 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13043                     I32 *flagp, U32 depth,
13044                     char * const oregcomp_parse)
13045 {
13046     /* Handle the (?[...]) construct to do set operations */
13047
13048     U8 curchar;
13049     UV start, end;      /* End points of code point ranges */
13050     SV* result_string;
13051     char *save_end, *save_parse;
13052     SV* final;
13053     STRLEN len;
13054     regnode* node;
13055     AV* stack;
13056     const bool save_fold = FOLD;
13057
13058     GET_RE_DEBUG_FLAGS_DECL;
13059
13060     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13061
13062     if (LOC) {
13063         vFAIL("(?[...]) not valid in locale");
13064     }
13065     RExC_uni_semantics = 1;
13066
13067     /* This will return only an ANYOF regnode, or (unlikely) something smaller
13068      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
13069      * call regclass to handle '[]' so as to not have to reinvent its parsing
13070      * rules here (throwing away the size it computes each time).  And, we exit
13071      * upon an unescaped ']' that isn't one ending a regclass.  To do both
13072      * these things, we need to realize that something preceded by a backslash
13073      * is escaped, so we have to keep track of backslashes */
13074     if (PASS2) {
13075         Perl_ck_warner_d(aTHX_
13076             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13077             "The regex_sets feature is experimental" REPORT_LOCATION,
13078                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13079                 UTF8fARG(UTF,
13080                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13081                          RExC_precomp + (RExC_parse - RExC_precomp)));
13082     }
13083     else {
13084         UV depth = 0; /* how many nested (?[...]) constructs */
13085
13086         while (RExC_parse < RExC_end) {
13087             SV* current = NULL;
13088             RExC_parse = regpatws(pRExC_state, RExC_parse,
13089                                           TRUE); /* means recognize comments */
13090             switch (*RExC_parse) {
13091                 case '?':
13092                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
13093                     /* FALLTHROUGH */
13094                 default:
13095                     break;
13096                 case '\\':
13097                     /* Skip the next byte (which could cause us to end up in
13098                      * the middle of a UTF-8 character, but since none of those
13099                      * are confusable with anything we currently handle in this
13100                      * switch (invariants all), it's safe.  We'll just hit the
13101                      * default: case next time and keep on incrementing until
13102                      * we find one of the invariants we do handle. */
13103                     RExC_parse++;
13104                     break;
13105                 case '[':
13106                 {
13107                     /* If this looks like it is a [:posix:] class, leave the
13108                      * parse pointer at the '[' to fool regclass() into
13109                      * thinking it is part of a '[[:posix:]]'.  That function
13110                      * will use strict checking to force a syntax error if it
13111                      * doesn't work out to a legitimate class */
13112                     bool is_posix_class
13113                                     = could_it_be_a_POSIX_class(pRExC_state);
13114                     if (! is_posix_class) {
13115                         RExC_parse++;
13116                     }
13117
13118                     /* regclass() can only return RESTART_UTF8 if multi-char
13119                        folds are allowed.  */
13120                     if (!regclass(pRExC_state, flagp,depth+1,
13121                                   is_posix_class, /* parse the whole char
13122                                                      class only if not a
13123                                                      posix class */
13124                                   FALSE, /* don't allow multi-char folds */
13125                                   TRUE, /* silence non-portable warnings. */
13126                                   &current))
13127                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13128                               (UV) *flagp);
13129
13130                     /* function call leaves parse pointing to the ']', except
13131                      * if we faked it */
13132                     if (is_posix_class) {
13133                         RExC_parse--;
13134                     }
13135
13136                     SvREFCNT_dec(current);   /* In case it returned something */
13137                     break;
13138                 }
13139
13140                 case ']':
13141                     if (depth--) break;
13142                     RExC_parse++;
13143                     if (RExC_parse < RExC_end
13144                         && *RExC_parse == ')')
13145                     {
13146                         node = reganode(pRExC_state, ANYOF, 0);
13147                         RExC_size += ANYOF_SKIP;
13148                         nextchar(pRExC_state);
13149                         Set_Node_Length(node,
13150                                 RExC_parse - oregcomp_parse + 1); /* MJD */
13151                         return node;
13152                     }
13153                     goto no_close;
13154             }
13155             RExC_parse++;
13156         }
13157
13158         no_close:
13159         FAIL("Syntax error in (?[...])");
13160     }
13161
13162     /* Pass 2 only after this.  Everything in this construct is a
13163      * metacharacter.  Operands begin with either a '\' (for an escape
13164      * sequence), or a '[' for a bracketed character class.  Any other
13165      * character should be an operator, or parenthesis for grouping.  Both
13166      * types of operands are handled by calling regclass() to parse them.  It
13167      * is called with a parameter to indicate to return the computed inversion
13168      * list.  The parsing here is implemented via a stack.  Each entry on the
13169      * stack is a single character representing one of the operators, or the
13170      * '('; or else a pointer to an operand inversion list. */
13171
13172 #define IS_OPERAND(a)  (! SvIOK(a))
13173
13174     /* The stack starts empty.  It is a syntax error if the first thing parsed
13175      * is a binary operator; everything else is pushed on the stack.  When an
13176      * operand is parsed, the top of the stack is examined.  If it is a binary
13177      * operator, the item before it should be an operand, and both are replaced
13178      * by the result of doing that operation on the new operand and the one on
13179      * the stack.   Thus a sequence of binary operands is reduced to a single
13180      * one before the next one is parsed.
13181      *
13182      * A unary operator may immediately follow a binary in the input, for
13183      * example
13184      *      [a] + ! [b]
13185      * When an operand is parsed and the top of the stack is a unary operator,
13186      * the operation is performed, and then the stack is rechecked to see if
13187      * this new operand is part of a binary operation; if so, it is handled as
13188      * above.
13189      *
13190      * A '(' is simply pushed on the stack; it is valid only if the stack is
13191      * empty, or the top element of the stack is an operator or another '('
13192      * (for which the parenthesized expression will become an operand).  By the
13193      * time the corresponding ')' is parsed everything in between should have
13194      * been parsed and evaluated to a single operand (or else is a syntax
13195      * error), and is handled as a regular operand */
13196
13197     sv_2mortal((SV *)(stack = newAV()));
13198
13199     while (RExC_parse < RExC_end) {
13200         I32 top_index = av_tindex(stack);
13201         SV** top_ptr;
13202         SV* current = NULL;
13203
13204         /* Skip white space */
13205         RExC_parse = regpatws(pRExC_state, RExC_parse,
13206                                          TRUE /* means recognize comments */ );
13207         if (RExC_parse >= RExC_end) {
13208             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13209         }
13210         if ((curchar = UCHARAT(RExC_parse)) == ']') {
13211             break;
13212         }
13213
13214         switch (curchar) {
13215
13216             case '?':
13217                 if (av_tindex(stack) >= 0   /* This makes sure that we can
13218                                                safely subtract 1 from
13219                                                RExC_parse in the next clause.
13220                                                If we have something on the
13221                                                stack, we have parsed something
13222                                              */
13223                     && UCHARAT(RExC_parse - 1) == '('
13224                     && RExC_parse < RExC_end)
13225                 {
13226                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13227                      * This happens when we have some thing like
13228                      *
13229                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13230                      *   ...
13231                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13232                      *
13233                      * Here we would be handling the interpolated
13234                      * '$thai_or_lao'.  We handle this by a recursive call to
13235                      * ourselves which returns the inversion list the
13236                      * interpolated expression evaluates to.  We use the flags
13237                      * from the interpolated pattern. */
13238                     U32 save_flags = RExC_flags;
13239                     const char * const save_parse = ++RExC_parse;
13240
13241                     parse_lparen_question_flags(pRExC_state);
13242
13243                     if (RExC_parse == save_parse  /* Makes sure there was at
13244                                                      least one flag (or this
13245                                                      embedding wasn't compiled)
13246                                                    */
13247                         || RExC_parse >= RExC_end - 4
13248                         || UCHARAT(RExC_parse) != ':'
13249                         || UCHARAT(++RExC_parse) != '('
13250                         || UCHARAT(++RExC_parse) != '?'
13251                         || UCHARAT(++RExC_parse) != '[')
13252                     {
13253
13254                         /* In combination with the above, this moves the
13255                          * pointer to the point just after the first erroneous
13256                          * character (or if there are no flags, to where they
13257                          * should have been) */
13258                         if (RExC_parse >= RExC_end - 4) {
13259                             RExC_parse = RExC_end;
13260                         }
13261                         else if (RExC_parse != save_parse) {
13262                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13263                         }
13264                         vFAIL("Expecting '(?flags:(?[...'");
13265                     }
13266                     RExC_parse++;
13267                     (void) handle_regex_sets(pRExC_state, &current, flagp,
13268                                                     depth+1, oregcomp_parse);
13269
13270                     /* Here, 'current' contains the embedded expression's
13271                      * inversion list, and RExC_parse points to the trailing
13272                      * ']'; the next character should be the ')' which will be
13273                      * paired with the '(' that has been put on the stack, so
13274                      * the whole embedded expression reduces to '(operand)' */
13275                     RExC_parse++;
13276
13277                     RExC_flags = save_flags;
13278                     goto handle_operand;
13279                 }
13280                 /* FALLTHROUGH */
13281
13282             default:
13283                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13284                 vFAIL("Unexpected character");
13285
13286             case '\\':
13287                 /* regclass() can only return RESTART_UTF8 if multi-char
13288                    folds are allowed.  */
13289                 if (!regclass(pRExC_state, flagp,depth+1,
13290                               TRUE, /* means parse just the next thing */
13291                               FALSE, /* don't allow multi-char folds */
13292                               FALSE, /* don't silence non-portable warnings.  */
13293                               &current))
13294                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13295                           (UV) *flagp);
13296                 /* regclass() will return with parsing just the \ sequence,
13297                  * leaving the parse pointer at the next thing to parse */
13298                 RExC_parse--;
13299                 goto handle_operand;
13300
13301             case '[':   /* Is a bracketed character class */
13302             {
13303                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13304
13305                 if (! is_posix_class) {
13306                     RExC_parse++;
13307                 }
13308
13309                 /* regclass() can only return RESTART_UTF8 if multi-char
13310                    folds are allowed.  */
13311                 if(!regclass(pRExC_state, flagp,depth+1,
13312                              is_posix_class, /* parse the whole char class
13313                                                 only if not a posix class */
13314                              FALSE, /* don't allow multi-char folds */
13315                              FALSE, /* don't silence non-portable warnings.  */
13316                              &current))
13317                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13318                           (UV) *flagp);
13319                 /* function call leaves parse pointing to the ']', except if we
13320                  * faked it */
13321                 if (is_posix_class) {
13322                     RExC_parse--;
13323                 }
13324
13325                 goto handle_operand;
13326             }
13327
13328             case '&':
13329             case '|':
13330             case '+':
13331             case '-':
13332             case '^':
13333                 if (top_index < 0
13334                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13335                     || ! IS_OPERAND(*top_ptr))
13336                 {
13337                     RExC_parse++;
13338                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13339                 }
13340                 av_push(stack, newSVuv(curchar));
13341                 break;
13342
13343             case '!':
13344                 av_push(stack, newSVuv(curchar));
13345                 break;
13346
13347             case '(':
13348                 if (top_index >= 0) {
13349                     top_ptr = av_fetch(stack, top_index, FALSE);
13350                     assert(top_ptr);
13351                     if (IS_OPERAND(*top_ptr)) {
13352                         RExC_parse++;
13353                         vFAIL("Unexpected '(' with no preceding operator");
13354                     }
13355                 }
13356                 av_push(stack, newSVuv(curchar));
13357                 break;
13358
13359             case ')':
13360             {
13361                 SV* lparen;
13362                 if (top_index < 1
13363                     || ! (current = av_pop(stack))
13364                     || ! IS_OPERAND(current)
13365                     || ! (lparen = av_pop(stack))
13366                     || IS_OPERAND(lparen)
13367                     || SvUV(lparen) != '(')
13368                 {
13369                     SvREFCNT_dec(current);
13370                     RExC_parse++;
13371                     vFAIL("Unexpected ')'");
13372                 }
13373                 top_index -= 2;
13374                 SvREFCNT_dec_NN(lparen);
13375
13376                 /* FALLTHROUGH */
13377             }
13378
13379               handle_operand:
13380
13381                 /* Here, we have an operand to process, in 'current' */
13382
13383                 if (top_index < 0) {    /* Just push if stack is empty */
13384                     av_push(stack, current);
13385                 }
13386                 else {
13387                     SV* top = av_pop(stack);
13388                     SV *prev = NULL;
13389                     char current_operator;
13390
13391                     if (IS_OPERAND(top)) {
13392                         SvREFCNT_dec_NN(top);
13393                         SvREFCNT_dec_NN(current);
13394                         vFAIL("Operand with no preceding operator");
13395                     }
13396                     current_operator = (char) SvUV(top);
13397                     switch (current_operator) {
13398                         case '(':   /* Push the '(' back on followed by the new
13399                                        operand */
13400                             av_push(stack, top);
13401                             av_push(stack, current);
13402                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13403                                                    just after the 'break', so
13404                                                    it doesn't get wrongly freed
13405                                                  */
13406                             break;
13407
13408                         case '!':
13409                             _invlist_invert(current);
13410
13411                             /* Unlike binary operators, the top of the stack,
13412                              * now that this unary one has been popped off, may
13413                              * legally be an operator, and we now have operand
13414                              * for it. */
13415                             top_index--;
13416                             SvREFCNT_dec_NN(top);
13417                             goto handle_operand;
13418
13419                         case '&':
13420                             prev = av_pop(stack);
13421                             _invlist_intersection(prev,
13422                                                    current,
13423                                                    &current);
13424                             av_push(stack, current);
13425                             break;
13426
13427                         case '|':
13428                         case '+':
13429                             prev = av_pop(stack);
13430                             _invlist_union(prev, current, &current);
13431                             av_push(stack, current);
13432                             break;
13433
13434                         case '-':
13435                             prev = av_pop(stack);;
13436                             _invlist_subtract(prev, current, &current);
13437                             av_push(stack, current);
13438                             break;
13439
13440                         case '^':   /* The union minus the intersection */
13441                         {
13442                             SV* i = NULL;
13443                             SV* u = NULL;
13444                             SV* element;
13445
13446                             prev = av_pop(stack);
13447                             _invlist_union(prev, current, &u);
13448                             _invlist_intersection(prev, current, &i);
13449                             /* _invlist_subtract will overwrite current
13450                                 without freeing what it already contains */
13451                             element = current;
13452                             _invlist_subtract(u, i, &current);
13453                             av_push(stack, current);
13454                             SvREFCNT_dec_NN(i);
13455                             SvREFCNT_dec_NN(u);
13456                             SvREFCNT_dec_NN(element);
13457                             break;
13458                         }
13459
13460                         default:
13461                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13462                 }
13463                 SvREFCNT_dec_NN(top);
13464                 SvREFCNT_dec(prev);
13465             }
13466         }
13467
13468         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13469     }
13470
13471     if (av_tindex(stack) < 0   /* Was empty */
13472         || ((final = av_pop(stack)) == NULL)
13473         || ! IS_OPERAND(final)
13474         || av_tindex(stack) >= 0)  /* More left on stack */
13475     {
13476         vFAIL("Incomplete expression within '(?[ ])'");
13477     }
13478
13479     /* Here, 'final' is the resultant inversion list from evaluating the
13480      * expression.  Return it if so requested */
13481     if (return_invlist) {
13482         *return_invlist = final;
13483         return END;
13484     }
13485
13486     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13487      * expecting a string of ranges and individual code points */
13488     invlist_iterinit(final);
13489     result_string = newSVpvs("");
13490     while (invlist_iternext(final, &start, &end)) {
13491         if (start == end) {
13492             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13493         }
13494         else {
13495             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13496                                                      start,          end);
13497         }
13498     }
13499
13500     save_parse = RExC_parse;
13501     RExC_parse = SvPV(result_string, len);
13502     save_end = RExC_end;
13503     RExC_end = RExC_parse + len;
13504
13505     /* We turn off folding around the call, as the class we have constructed
13506      * already has all folding taken into consideration, and we don't want
13507      * regclass() to add to that */
13508     RExC_flags &= ~RXf_PMf_FOLD;
13509     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13510      */
13511     node = regclass(pRExC_state, flagp,depth+1,
13512                     FALSE, /* means parse the whole char class */
13513                     FALSE, /* don't allow multi-char folds */
13514                     TRUE, /* silence non-portable warnings.  The above may very
13515                              well have generated non-portable code points, but
13516                              they're valid on this machine */
13517                     NULL);
13518     if (!node)
13519         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13520                     PTR2UV(flagp));
13521     if (save_fold) {
13522         RExC_flags |= RXf_PMf_FOLD;
13523     }
13524     RExC_parse = save_parse + 1;
13525     RExC_end = save_end;
13526     SvREFCNT_dec_NN(final);
13527     SvREFCNT_dec_NN(result_string);
13528
13529     nextchar(pRExC_state);
13530     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13531     return node;
13532 }
13533 #undef IS_OPERAND
13534
13535 STATIC void
13536 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13537 {
13538     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13539      * innocent-looking character class, like /[ks]/i won't have to go out to
13540      * disk to find the possible matches.
13541      *
13542      * This should be called only for a Latin1-range code points, cp, which is
13543      * known to be involved in a simple fold with other code points above
13544      * Latin1.  It would give false results if /aa has been specified.
13545      * Multi-char folds are outside the scope of this, and must be handled
13546      * specially.
13547      *
13548      * XXX It would be better to generate these via regen, in case a new
13549      * version of the Unicode standard adds new mappings, though that is not
13550      * really likely, and may be caught by the default: case of the switch
13551      * below. */
13552
13553     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13554
13555     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13556
13557     switch (cp) {
13558         case 'k':
13559         case 'K':
13560           *invlist =
13561              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13562             break;
13563         case 's':
13564         case 'S':
13565           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13566             break;
13567         case MICRO_SIGN:
13568           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13569           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13570             break;
13571         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13572         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13573           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13574             break;
13575         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13576           *invlist = add_cp_to_invlist(*invlist,
13577                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13578             break;
13579         case LATIN_SMALL_LETTER_SHARP_S:
13580           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13581             break;
13582         default:
13583             /* Use deprecated warning to increase the chances of this being
13584              * output */
13585             if (PASS2) {
13586                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13587             }
13588             break;
13589     }
13590 }
13591
13592 STATIC AV *
13593 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13594 {
13595     /* This adds the string scalar <multi_string> to the array
13596      * <multi_char_matches>.  <multi_string> is known to have exactly
13597      * <cp_count> code points in it.  This is used when constructing a
13598      * bracketed character class and we find something that needs to match more
13599      * than a single character.
13600      *
13601      * <multi_char_matches> is actually an array of arrays.  Each top-level
13602      * element is an array that contains all the strings known so far that are
13603      * the same length.  And that length (in number of code points) is the same
13604      * as the index of the top-level array.  Hence, the [2] element is an
13605      * array, each element thereof is a string containing TWO code points;
13606      * while element [3] is for strings of THREE characters, and so on.  Since
13607      * this is for multi-char strings there can never be a [0] nor [1] element.
13608      *
13609      * When we rewrite the character class below, we will do so such that the
13610      * longest strings are written first, so that it prefers the longest
13611      * matching strings first.  This is done even if it turns out that any
13612      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
13613      * Christiansen has agreed that this is ok.  This makes the test for the
13614      * ligature 'ffi' come before the test for 'ff', for example */
13615
13616     AV* this_array;
13617     AV** this_array_ptr;
13618
13619     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13620
13621     if (! multi_char_matches) {
13622         multi_char_matches = newAV();
13623     }
13624
13625     if (av_exists(multi_char_matches, cp_count)) {
13626         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13627         this_array = *this_array_ptr;
13628     }
13629     else {
13630         this_array = newAV();
13631         av_store(multi_char_matches, cp_count,
13632                  (SV*) this_array);
13633     }
13634     av_push(this_array, multi_string);
13635
13636     return multi_char_matches;
13637 }
13638
13639 /* The names of properties whose definitions are not known at compile time are
13640  * stored in this SV, after a constant heading.  So if the length has been
13641  * changed since initialization, then there is a run-time definition. */
13642 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13643                                         (SvCUR(listsv) != initial_listsv_len)
13644
13645 STATIC regnode *
13646 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13647                  const bool stop_at_1,  /* Just parse the next thing, don't
13648                                            look for a full character class */
13649                  bool allow_multi_folds,
13650                  const bool silence_non_portable,   /* Don't output warnings
13651                                                        about too large
13652                                                        characters */
13653                  SV** ret_invlist)  /* Return an inversion list, not a node */
13654 {
13655     /* parse a bracketed class specification.  Most of these will produce an
13656      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13657      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13658      * under /i with multi-character folds: it will be rewritten following the
13659      * paradigm of this example, where the <multi-fold>s are characters which
13660      * fold to multiple character sequences:
13661      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13662      * gets effectively rewritten as:
13663      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13664      * reg() gets called (recursively) on the rewritten version, and this
13665      * function will return what it constructs.  (Actually the <multi-fold>s
13666      * aren't physically removed from the [abcdefghi], it's just that they are
13667      * ignored in the recursion by means of a flag:
13668      * <RExC_in_multi_char_class>.)
13669      *
13670      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13671      * characters, with the corresponding bit set if that character is in the
13672      * list.  For characters above this, a range list or swash is used.  There
13673      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13674      * determinable at compile time
13675      *
13676      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13677      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13678      */
13679
13680     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13681     IV range = 0;
13682     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13683     regnode *ret;
13684     STRLEN numlen;
13685     IV namedclass = OOB_NAMEDCLASS;
13686     char *rangebegin = NULL;
13687     bool need_class = 0;
13688     SV *listsv = NULL;
13689     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13690                                       than just initialized.  */
13691     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13692     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13693                                extended beyond the Latin1 range.  These have to
13694                                be kept separate from other code points for much
13695                                of this function because their handling  is
13696                                different under /i, and for most classes under
13697                                /d as well */
13698     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13699                                separate for a while from the non-complemented
13700                                versions because of complications with /d
13701                                matching */
13702     UV element_count = 0;   /* Number of distinct elements in the class.
13703                                Optimizations may be possible if this is tiny */
13704     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13705                                        character; used under /i */
13706     UV n;
13707     char * stop_ptr = RExC_end;    /* where to stop parsing */
13708     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13709                                                    space? */
13710     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13711
13712     /* Unicode properties are stored in a swash; this holds the current one
13713      * being parsed.  If this swash is the only above-latin1 component of the
13714      * character class, an optimization is to pass it directly on to the
13715      * execution engine.  Otherwise, it is set to NULL to indicate that there
13716      * are other things in the class that have to be dealt with at execution
13717      * time */
13718     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13719
13720     /* Set if a component of this character class is user-defined; just passed
13721      * on to the engine */
13722     bool has_user_defined_property = FALSE;
13723
13724     /* inversion list of code points this node matches only when the target
13725      * string is in UTF-8.  (Because is under /d) */
13726     SV* depends_list = NULL;
13727
13728     /* Inversion list of code points this node matches regardless of things
13729      * like locale, folding, utf8ness of the target string */
13730     SV* cp_list = NULL;
13731
13732     /* Like cp_list, but code points on this list need to be checked for things
13733      * that fold to/from them under /i */
13734     SV* cp_foldable_list = NULL;
13735
13736     /* Like cp_list, but code points on this list are valid only when the
13737      * runtime locale is UTF-8 */
13738     SV* only_utf8_locale_list = NULL;
13739
13740 #ifdef EBCDIC
13741     /* In a range, counts how many 0-2 of the ends of it came from literals,
13742      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13743     UV literal_endpoint = 0;
13744 #endif
13745     bool invert = FALSE;    /* Is this class to be complemented */
13746
13747     bool warn_super = ALWAYS_WARN_SUPER;
13748
13749     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13750         case we need to change the emitted regop to an EXACT. */
13751     const char * orig_parse = RExC_parse;
13752     const SSize_t orig_size = RExC_size;
13753     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13754     GET_RE_DEBUG_FLAGS_DECL;
13755
13756     PERL_ARGS_ASSERT_REGCLASS;
13757 #ifndef DEBUGGING
13758     PERL_UNUSED_ARG(depth);
13759 #endif
13760
13761     DEBUG_PARSE("clas");
13762
13763     /* Assume we are going to generate an ANYOF node. */
13764     ret = reganode(pRExC_state, ANYOF, 0);
13765
13766     if (SIZE_ONLY) {
13767         RExC_size += ANYOF_SKIP;
13768         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13769     }
13770     else {
13771         ANYOF_FLAGS(ret) = 0;
13772
13773         RExC_emit += ANYOF_SKIP;
13774         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13775         initial_listsv_len = SvCUR(listsv);
13776         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13777     }
13778
13779     if (skip_white) {
13780         RExC_parse = regpatws(pRExC_state, RExC_parse,
13781                               FALSE /* means don't recognize comments */ );
13782     }
13783
13784     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13785         RExC_parse++;
13786         invert = TRUE;
13787         allow_multi_folds = FALSE;
13788         RExC_naughty++;
13789         if (skip_white) {
13790             RExC_parse = regpatws(pRExC_state, RExC_parse,
13791                                   FALSE /* means don't recognize comments */ );
13792         }
13793     }
13794
13795     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13796     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13797         const char *s = RExC_parse;
13798         const char  c = *s++;
13799
13800         while (isWORDCHAR(*s))
13801             s++;
13802         if (*s && c == *s && s[1] == ']') {
13803             SAVEFREESV(RExC_rx_sv);
13804             ckWARN3reg(s+2,
13805                        "POSIX syntax [%c %c] belongs inside character classes",
13806                        c, c);
13807             (void)ReREFCNT_inc(RExC_rx_sv);
13808         }
13809     }
13810
13811     /* If the caller wants us to just parse a single element, accomplish this
13812      * by faking the loop ending condition */
13813     if (stop_at_1 && RExC_end > RExC_parse) {
13814         stop_ptr = RExC_parse + 1;
13815     }
13816
13817     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13818     if (UCHARAT(RExC_parse) == ']')
13819         goto charclassloop;
13820
13821     while (1) {
13822         if  (RExC_parse >= stop_ptr) {
13823             break;
13824         }
13825
13826         if (skip_white) {
13827             RExC_parse = regpatws(pRExC_state, RExC_parse,
13828                                   FALSE /* means don't recognize comments */ );
13829         }
13830
13831         if  (UCHARAT(RExC_parse) == ']') {
13832             break;
13833         }
13834
13835     charclassloop:
13836
13837         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13838         save_value = value;
13839         save_prevvalue = prevvalue;
13840
13841         if (!range) {
13842             rangebegin = RExC_parse;
13843             element_count++;
13844         }
13845         if (UTF) {
13846             value = utf8n_to_uvchr((U8*)RExC_parse,
13847                                    RExC_end - RExC_parse,
13848                                    &numlen, UTF8_ALLOW_DEFAULT);
13849             RExC_parse += numlen;
13850         }
13851         else
13852             value = UCHARAT(RExC_parse++);
13853
13854         if (value == '['
13855             && RExC_parse < RExC_end
13856             && POSIXCC(UCHARAT(RExC_parse)))
13857         {
13858             namedclass = regpposixcc(pRExC_state, value, strict);
13859         }
13860         else if (value != '\\') {
13861 #ifdef EBCDIC
13862             literal_endpoint++;
13863 #endif
13864         }
13865         else {
13866             /* Is a backslash; get the code point of the char after it */
13867             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13868                 value = utf8n_to_uvchr((U8*)RExC_parse,
13869                                    RExC_end - RExC_parse,
13870                                    &numlen, UTF8_ALLOW_DEFAULT);
13871                 RExC_parse += numlen;
13872             }
13873             else
13874                 value = UCHARAT(RExC_parse++);
13875
13876             /* Some compilers cannot handle switching on 64-bit integer
13877              * values, therefore value cannot be an UV.  Yes, this will
13878              * be a problem later if we want switch on Unicode.
13879              * A similar issue a little bit later when switching on
13880              * namedclass. --jhi */
13881
13882             /* If the \ is escaping white space when white space is being
13883              * skipped, it means that that white space is wanted literally, and
13884              * is already in 'value'.  Otherwise, need to translate the escape
13885              * into what it signifies. */
13886             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13887
13888             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13889             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13890             case 's':   namedclass = ANYOF_SPACE;       break;
13891             case 'S':   namedclass = ANYOF_NSPACE;      break;
13892             case 'd':   namedclass = ANYOF_DIGIT;       break;
13893             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13894             case 'v':   namedclass = ANYOF_VERTWS;      break;
13895             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13896             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13897             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13898             case 'N':  /* Handle \N{NAME} in class */
13899                 {
13900                     SV *as_text;
13901                     STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13902                                                     flagp, depth, &as_text);
13903                     if (*flagp & RESTART_UTF8)
13904                         FAIL("panic: grok_bslash_N set RESTART_UTF8");
13905                     if (cp_count != 1) {    /* The typical case drops through */
13906                         assert(cp_count != (STRLEN) -1);
13907                         if (cp_count == 0) {
13908                             if (strict) {
13909                                 RExC_parse++;   /* Position after the "}" */
13910                                 vFAIL("Zero length \\N{}");
13911                             }
13912                             else if (PASS2) {
13913                                 ckWARNreg(RExC_parse,
13914                                         "Ignoring zero length \\N{} in character class");
13915                             }
13916                         }
13917                         else { /* cp_count > 1 */
13918                             if (! RExC_in_multi_char_class) {
13919                                 if (invert || range || *RExC_parse == '-') {
13920                                     if (strict) {
13921                                         RExC_parse--;
13922                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13923                                     }
13924                                     else if (PASS2) {
13925                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13926                                     }
13927                                 }
13928                                 else {
13929                                     multi_char_matches
13930                                         = add_multi_match(multi_char_matches,
13931                                                           as_text,
13932                                                           cp_count);
13933                                 }
13934                                 break; /* <value> contains the first code
13935                                           point. Drop out of the switch to
13936                                           process it */
13937                             }
13938                         } /* End of cp_count != 1 */
13939
13940                         /* This element should not be processed further in this
13941                          * class */
13942                         element_count--;
13943                         value = save_value;
13944                         prevvalue = save_prevvalue;
13945                         continue;   /* Back to top of loop to get next char */
13946                     }
13947                     /* Here, is a single code point, and <value> contains it */
13948 #ifdef EBCDIC
13949                     /* We consider named characters to be literal characters */
13950                     literal_endpoint++;
13951 #endif
13952                 }
13953                 break;
13954             case 'p':
13955             case 'P':
13956                 {
13957                 char *e;
13958
13959                 /* We will handle any undefined properties ourselves */
13960                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13961                                        /* And we actually would prefer to get
13962                                         * the straight inversion list of the
13963                                         * swash, since we will be accessing it
13964                                         * anyway, to save a little time */
13965                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13966
13967                 if (RExC_parse >= RExC_end)
13968                     vFAIL2("Empty \\%c{}", (U8)value);
13969                 if (*RExC_parse == '{') {
13970                     const U8 c = (U8)value;
13971                     e = strchr(RExC_parse++, '}');
13972                     if (!e)
13973                         vFAIL2("Missing right brace on \\%c{}", c);
13974                     while (isSPACE(*RExC_parse))
13975                         RExC_parse++;
13976                     if (e == RExC_parse)
13977                         vFAIL2("Empty \\%c{}", c);
13978                     n = e - RExC_parse;
13979                     while (isSPACE(*(RExC_parse + n - 1)))
13980                         n--;
13981                 }
13982                 else {
13983                     e = RExC_parse;
13984                     n = 1;
13985                 }
13986                 if (!SIZE_ONLY) {
13987                     SV* invlist;
13988                     char* name;
13989
13990                     if (UCHARAT(RExC_parse) == '^') {
13991                          RExC_parse++;
13992                          n--;
13993                          /* toggle.  (The rhs xor gets the single bit that
13994                           * differs between P and p; the other xor inverts just
13995                           * that bit) */
13996                          value ^= 'P' ^ 'p';
13997
13998                          while (isSPACE(*RExC_parse)) {
13999                               RExC_parse++;
14000                               n--;
14001                          }
14002                     }
14003                     /* Try to get the definition of the property into
14004                      * <invlist>.  If /i is in effect, the effective property
14005                      * will have its name be <__NAME_i>.  The design is
14006                      * discussed in commit
14007                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14008                     name = savepv(Perl_form(aTHX_
14009                                           "%s%.*s%s\n",
14010                                           (FOLD) ? "__" : "",
14011                                           (int)n,
14012                                           RExC_parse,
14013                                           (FOLD) ? "_i" : ""
14014                                 ));
14015
14016                     /* Look up the property name, and get its swash and
14017                      * inversion list, if the property is found  */
14018                     if (swash) {
14019                         SvREFCNT_dec_NN(swash);
14020                     }
14021                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
14022                                              1, /* binary */
14023                                              0, /* not tr/// */
14024                                              NULL, /* No inversion list */
14025                                              &swash_init_flags
14026                                             );
14027                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14028                         HV* curpkg = (IN_PERL_COMPILETIME)
14029                                       ? PL_curstash
14030                                       : CopSTASH(PL_curcop);
14031                         if (swash) {
14032                             SvREFCNT_dec_NN(swash);
14033                             swash = NULL;
14034                         }
14035
14036                         /* Here didn't find it.  It could be a user-defined
14037                          * property that will be available at run-time.  If we
14038                          * accept only compile-time properties, is an error;
14039                          * otherwise add it to the list for run-time look up */
14040                         if (ret_invlist) {
14041                             RExC_parse = e + 1;
14042                             vFAIL2utf8f(
14043                                 "Property '%"UTF8f"' is unknown",
14044                                 UTF8fARG(UTF, n, name));
14045                         }
14046
14047                         /* If the property name doesn't already have a package
14048                          * name, add the current one to it so that it can be
14049                          * referred to outside it. [perl #121777] */
14050                         if (curpkg && ! instr(name, "::")) {
14051                             char* pkgname = HvNAME(curpkg);
14052                             if (strNE(pkgname, "main")) {
14053                                 char* full_name = Perl_form(aTHX_
14054                                                             "%s::%s",
14055                                                             pkgname,
14056                                                             name);
14057                                 n = strlen(full_name);
14058                                 Safefree(name);
14059                                 name = savepvn(full_name, n);
14060                             }
14061                         }
14062                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14063                                         (value == 'p' ? '+' : '!'),
14064                                         UTF8fARG(UTF, n, name));
14065                         has_user_defined_property = TRUE;
14066
14067                         /* We don't know yet, so have to assume that the
14068                          * property could match something in the Latin1 range,
14069                          * hence something that isn't utf8.  Note that this
14070                          * would cause things in <depends_list> to match
14071                          * inappropriately, except that any \p{}, including
14072                          * this one forces Unicode semantics, which means there
14073                          * is no <depends_list> */
14074                         ANYOF_FLAGS(ret)
14075                                       |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14076                     }
14077                     else {
14078
14079                         /* Here, did get the swash and its inversion list.  If
14080                          * the swash is from a user-defined property, then this
14081                          * whole character class should be regarded as such */
14082                         if (swash_init_flags
14083                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14084                         {
14085                             has_user_defined_property = TRUE;
14086                         }
14087                         else if
14088                             /* We warn on matching an above-Unicode code point
14089                              * if the match would return true, except don't
14090                              * warn for \p{All}, which has exactly one element
14091                              * = 0 */
14092                             (_invlist_contains_cp(invlist, 0x110000)
14093                                 && (! (_invlist_len(invlist) == 1
14094                                        && *invlist_array(invlist) == 0)))
14095                         {
14096                             warn_super = TRUE;
14097                         }
14098
14099
14100                         /* Invert if asking for the complement */
14101                         if (value == 'P') {
14102                             _invlist_union_complement_2nd(properties,
14103                                                           invlist,
14104                                                           &properties);
14105
14106                             /* The swash can't be used as-is, because we've
14107                              * inverted things; delay removing it to here after
14108                              * have copied its invlist above */
14109                             SvREFCNT_dec_NN(swash);
14110                             swash = NULL;
14111                         }
14112                         else {
14113                             _invlist_union(properties, invlist, &properties);
14114                         }
14115                     }
14116                     Safefree(name);
14117                 }
14118                 RExC_parse = e + 1;
14119                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
14120                                                 named */
14121
14122                 /* \p means they want Unicode semantics */
14123                 RExC_uni_semantics = 1;
14124                 }
14125                 break;
14126             case 'n':   value = '\n';                   break;
14127             case 'r':   value = '\r';                   break;
14128             case 't':   value = '\t';                   break;
14129             case 'f':   value = '\f';                   break;
14130             case 'b':   value = '\b';                   break;
14131             case 'e':   value = ESC_NATIVE;             break;
14132             case 'a':   value = '\a';                   break;
14133             case 'o':
14134                 RExC_parse--;   /* function expects to be pointed at the 'o' */
14135                 {
14136                     const char* error_msg;
14137                     bool valid = grok_bslash_o(&RExC_parse,
14138                                                &value,
14139                                                &error_msg,
14140                                                PASS2,   /* warnings only in
14141                                                            pass 2 */
14142                                                strict,
14143                                                silence_non_portable,
14144                                                UTF);
14145                     if (! valid) {
14146                         vFAIL(error_msg);
14147                     }
14148                 }
14149                 if (PL_encoding && value < 0x100) {
14150                     goto recode_encoding;
14151                 }
14152                 break;
14153             case 'x':
14154                 RExC_parse--;   /* function expects to be pointed at the 'x' */
14155                 {
14156                     const char* error_msg;
14157                     bool valid = grok_bslash_x(&RExC_parse,
14158                                                &value,
14159                                                &error_msg,
14160                                                PASS2, /* Output warnings */
14161                                                strict,
14162                                                silence_non_portable,
14163                                                UTF);
14164                     if (! valid) {
14165                         vFAIL(error_msg);
14166                     }
14167                 }
14168                 if (PL_encoding && value < 0x100)
14169                     goto recode_encoding;
14170                 break;
14171             case 'c':
14172                 value = grok_bslash_c(*RExC_parse++, PASS2);
14173                 break;
14174             case '0': case '1': case '2': case '3': case '4':
14175             case '5': case '6': case '7':
14176                 {
14177                     /* Take 1-3 octal digits */
14178                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14179                     numlen = (strict) ? 4 : 3;
14180                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14181                     RExC_parse += numlen;
14182                     if (numlen != 3) {
14183                         if (strict) {
14184                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14185                             vFAIL("Need exactly 3 octal digits");
14186                         }
14187                         else if (! SIZE_ONLY /* like \08, \178 */
14188                                  && numlen < 3
14189                                  && RExC_parse < RExC_end
14190                                  && isDIGIT(*RExC_parse)
14191                                  && ckWARN(WARN_REGEXP))
14192                         {
14193                             SAVEFREESV(RExC_rx_sv);
14194                             reg_warn_non_literal_string(
14195                                  RExC_parse + 1,
14196                                  form_short_octal_warning(RExC_parse, numlen));
14197                             (void)ReREFCNT_inc(RExC_rx_sv);
14198                         }
14199                     }
14200                     if (PL_encoding && value < 0x100)
14201                         goto recode_encoding;
14202                     break;
14203                 }
14204             recode_encoding:
14205                 if (! RExC_override_recoding) {
14206                     SV* enc = PL_encoding;
14207                     value = reg_recode((const char)(U8)value, &enc);
14208                     if (!enc) {
14209                         if (strict) {
14210                             vFAIL("Invalid escape in the specified encoding");
14211                         }
14212                         else if (PASS2) {
14213                             ckWARNreg(RExC_parse,
14214                                   "Invalid escape in the specified encoding");
14215                         }
14216                     }
14217                     break;
14218                 }
14219             default:
14220                 /* Allow \_ to not give an error */
14221                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14222                     if (strict) {
14223                         vFAIL2("Unrecognized escape \\%c in character class",
14224                                (int)value);
14225                     }
14226                     else {
14227                         SAVEFREESV(RExC_rx_sv);
14228                         ckWARN2reg(RExC_parse,
14229                             "Unrecognized escape \\%c in character class passed through",
14230                             (int)value);
14231                         (void)ReREFCNT_inc(RExC_rx_sv);
14232                     }
14233                 }
14234                 break;
14235             }   /* End of switch on char following backslash */
14236         } /* end of handling backslash escape sequences */
14237
14238         /* Here, we have the current token in 'value' */
14239
14240         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14241             U8 classnum;
14242
14243             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
14244              * literal, as is the character that began the false range, i.e.
14245              * the 'a' in the examples */
14246             if (range) {
14247                 if (!SIZE_ONLY) {
14248                     const int w = (RExC_parse >= rangebegin)
14249                                   ? RExC_parse - rangebegin
14250                                   : 0;
14251                     if (strict) {
14252                         vFAIL2utf8f(
14253                             "False [] range \"%"UTF8f"\"",
14254                             UTF8fARG(UTF, w, rangebegin));
14255                     }
14256                     else {
14257                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14258                         ckWARN2reg(RExC_parse,
14259                             "False [] range \"%"UTF8f"\"",
14260                             UTF8fARG(UTF, w, rangebegin));
14261                         (void)ReREFCNT_inc(RExC_rx_sv);
14262                         cp_list = add_cp_to_invlist(cp_list, '-');
14263                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14264                                                              prevvalue);
14265                     }
14266                 }
14267
14268                 range = 0; /* this was not a true range */
14269                 element_count += 2; /* So counts for three values */
14270             }
14271
14272             classnum = namedclass_to_classnum(namedclass);
14273
14274             if (LOC && namedclass < ANYOF_POSIXL_MAX
14275 #ifndef HAS_ISASCII
14276                 && classnum != _CC_ASCII
14277 #endif
14278             ) {
14279                 /* What the Posix classes (like \w, [:space:]) match in locale
14280                  * isn't knowable under locale until actual match time.  Room
14281                  * must be reserved (one time per outer bracketed class) to
14282                  * store such classes.  The space will contain a bit for each
14283                  * named class that is to be matched against.  This isn't
14284                  * needed for \p{} and pseudo-classes, as they are not affected
14285                  * by locale, and hence are dealt with separately */
14286                 if (! need_class) {
14287                     need_class = 1;
14288                     if (SIZE_ONLY) {
14289                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14290                     }
14291                     else {
14292                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14293                     }
14294                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14295                     ANYOF_POSIXL_ZERO(ret);
14296                 }
14297
14298                 /* Coverity thinks it is possible for this to be negative; both
14299                  * jhi and khw think it's not, but be safer */
14300                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14301                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14302
14303                 /* See if it already matches the complement of this POSIX
14304                  * class */
14305                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14306                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14307                                                             ? -1
14308                                                             : 1)))
14309                 {
14310                     posixl_matches_all = TRUE;
14311                     break;  /* No need to continue.  Since it matches both
14312                                e.g., \w and \W, it matches everything, and the
14313                                bracketed class can be optimized into qr/./s */
14314                 }
14315
14316                 /* Add this class to those that should be checked at runtime */
14317                 ANYOF_POSIXL_SET(ret, namedclass);
14318
14319                 /* The above-Latin1 characters are not subject to locale rules.
14320                  * Just add them, in the second pass, to the
14321                  * unconditionally-matched list */
14322                 if (! SIZE_ONLY) {
14323                     SV* scratch_list = NULL;
14324
14325                     /* Get the list of the above-Latin1 code points this
14326                      * matches */
14327                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14328                                           PL_XPosix_ptrs[classnum],
14329
14330                                           /* Odd numbers are complements, like
14331                                            * NDIGIT, NASCII, ... */
14332                                           namedclass % 2 != 0,
14333                                           &scratch_list);
14334                     /* Checking if 'cp_list' is NULL first saves an extra
14335                      * clone.  Its reference count will be decremented at the
14336                      * next union, etc, or if this is the only instance, at the
14337                      * end of the routine */
14338                     if (! cp_list) {
14339                         cp_list = scratch_list;
14340                     }
14341                     else {
14342                         _invlist_union(cp_list, scratch_list, &cp_list);
14343                         SvREFCNT_dec_NN(scratch_list);
14344                     }
14345                     continue;   /* Go get next character */
14346                 }
14347             }
14348             else if (! SIZE_ONLY) {
14349
14350                 /* Here, not in pass1 (in that pass we skip calculating the
14351                  * contents of this class), and is /l, or is a POSIX class for
14352                  * which /l doesn't matter (or is a Unicode property, which is
14353                  * skipped here). */
14354                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
14355                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14356
14357                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
14358                          * nor /l make a difference in what these match,
14359                          * therefore we just add what they match to cp_list. */
14360                         if (classnum != _CC_VERTSPACE) {
14361                             assert(   namedclass == ANYOF_HORIZWS
14362                                    || namedclass == ANYOF_NHORIZWS);
14363
14364                             /* It turns out that \h is just a synonym for
14365                              * XPosixBlank */
14366                             classnum = _CC_BLANK;
14367                         }
14368
14369                         _invlist_union_maybe_complement_2nd(
14370                                 cp_list,
14371                                 PL_XPosix_ptrs[classnum],
14372                                 namedclass % 2 != 0,    /* Complement if odd
14373                                                           (NHORIZWS, NVERTWS)
14374                                                         */
14375                                 &cp_list);
14376                     }
14377                 }
14378                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
14379                            complement and use nposixes */
14380                     SV** posixes_ptr = namedclass % 2 == 0
14381                                        ? &posixes
14382                                        : &nposixes;
14383                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
14384                     _invlist_union_maybe_complement_2nd(
14385                                                      *posixes_ptr,
14386                                                      *source_ptr,
14387                                                      namedclass % 2 != 0,
14388                                                      posixes_ptr);
14389                 }
14390             }
14391         } /* end of namedclass \blah */
14392
14393         if (skip_white) {
14394             RExC_parse = regpatws(pRExC_state, RExC_parse,
14395                                 FALSE /* means don't recognize comments */ );
14396         }
14397
14398         /* If 'range' is set, 'value' is the ending of a range--check its
14399          * validity.  (If value isn't a single code point in the case of a
14400          * range, we should have figured that out above in the code that
14401          * catches false ranges).  Later, we will handle each individual code
14402          * point in the range.  If 'range' isn't set, this could be the
14403          * beginning of a range, so check for that by looking ahead to see if
14404          * the next real character to be processed is the range indicator--the
14405          * minus sign */
14406
14407         if (range) {
14408             if (prevvalue > value) /* b-a */ {
14409                 const int w = RExC_parse - rangebegin;
14410                 vFAIL2utf8f(
14411                     "Invalid [] range \"%"UTF8f"\"",
14412                     UTF8fARG(UTF, w, rangebegin));
14413                 range = 0; /* not a valid range */
14414             }
14415         }
14416         else {
14417             prevvalue = value; /* save the beginning of the potential range */
14418             if (! stop_at_1     /* Can't be a range if parsing just one thing */
14419                 && *RExC_parse == '-')
14420             {
14421                 char* next_char_ptr = RExC_parse + 1;
14422                 if (skip_white) {   /* Get the next real char after the '-' */
14423                     next_char_ptr = regpatws(pRExC_state,
14424                                              RExC_parse + 1,
14425                                              FALSE); /* means don't recognize
14426                                                         comments */
14427                 }
14428
14429                 /* If the '-' is at the end of the class (just before the ']',
14430                  * it is a literal minus; otherwise it is a range */
14431                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14432                     RExC_parse = next_char_ptr;
14433
14434                     /* a bad range like \w-, [:word:]- ? */
14435                     if (namedclass > OOB_NAMEDCLASS) {
14436                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14437                             const int w = RExC_parse >= rangebegin
14438                                           ?  RExC_parse - rangebegin
14439                                           : 0;
14440                             if (strict) {
14441                                 vFAIL4("False [] range \"%*.*s\"",
14442                                     w, w, rangebegin);
14443                             }
14444                             else if (PASS2) {
14445                                 vWARN4(RExC_parse,
14446                                     "False [] range \"%*.*s\"",
14447                                     w, w, rangebegin);
14448                             }
14449                         }
14450                         if (!SIZE_ONLY) {
14451                             cp_list = add_cp_to_invlist(cp_list, '-');
14452                         }
14453                         element_count++;
14454                     } else
14455                         range = 1;      /* yeah, it's a range! */
14456                     continue;   /* but do it the next time */
14457                 }
14458             }
14459         }
14460
14461         if (namedclass > OOB_NAMEDCLASS) {
14462             continue;
14463         }
14464
14465         /* Here, we have a single value this time through the loop, and
14466          * <prevvalue> is the beginning of the range, if any; or <value> if
14467          * not. */
14468
14469         /* non-Latin1 code point implies unicode semantics.  Must be set in
14470          * pass1 so is there for the whole of pass 2 */
14471         if (value > 255) {
14472             RExC_uni_semantics = 1;
14473         }
14474
14475         /* Ready to process either the single value, or the completed range.
14476          * For single-valued non-inverted ranges, we consider the possibility
14477          * of multi-char folds.  (We made a conscious decision to not do this
14478          * for the other cases because it can often lead to non-intuitive
14479          * results.  For example, you have the peculiar case that:
14480          *  "s s" =~ /^[^\xDF]+$/i => Y
14481          *  "ss"  =~ /^[^\xDF]+$/i => N
14482          *
14483          * See [perl #89750] */
14484         if (FOLD && allow_multi_folds && value == prevvalue) {
14485             if (value == LATIN_SMALL_LETTER_SHARP_S
14486                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14487                                                         value)))
14488             {
14489                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14490
14491                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14492                 STRLEN foldlen;
14493
14494                 UV folded = _to_uni_fold_flags(
14495                                 value,
14496                                 foldbuf,
14497                                 &foldlen,
14498                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14499                                                    ? FOLD_FLAGS_NOMIX_ASCII
14500                                                    : 0)
14501                                 );
14502
14503                 /* Here, <folded> should be the first character of the
14504                  * multi-char fold of <value>, with <foldbuf> containing the
14505                  * whole thing.  But, if this fold is not allowed (because of
14506                  * the flags), <fold> will be the same as <value>, and should
14507                  * be processed like any other character, so skip the special
14508                  * handling */
14509                 if (folded != value) {
14510
14511                     /* Skip if we are recursed, currently parsing the class
14512                      * again.  Otherwise add this character to the list of
14513                      * multi-char folds. */
14514                     if (! RExC_in_multi_char_class) {
14515                         STRLEN cp_count = utf8_length(foldbuf,
14516                                                       foldbuf + foldlen);
14517                         SV* multi_fold = sv_2mortal(newSVpvs(""));
14518
14519                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14520
14521                         multi_char_matches
14522                                         = add_multi_match(multi_char_matches,
14523                                                           multi_fold,
14524                                                           cp_count);
14525
14526                     }
14527
14528                     /* This element should not be processed further in this
14529                      * class */
14530                     element_count--;
14531                     value = save_value;
14532                     prevvalue = save_prevvalue;
14533                     continue;
14534                 }
14535             }
14536         }
14537
14538         /* Deal with this element of the class */
14539         if (! SIZE_ONLY) {
14540 #ifndef EBCDIC
14541             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14542                                                      prevvalue, value);
14543 #else
14544             SV* this_range = _new_invlist(1);
14545             _append_range_to_invlist(this_range, prevvalue, value);
14546
14547             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14548              * If this range was specified using something like 'i-j', we want
14549              * to include only the 'i' and the 'j', and not anything in
14550              * between, so exclude non-ASCII, non-alphabetics from it.
14551              * However, if the range was specified with something like
14552              * [\x89-\x91] or [\x89-j], all code points within it should be
14553              * included.  literal_endpoint==2 means both ends of the range used
14554              * a literal character, not \x{foo} */
14555             if (literal_endpoint == 2
14556                 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14557                     || (isUPPER_A(prevvalue) && isUPPER_A(value))))
14558             {
14559                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14560                                       &this_range);
14561
14562                 /* Since 'this_range' now only contains ascii, the intersection
14563                  * of it with anything will still yield only ascii */
14564                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14565                                       &this_range);
14566             }
14567             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14568             literal_endpoint = 0;
14569             SvREFCNT_dec_NN(this_range);
14570 #endif
14571         }
14572
14573         range = 0; /* this range (if it was one) is done now */
14574     } /* End of loop through all the text within the brackets */
14575
14576     /* If anything in the class expands to more than one character, we have to
14577      * deal with them by building up a substitute parse string, and recursively
14578      * calling reg() on it, instead of proceeding */
14579     if (multi_char_matches) {
14580         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14581         I32 cp_count;
14582         STRLEN len;
14583         char *save_end = RExC_end;
14584         char *save_parse = RExC_parse;
14585         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14586                                        a "|" */
14587         I32 reg_flags;
14588
14589         assert(! invert);
14590 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14591            because too confusing */
14592         if (invert) {
14593             sv_catpv(substitute_parse, "(?:");
14594         }
14595 #endif
14596
14597         /* Look at the longest folds first */
14598         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14599
14600             if (av_exists(multi_char_matches, cp_count)) {
14601                 AV** this_array_ptr;
14602                 SV* this_sequence;
14603
14604                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14605                                                  cp_count, FALSE);
14606                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14607                                                                 &PL_sv_undef)
14608                 {
14609                     if (! first_time) {
14610                         sv_catpv(substitute_parse, "|");
14611                     }
14612                     first_time = FALSE;
14613
14614                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14615                 }
14616             }
14617         }
14618
14619         /* If the character class contains anything else besides these
14620          * multi-character folds, have to include it in recursive parsing */
14621         if (element_count) {
14622             sv_catpv(substitute_parse, "|[");
14623             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14624             sv_catpv(substitute_parse, "]");
14625         }
14626
14627         sv_catpv(substitute_parse, ")");
14628 #if 0
14629         if (invert) {
14630             /* This is a way to get the parse to skip forward a whole named
14631              * sequence instead of matching the 2nd character when it fails the
14632              * first */
14633             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14634         }
14635 #endif
14636
14637         RExC_parse = SvPV(substitute_parse, len);
14638         RExC_end = RExC_parse + len;
14639         RExC_in_multi_char_class = 1;
14640         RExC_override_recoding = 1;
14641         RExC_emit = (regnode *)orig_emit;
14642
14643         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14644
14645         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14646
14647         RExC_parse = save_parse;
14648         RExC_end = save_end;
14649         RExC_in_multi_char_class = 0;
14650         RExC_override_recoding = 0;
14651         SvREFCNT_dec_NN(multi_char_matches);
14652         return ret;
14653     }
14654
14655     /* Here, we've gone through the entire class and dealt with multi-char
14656      * folds.  We are now in a position that we can do some checks to see if we
14657      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14658      * Currently we only do two checks:
14659      * 1) is in the unlikely event that the user has specified both, eg. \w and
14660      *    \W under /l, then the class matches everything.  (This optimization
14661      *    is done only to make the optimizer code run later work.)
14662      * 2) if the character class contains only a single element (including a
14663      *    single range), we see if there is an equivalent node for it.
14664      * Other checks are possible */
14665     if (! ret_invlist   /* Can't optimize if returning the constructed
14666                            inversion list */
14667         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14668     {
14669         U8 op = END;
14670         U8 arg = 0;
14671
14672         if (UNLIKELY(posixl_matches_all)) {
14673             op = SANY;
14674         }
14675         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14676                                                    \w or [:digit:] or \p{foo}
14677                                                  */
14678
14679             /* All named classes are mapped into POSIXish nodes, with its FLAG
14680              * argument giving which class it is */
14681             switch ((I32)namedclass) {
14682                 case ANYOF_UNIPROP:
14683                     break;
14684
14685                 /* These don't depend on the charset modifiers.  They always
14686                  * match under /u rules */
14687                 case ANYOF_NHORIZWS:
14688                 case ANYOF_HORIZWS:
14689                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14690                     /* FALLTHROUGH */
14691
14692                 case ANYOF_NVERTWS:
14693                 case ANYOF_VERTWS:
14694                     op = POSIXU;
14695                     goto join_posix;
14696
14697                 /* The actual POSIXish node for all the rest depends on the
14698                  * charset modifier.  The ones in the first set depend only on
14699                  * ASCII or, if available on this platform, locale */
14700                 case ANYOF_ASCII:
14701                 case ANYOF_NASCII:
14702 #ifdef HAS_ISASCII
14703                     op = (LOC) ? POSIXL : POSIXA;
14704 #else
14705                     op = POSIXA;
14706 #endif
14707                     goto join_posix;
14708
14709                 case ANYOF_NCASED:
14710                 case ANYOF_LOWER:
14711                 case ANYOF_NLOWER:
14712                 case ANYOF_UPPER:
14713                 case ANYOF_NUPPER:
14714                     /* under /a could be alpha */
14715                     if (FOLD) {
14716                         if (ASCII_RESTRICTED) {
14717                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14718                         }
14719                         else if (! LOC) {
14720                             break;
14721                         }
14722                     }
14723                     /* FALLTHROUGH */
14724
14725                 /* The rest have more possibilities depending on the charset.
14726                  * We take advantage of the enum ordering of the charset
14727                  * modifiers to get the exact node type, */
14728                 default:
14729                     op = POSIXD + get_regex_charset(RExC_flags);
14730                     if (op > POSIXA) { /* /aa is same as /a */
14731                         op = POSIXA;
14732                     }
14733
14734                 join_posix:
14735                     /* The odd numbered ones are the complements of the
14736                      * next-lower even number one */
14737                     if (namedclass % 2 == 1) {
14738                         invert = ! invert;
14739                         namedclass--;
14740                     }
14741                     arg = namedclass_to_classnum(namedclass);
14742                     break;
14743             }
14744         }
14745         else if (value == prevvalue) {
14746
14747             /* Here, the class consists of just a single code point */
14748
14749             if (invert) {
14750                 if (! LOC && value == '\n') {
14751                     op = REG_ANY; /* Optimize [^\n] */
14752                     *flagp |= HASWIDTH|SIMPLE;
14753                     RExC_naughty++;
14754                 }
14755             }
14756             else if (value < 256 || UTF) {
14757
14758                 /* Optimize a single value into an EXACTish node, but not if it
14759                  * would require converting the pattern to UTF-8. */
14760                 op = compute_EXACTish(pRExC_state);
14761             }
14762         } /* Otherwise is a range */
14763         else if (! LOC) {   /* locale could vary these */
14764             if (prevvalue == '0') {
14765                 if (value == '9') {
14766                     arg = _CC_DIGIT;
14767                     op = POSIXA;
14768                 }
14769             }
14770             else if (prevvalue == 'A') {
14771                 if (value == 'Z'
14772 #ifdef EBCDIC
14773                     && literal_endpoint == 2
14774 #endif
14775                 ) {
14776                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14777                     op = POSIXA;
14778                 }
14779             }
14780             else if (prevvalue == 'a') {
14781                 if (value == 'z'
14782 #ifdef EBCDIC
14783                     && literal_endpoint == 2
14784 #endif
14785                 ) {
14786                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14787                     op = POSIXA;
14788                 }
14789             }
14790         }
14791
14792         /* Here, we have changed <op> away from its initial value iff we found
14793          * an optimization */
14794         if (op != END) {
14795
14796             /* Throw away this ANYOF regnode, and emit the calculated one,
14797              * which should correspond to the beginning, not current, state of
14798              * the parse */
14799             const char * cur_parse = RExC_parse;
14800             RExC_parse = (char *)orig_parse;
14801             if ( SIZE_ONLY) {
14802                 if (! LOC) {
14803
14804                     /* To get locale nodes to not use the full ANYOF size would
14805                      * require moving the code above that writes the portions
14806                      * of it that aren't in other nodes to after this point.
14807                      * e.g.  ANYOF_POSIXL_SET */
14808                     RExC_size = orig_size;
14809                 }
14810             }
14811             else {
14812                 RExC_emit = (regnode *)orig_emit;
14813                 if (PL_regkind[op] == POSIXD) {
14814                     if (op == POSIXL) {
14815                         RExC_contains_locale = 1;
14816                     }
14817                     if (invert) {
14818                         op += NPOSIXD - POSIXD;
14819                     }
14820                 }
14821             }
14822
14823             ret = reg_node(pRExC_state, op);
14824
14825             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14826                 if (! SIZE_ONLY) {
14827                     FLAGS(ret) = arg;
14828                 }
14829                 *flagp |= HASWIDTH|SIMPLE;
14830             }
14831             else if (PL_regkind[op] == EXACT) {
14832                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14833                                            TRUE /* downgradable to EXACT */
14834                                            );
14835             }
14836
14837             RExC_parse = (char *) cur_parse;
14838
14839             SvREFCNT_dec(posixes);
14840             SvREFCNT_dec(nposixes);
14841             SvREFCNT_dec(cp_list);
14842             SvREFCNT_dec(cp_foldable_list);
14843             return ret;
14844         }
14845     }
14846
14847     if (SIZE_ONLY)
14848         return ret;
14849     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14850
14851     /* If folding, we calculate all characters that could fold to or from the
14852      * ones already on the list */
14853     if (cp_foldable_list) {
14854         if (FOLD) {
14855             UV start, end;      /* End points of code point ranges */
14856
14857             SV* fold_intersection = NULL;
14858             SV** use_list;
14859
14860             /* Our calculated list will be for Unicode rules.  For locale
14861              * matching, we have to keep a separate list that is consulted at
14862              * runtime only when the locale indicates Unicode rules.  For
14863              * non-locale, we just use to the general list */
14864             if (LOC) {
14865                 use_list = &only_utf8_locale_list;
14866             }
14867             else {
14868                 use_list = &cp_list;
14869             }
14870
14871             /* Only the characters in this class that participate in folds need
14872              * be checked.  Get the intersection of this class and all the
14873              * possible characters that are foldable.  This can quickly narrow
14874              * down a large class */
14875             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14876                                   &fold_intersection);
14877
14878             /* The folds for all the Latin1 characters are hard-coded into this
14879              * program, but we have to go out to disk to get the others. */
14880             if (invlist_highest(cp_foldable_list) >= 256) {
14881
14882                 /* This is a hash that for a particular fold gives all
14883                  * characters that are involved in it */
14884                 if (! PL_utf8_foldclosures) {
14885                     _load_PL_utf8_foldclosures();
14886                 }
14887             }
14888
14889             /* Now look at the foldable characters in this class individually */
14890             invlist_iterinit(fold_intersection);
14891             while (invlist_iternext(fold_intersection, &start, &end)) {
14892                 UV j;
14893
14894                 /* Look at every character in the range */
14895                 for (j = start; j <= end; j++) {
14896                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14897                     STRLEN foldlen;
14898                     SV** listp;
14899
14900                     if (j < 256) {
14901
14902                         if (IS_IN_SOME_FOLD_L1(j)) {
14903
14904                             /* ASCII is always matched; non-ASCII is matched
14905                              * only under Unicode rules (which could happen
14906                              * under /l if the locale is a UTF-8 one */
14907                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14908                                 *use_list = add_cp_to_invlist(*use_list,
14909                                                             PL_fold_latin1[j]);
14910                             }
14911                             else {
14912                                 depends_list =
14913                                  add_cp_to_invlist(depends_list,
14914                                                    PL_fold_latin1[j]);
14915                             }
14916                         }
14917
14918                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14919                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14920                         {
14921                             add_above_Latin1_folds(pRExC_state,
14922                                                    (U8) j,
14923                                                    use_list);
14924                         }
14925                         continue;
14926                     }
14927
14928                     /* Here is an above Latin1 character.  We don't have the
14929                      * rules hard-coded for it.  First, get its fold.  This is
14930                      * the simple fold, as the multi-character folds have been
14931                      * handled earlier and separated out */
14932                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14933                                                         (ASCII_FOLD_RESTRICTED)
14934                                                         ? FOLD_FLAGS_NOMIX_ASCII
14935                                                         : 0);
14936
14937                     /* Single character fold of above Latin1.  Add everything in
14938                     * its fold closure to the list that this node should match.
14939                     * The fold closures data structure is a hash with the keys
14940                     * being the UTF-8 of every character that is folded to, like
14941                     * 'k', and the values each an array of all code points that
14942                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14943                     * Multi-character folds are not included */
14944                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14945                                         (char *) foldbuf, foldlen, FALSE)))
14946                     {
14947                         AV* list = (AV*) *listp;
14948                         IV k;
14949                         for (k = 0; k <= av_tindex(list); k++) {
14950                             SV** c_p = av_fetch(list, k, FALSE);
14951                             UV c;
14952                             assert(c_p);
14953
14954                             c = SvUV(*c_p);
14955
14956                             /* /aa doesn't allow folds between ASCII and non- */
14957                             if ((ASCII_FOLD_RESTRICTED
14958                                 && (isASCII(c) != isASCII(j))))
14959                             {
14960                                 continue;
14961                             }
14962
14963                             /* Folds under /l which cross the 255/256 boundary
14964                              * are added to a separate list.  (These are valid
14965                              * only when the locale is UTF-8.) */
14966                             if (c < 256 && LOC) {
14967                                 *use_list = add_cp_to_invlist(*use_list, c);
14968                                 continue;
14969                             }
14970
14971                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14972                             {
14973                                 cp_list = add_cp_to_invlist(cp_list, c);
14974                             }
14975                             else {
14976                                 /* Similarly folds involving non-ascii Latin1
14977                                 * characters under /d are added to their list */
14978                                 depends_list = add_cp_to_invlist(depends_list,
14979                                                                  c);
14980                             }
14981                         }
14982                     }
14983                 }
14984             }
14985             SvREFCNT_dec_NN(fold_intersection);
14986         }
14987
14988         /* Now that we have finished adding all the folds, there is no reason
14989          * to keep the foldable list separate */
14990         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14991         SvREFCNT_dec_NN(cp_foldable_list);
14992     }
14993
14994     /* And combine the result (if any) with any inversion list from posix
14995      * classes.  The lists are kept separate up to now because we don't want to
14996      * fold the classes (folding of those is automatically handled by the swash
14997      * fetching code) */
14998     if (posixes || nposixes) {
14999         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15000             /* Under /a and /aa, nothing above ASCII matches these */
15001             _invlist_intersection(posixes,
15002                                   PL_XPosix_ptrs[_CC_ASCII],
15003                                   &posixes);
15004         }
15005         if (nposixes) {
15006             if (DEPENDS_SEMANTICS) {
15007                 /* Under /d, everything in the upper half of the Latin1 range
15008                  * matches these complements */
15009                 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15010             }
15011             else if (AT_LEAST_ASCII_RESTRICTED) {
15012                 /* Under /a and /aa, everything above ASCII matches these
15013                  * complements */
15014                 _invlist_union_complement_2nd(nposixes,
15015                                               PL_XPosix_ptrs[_CC_ASCII],
15016                                               &nposixes);
15017             }
15018             if (posixes) {
15019                 _invlist_union(posixes, nposixes, &posixes);
15020                 SvREFCNT_dec_NN(nposixes);
15021             }
15022             else {
15023                 posixes = nposixes;
15024             }
15025         }
15026         if (! DEPENDS_SEMANTICS) {
15027             if (cp_list) {
15028                 _invlist_union(cp_list, posixes, &cp_list);
15029                 SvREFCNT_dec_NN(posixes);
15030             }
15031             else {
15032                 cp_list = posixes;
15033             }
15034         }
15035         else {
15036             /* Under /d, we put into a separate list the Latin1 things that
15037              * match only when the target string is utf8 */
15038             SV* nonascii_but_latin1_properties = NULL;
15039             _invlist_intersection(posixes, PL_UpperLatin1,
15040                                   &nonascii_but_latin1_properties);
15041             _invlist_subtract(posixes, nonascii_but_latin1_properties,
15042                               &posixes);
15043             if (cp_list) {
15044                 _invlist_union(cp_list, posixes, &cp_list);
15045                 SvREFCNT_dec_NN(posixes);
15046             }
15047             else {
15048                 cp_list = posixes;
15049             }
15050
15051             if (depends_list) {
15052                 _invlist_union(depends_list, nonascii_but_latin1_properties,
15053                                &depends_list);
15054                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15055             }
15056             else {
15057                 depends_list = nonascii_but_latin1_properties;
15058             }
15059         }
15060     }
15061
15062     /* And combine the result (if any) with any inversion list from properties.
15063      * The lists are kept separate up to now so that we can distinguish the two
15064      * in regards to matching above-Unicode.  A run-time warning is generated
15065      * if a Unicode property is matched against a non-Unicode code point. But,
15066      * we allow user-defined properties to match anything, without any warning,
15067      * and we also suppress the warning if there is a portion of the character
15068      * class that isn't a Unicode property, and which matches above Unicode, \W
15069      * or [\x{110000}] for example.
15070      * (Note that in this case, unlike the Posix one above, there is no
15071      * <depends_list>, because having a Unicode property forces Unicode
15072      * semantics */
15073     if (properties) {
15074         if (cp_list) {
15075
15076             /* If it matters to the final outcome, see if a non-property
15077              * component of the class matches above Unicode.  If so, the
15078              * warning gets suppressed.  This is true even if just a single
15079              * such code point is specified, as though not strictly correct if
15080              * another such code point is matched against, the fact that they
15081              * are using above-Unicode code points indicates they should know
15082              * the issues involved */
15083             if (warn_super) {
15084                 warn_super = ! (invert
15085                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15086             }
15087
15088             _invlist_union(properties, cp_list, &cp_list);
15089             SvREFCNT_dec_NN(properties);
15090         }
15091         else {
15092             cp_list = properties;
15093         }
15094
15095         if (warn_super) {
15096             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15097         }
15098     }
15099
15100     /* Here, we have calculated what code points should be in the character
15101      * class.
15102      *
15103      * Now we can see about various optimizations.  Fold calculation (which we
15104      * did above) needs to take place before inversion.  Otherwise /[^k]/i
15105      * would invert to include K, which under /i would match k, which it
15106      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
15107      * folded until runtime */
15108
15109     /* If we didn't do folding, it's because some information isn't available
15110      * until runtime; set the run-time fold flag for these.  (We don't have to
15111      * worry about properties folding, as that is taken care of by the swash
15112      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
15113      * locales, or the class matches at least one 0-255 range code point */
15114     if (LOC && FOLD) {
15115         if (only_utf8_locale_list) {
15116             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15117         }
15118         else if (cp_list) { /* Look to see if there a 0-255 code point is in
15119                                the list */
15120             UV start, end;
15121             invlist_iterinit(cp_list);
15122             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15123                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15124             }
15125             invlist_iterfinish(cp_list);
15126         }
15127     }
15128
15129     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15130      * at compile time.  Besides not inverting folded locale now, we can't
15131      * invert if there are things such as \w, which aren't known until runtime
15132      * */
15133     if (cp_list
15134         && invert
15135         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15136         && ! depends_list
15137         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15138     {
15139         _invlist_invert(cp_list);
15140
15141         /* Any swash can't be used as-is, because we've inverted things */
15142         if (swash) {
15143             SvREFCNT_dec_NN(swash);
15144             swash = NULL;
15145         }
15146
15147         /* Clear the invert flag since have just done it here */
15148         invert = FALSE;
15149     }
15150
15151     if (ret_invlist) {
15152         *ret_invlist = cp_list;
15153         SvREFCNT_dec(swash);
15154
15155         /* Discard the generated node */
15156         if (SIZE_ONLY) {
15157             RExC_size = orig_size;
15158         }
15159         else {
15160             RExC_emit = orig_emit;
15161         }
15162         return orig_emit;
15163     }
15164
15165     /* Some character classes are equivalent to other nodes.  Such nodes take
15166      * up less room and generally fewer operations to execute than ANYOF nodes.
15167      * Above, we checked for and optimized into some such equivalents for
15168      * certain common classes that are easy to test.  Getting to this point in
15169      * the code means that the class didn't get optimized there.  Since this
15170      * code is only executed in Pass 2, it is too late to save space--it has
15171      * been allocated in Pass 1, and currently isn't given back.  But turning
15172      * things into an EXACTish node can allow the optimizer to join it to any
15173      * adjacent such nodes.  And if the class is equivalent to things like /./,
15174      * expensive run-time swashes can be avoided.  Now that we have more
15175      * complete information, we can find things necessarily missed by the
15176      * earlier code.  I (khw) am not sure how much to look for here.  It would
15177      * be easy, but perhaps too slow, to check any candidates against all the
15178      * node types they could possibly match using _invlistEQ(). */
15179
15180     if (cp_list
15181         && ! invert
15182         && ! depends_list
15183         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15184         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15185
15186            /* We don't optimize if we are supposed to make sure all non-Unicode
15187             * code points raise a warning, as only ANYOF nodes have this check.
15188             * */
15189         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15190     {
15191         UV start, end;
15192         U8 op = END;  /* The optimzation node-type */
15193         const char * cur_parse= RExC_parse;
15194
15195         invlist_iterinit(cp_list);
15196         if (! invlist_iternext(cp_list, &start, &end)) {
15197
15198             /* Here, the list is empty.  This happens, for example, when a
15199              * Unicode property is the only thing in the character class, and
15200              * it doesn't match anything.  (perluniprops.pod notes such
15201              * properties) */
15202             op = OPFAIL;
15203             *flagp |= HASWIDTH|SIMPLE;
15204         }
15205         else if (start == end) {    /* The range is a single code point */
15206             if (! invlist_iternext(cp_list, &start, &end)
15207
15208                     /* Don't do this optimization if it would require changing
15209                      * the pattern to UTF-8 */
15210                 && (start < 256 || UTF))
15211             {
15212                 /* Here, the list contains a single code point.  Can optimize
15213                  * into an EXACTish node */
15214
15215                 value = start;
15216
15217                 if (! FOLD) {
15218                     op = EXACT;
15219                 }
15220                 else if (LOC) {
15221
15222                     /* A locale node under folding with one code point can be
15223                      * an EXACTFL, as its fold won't be calculated until
15224                      * runtime */
15225                     op = EXACTFL;
15226                 }
15227                 else {
15228
15229                     /* Here, we are generally folding, but there is only one
15230                      * code point to match.  If we have to, we use an EXACT
15231                      * node, but it would be better for joining with adjacent
15232                      * nodes in the optimization pass if we used the same
15233                      * EXACTFish node that any such are likely to be.  We can
15234                      * do this iff the code point doesn't participate in any
15235                      * folds.  For example, an EXACTF of a colon is the same as
15236                      * an EXACT one, since nothing folds to or from a colon. */
15237                     if (value < 256) {
15238                         if (IS_IN_SOME_FOLD_L1(value)) {
15239                             op = EXACT;
15240                         }
15241                     }
15242                     else {
15243                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15244                             op = EXACT;
15245                         }
15246                     }
15247
15248                     /* If we haven't found the node type, above, it means we
15249                      * can use the prevailing one */
15250                     if (op == END) {
15251                         op = compute_EXACTish(pRExC_state);
15252                     }
15253                 }
15254             }
15255         }
15256         else if (start == 0) {
15257             if (end == UV_MAX) {
15258                 op = SANY;
15259                 *flagp |= HASWIDTH|SIMPLE;
15260                 RExC_naughty++;
15261             }
15262             else if (end == '\n' - 1
15263                     && invlist_iternext(cp_list, &start, &end)
15264                     && start == '\n' + 1 && end == UV_MAX)
15265             {
15266                 op = REG_ANY;
15267                 *flagp |= HASWIDTH|SIMPLE;
15268                 RExC_naughty++;
15269             }
15270         }
15271         invlist_iterfinish(cp_list);
15272
15273         if (op != END) {
15274             RExC_parse = (char *)orig_parse;
15275             RExC_emit = (regnode *)orig_emit;
15276
15277             ret = reg_node(pRExC_state, op);
15278
15279             RExC_parse = (char *)cur_parse;
15280
15281             if (PL_regkind[op] == EXACT) {
15282                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15283                                            TRUE /* downgradable to EXACT */
15284                                           );
15285             }
15286
15287             SvREFCNT_dec_NN(cp_list);
15288             return ret;
15289         }
15290     }
15291
15292     /* Here, <cp_list> contains all the code points we can determine at
15293      * compile time that match under all conditions.  Go through it, and
15294      * for things that belong in the bitmap, put them there, and delete from
15295      * <cp_list>.  While we are at it, see if everything above 255 is in the
15296      * list, and if so, set a flag to speed up execution */
15297
15298     populate_ANYOF_from_invlist(ret, &cp_list);
15299
15300     if (invert) {
15301         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15302     }
15303
15304     /* Here, the bitmap has been populated with all the Latin1 code points that
15305      * always match.  Can now add to the overall list those that match only
15306      * when the target string is UTF-8 (<depends_list>). */
15307     if (depends_list) {
15308         if (cp_list) {
15309             _invlist_union(cp_list, depends_list, &cp_list);
15310             SvREFCNT_dec_NN(depends_list);
15311         }
15312         else {
15313             cp_list = depends_list;
15314         }
15315         ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15316     }
15317
15318     /* If there is a swash and more than one element, we can't use the swash in
15319      * the optimization below. */
15320     if (swash && element_count > 1) {
15321         SvREFCNT_dec_NN(swash);
15322         swash = NULL;
15323     }
15324
15325     /* Note that the optimization of using 'swash' if it is the only thing in
15326      * the class doesn't have us change swash at all, so it can include things
15327      * that are also in the bitmap; otherwise we have purposely deleted that
15328      * duplicate information */
15329     set_ANYOF_arg(pRExC_state, ret, cp_list,
15330                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15331                    ? listsv : NULL,
15332                   only_utf8_locale_list,
15333                   swash, has_user_defined_property);
15334
15335     *flagp |= HASWIDTH|SIMPLE;
15336
15337     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15338         RExC_contains_locale = 1;
15339     }
15340
15341     return ret;
15342 }
15343
15344 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15345
15346 STATIC void
15347 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15348                 regnode* const node,
15349                 SV* const cp_list,
15350                 SV* const runtime_defns,
15351                 SV* const only_utf8_locale_list,
15352                 SV* const swash,
15353                 const bool has_user_defined_property)
15354 {
15355     /* Sets the arg field of an ANYOF-type node 'node', using information about
15356      * the node passed-in.  If there is nothing outside the node's bitmap, the
15357      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
15358      * the count returned by add_data(), having allocated and stored an array,
15359      * av, that that count references, as follows:
15360      *  av[0] stores the character class description in its textual form.
15361      *        This is used later (regexec.c:Perl_regclass_swash()) to
15362      *        initialize the appropriate swash, and is also useful for dumping
15363      *        the regnode.  This is set to &PL_sv_undef if the textual
15364      *        description is not needed at run-time (as happens if the other
15365      *        elements completely define the class)
15366      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15367      *        computed from av[0].  But if no further computation need be done,
15368      *        the swash is stored here now (and av[0] is &PL_sv_undef).
15369      *  av[2] stores the inversion list of code points that match only if the
15370      *        current locale is UTF-8
15371      *  av[3] stores the cp_list inversion list for use in addition or instead
15372      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15373      *        (Otherwise everything needed is already in av[0] and av[1])
15374      *  av[4] is set if any component of the class is from a user-defined
15375      *        property; used only if av[3] exists */
15376
15377     UV n;
15378
15379     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15380
15381     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15382         assert(! (ANYOF_FLAGS(node)
15383                   & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15384                      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15385         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15386     }
15387     else {
15388         AV * const av = newAV();
15389         SV *rv;
15390
15391         assert(ANYOF_FLAGS(node)
15392                & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15393                   |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15394
15395         av_store(av, 0, (runtime_defns)
15396                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15397         if (swash) {
15398             assert(cp_list);
15399             av_store(av, 1, swash);
15400             SvREFCNT_dec_NN(cp_list);
15401         }
15402         else {
15403             av_store(av, 1, &PL_sv_undef);
15404             if (cp_list) {
15405                 av_store(av, 3, cp_list);
15406                 av_store(av, 4, newSVuv(has_user_defined_property));
15407             }
15408         }
15409
15410         if (only_utf8_locale_list) {
15411             av_store(av, 2, only_utf8_locale_list);
15412         }
15413         else {
15414             av_store(av, 2, &PL_sv_undef);
15415         }
15416
15417         rv = newRV_noinc(MUTABLE_SV(av));
15418         n = add_data(pRExC_state, STR_WITH_LEN("s"));
15419         RExC_rxi->data->data[n] = (void*)rv;
15420         ARG_SET(node, n);
15421     }
15422 }
15423
15424 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15425 SV *
15426 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15427                                         const regnode* node,
15428                                         bool doinit,
15429                                         SV** listsvp,
15430                                         SV** only_utf8_locale_ptr,
15431                                         SV*  exclude_list)
15432
15433 {
15434     /* For internal core use only.
15435      * Returns the swash for the input 'node' in the regex 'prog'.
15436      * If <doinit> is 'true', will attempt to create the swash if not already
15437      *    done.
15438      * If <listsvp> is non-null, will return the printable contents of the
15439      *    swash.  This can be used to get debugging information even before the
15440      *    swash exists, by calling this function with 'doinit' set to false, in
15441      *    which case the components that will be used to eventually create the
15442      *    swash are returned  (in a printable form).
15443      * If <exclude_list> is not NULL, it is an inversion list of things to
15444      *    exclude from what's returned in <listsvp>.
15445      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
15446      * that, in spite of this function's name, the swash it returns may include
15447      * the bitmap data as well */
15448
15449     SV *sw  = NULL;
15450     SV *si  = NULL;         /* Input swash initialization string */
15451     SV*  invlist = NULL;
15452
15453     RXi_GET_DECL(prog,progi);
15454     const struct reg_data * const data = prog ? progi->data : NULL;
15455
15456     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15457
15458     assert(ANYOF_FLAGS(node)
15459         & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15460            |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15461
15462     if (data && data->count) {
15463         const U32 n = ARG(node);
15464
15465         if (data->what[n] == 's') {
15466             SV * const rv = MUTABLE_SV(data->data[n]);
15467             AV * const av = MUTABLE_AV(SvRV(rv));
15468             SV **const ary = AvARRAY(av);
15469             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15470
15471             si = *ary;  /* ary[0] = the string to initialize the swash with */
15472
15473             /* Elements 3 and 4 are either both present or both absent. [3] is
15474              * any inversion list generated at compile time; [4] indicates if
15475              * that inversion list has any user-defined properties in it. */
15476             if (av_tindex(av) >= 2) {
15477                 if (only_utf8_locale_ptr
15478                     && ary[2]
15479                     && ary[2] != &PL_sv_undef)
15480                 {
15481                     *only_utf8_locale_ptr = ary[2];
15482                 }
15483                 else {
15484                     assert(only_utf8_locale_ptr);
15485                     *only_utf8_locale_ptr = NULL;
15486                 }
15487
15488                 if (av_tindex(av) >= 3) {
15489                     invlist = ary[3];
15490                     if (SvUV(ary[4])) {
15491                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15492                     }
15493                 }
15494                 else {
15495                     invlist = NULL;
15496                 }
15497             }
15498
15499             /* Element [1] is reserved for the set-up swash.  If already there,
15500              * return it; if not, create it and store it there */
15501             if (ary[1] && SvROK(ary[1])) {
15502                 sw = ary[1];
15503             }
15504             else if (doinit && ((si && si != &PL_sv_undef)
15505                                  || (invlist && invlist != &PL_sv_undef))) {
15506                 assert(si);
15507                 sw = _core_swash_init("utf8", /* the utf8 package */
15508                                       "", /* nameless */
15509                                       si,
15510                                       1, /* binary */
15511                                       0, /* not from tr/// */
15512                                       invlist,
15513                                       &swash_init_flags);
15514                 (void)av_store(av, 1, sw);
15515             }
15516         }
15517     }
15518
15519     /* If requested, return a printable version of what this swash matches */
15520     if (listsvp) {
15521         SV* matches_string = newSVpvs("");
15522
15523         /* The swash should be used, if possible, to get the data, as it
15524          * contains the resolved data.  But this function can be called at
15525          * compile-time, before everything gets resolved, in which case we
15526          * return the currently best available information, which is the string
15527          * that will eventually be used to do that resolving, 'si' */
15528         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15529             && (si && si != &PL_sv_undef))
15530         {
15531             sv_catsv(matches_string, si);
15532         }
15533
15534         /* Add the inversion list to whatever we have.  This may have come from
15535          * the swash, or from an input parameter */
15536         if (invlist) {
15537             if (exclude_list) {
15538                 SV* clone = invlist_clone(invlist);
15539                 _invlist_subtract(clone, exclude_list, &clone);
15540                 sv_catsv(matches_string, _invlist_contents(clone));
15541                 SvREFCNT_dec_NN(clone);
15542             }
15543             else {
15544                 sv_catsv(matches_string, _invlist_contents(invlist));
15545             }
15546         }
15547         *listsvp = matches_string;
15548     }
15549
15550     return sw;
15551 }
15552 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15553
15554 /* reg_skipcomment()
15555
15556    Absorbs an /x style # comment from the input stream,
15557    returning a pointer to the first character beyond the comment, or if the
15558    comment terminates the pattern without anything following it, this returns
15559    one past the final character of the pattern (in other words, RExC_end) and
15560    sets the REG_RUN_ON_COMMENT_SEEN flag.
15561
15562    Note it's the callers responsibility to ensure that we are
15563    actually in /x mode
15564
15565 */
15566
15567 PERL_STATIC_INLINE char*
15568 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15569 {
15570     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15571
15572     assert(*p == '#');
15573
15574     while (p < RExC_end) {
15575         if (*(++p) == '\n') {
15576             return p+1;
15577         }
15578     }
15579
15580     /* we ran off the end of the pattern without ending the comment, so we have
15581      * to add an \n when wrapping */
15582     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15583     return p;
15584 }
15585
15586 /* nextchar()
15587
15588    Advances the parse position, and optionally absorbs
15589    "whitespace" from the inputstream.
15590
15591    Without /x "whitespace" means (?#...) style comments only,
15592    with /x this means (?#...) and # comments and whitespace proper.
15593
15594    Returns the RExC_parse point from BEFORE the scan occurs.
15595
15596    This is the /x friendly way of saying RExC_parse++.
15597 */
15598
15599 STATIC char*
15600 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15601 {
15602     char* const retval = RExC_parse++;
15603
15604     PERL_ARGS_ASSERT_NEXTCHAR;
15605
15606     for (;;) {
15607         if (RExC_end - RExC_parse >= 3
15608             && *RExC_parse == '('
15609             && RExC_parse[1] == '?'
15610             && RExC_parse[2] == '#')
15611         {
15612             while (*RExC_parse != ')') {
15613                 if (RExC_parse == RExC_end)
15614                     FAIL("Sequence (?#... not terminated");
15615                 RExC_parse++;
15616             }
15617             RExC_parse++;
15618             continue;
15619         }
15620         if (RExC_flags & RXf_PMf_EXTENDED) {
15621             char * p = regpatws(pRExC_state, RExC_parse,
15622                                           TRUE); /* means recognize comments */
15623             if (p != RExC_parse) {
15624                 RExC_parse = p;
15625                 continue;
15626             }
15627         }
15628         return retval;
15629     }
15630 }
15631
15632 STATIC regnode *
15633 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15634 {
15635     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15636      * space.  In pass1, it aligns and increments RExC_size; in pass2,
15637      * RExC_emit */
15638
15639     regnode * const ret = RExC_emit;
15640     GET_RE_DEBUG_FLAGS_DECL;
15641
15642     PERL_ARGS_ASSERT_REGNODE_GUTS;
15643
15644     assert(extra_size >= regarglen[op]);
15645
15646     if (SIZE_ONLY) {
15647         SIZE_ALIGN(RExC_size);
15648         RExC_size += 1 + extra_size;
15649         return(ret);
15650     }
15651     if (RExC_emit >= RExC_emit_bound)
15652         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15653                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15654
15655     NODE_ALIGN_FILL(ret);
15656 #ifndef RE_TRACK_PATTERN_OFFSETS
15657     PERL_UNUSED_ARG(name);
15658 #else
15659     if (RExC_offsets) {         /* MJD */
15660         MJD_OFFSET_DEBUG(
15661               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15662               name, __LINE__,
15663               PL_reg_name[op],
15664               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15665                 ? "Overwriting end of array!\n" : "OK",
15666               (UV)(RExC_emit - RExC_emit_start),
15667               (UV)(RExC_parse - RExC_start),
15668               (UV)RExC_offsets[0]));
15669         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15670     }
15671 #endif
15672     return(ret);
15673 }
15674
15675 /*
15676 - reg_node - emit a node
15677 */
15678 STATIC regnode *                        /* Location. */
15679 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15680 {
15681     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15682
15683     PERL_ARGS_ASSERT_REG_NODE;
15684
15685     assert(regarglen[op] == 0);
15686
15687     if (PASS2) {
15688         regnode *ptr = ret;
15689         FILL_ADVANCE_NODE(ptr, op);
15690         RExC_emit = ptr;
15691     }
15692     return(ret);
15693 }
15694
15695 /*
15696 - reganode - emit a node with an argument
15697 */
15698 STATIC regnode *                        /* Location. */
15699 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15700 {
15701     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15702
15703     PERL_ARGS_ASSERT_REGANODE;
15704
15705     assert(regarglen[op] == 1);
15706
15707     if (PASS2) {
15708         regnode *ptr = ret;
15709         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15710         RExC_emit = ptr;
15711     }
15712     return(ret);
15713 }
15714
15715 STATIC regnode *
15716 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15717 {
15718     /* emit a node with U32 and I32 arguments */
15719
15720     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15721
15722     PERL_ARGS_ASSERT_REG2LANODE;
15723
15724     assert(regarglen[op] == 2);
15725
15726     if (PASS2) {
15727         regnode *ptr = ret;
15728         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15729         RExC_emit = ptr;
15730     }
15731     return(ret);
15732 }
15733
15734 /*
15735 - reguni - emit (if appropriate) a Unicode character
15736 */
15737 PERL_STATIC_INLINE STRLEN
15738 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15739 {
15740     PERL_ARGS_ASSERT_REGUNI;
15741
15742     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15743 }
15744
15745 /*
15746 - reginsert - insert an operator in front of already-emitted operand
15747 *
15748 * Means relocating the operand.
15749 */
15750 STATIC void
15751 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15752 {
15753     regnode *src;
15754     regnode *dst;
15755     regnode *place;
15756     const int offset = regarglen[(U8)op];
15757     const int size = NODE_STEP_REGNODE + offset;
15758     GET_RE_DEBUG_FLAGS_DECL;
15759
15760     PERL_ARGS_ASSERT_REGINSERT;
15761     PERL_UNUSED_CONTEXT;
15762     PERL_UNUSED_ARG(depth);
15763 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15764     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15765     if (SIZE_ONLY) {
15766         RExC_size += size;
15767         return;
15768     }
15769
15770     src = RExC_emit;
15771     RExC_emit += size;
15772     dst = RExC_emit;
15773     if (RExC_open_parens) {
15774         int paren;
15775         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15776         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15777             if ( RExC_open_parens[paren] >= opnd ) {
15778                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15779                 RExC_open_parens[paren] += size;
15780             } else {
15781                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15782             }
15783             if ( RExC_close_parens[paren] >= opnd ) {
15784                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15785                 RExC_close_parens[paren] += size;
15786             } else {
15787                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15788             }
15789         }
15790     }
15791
15792     while (src > opnd) {
15793         StructCopy(--src, --dst, regnode);
15794 #ifdef RE_TRACK_PATTERN_OFFSETS
15795         if (RExC_offsets) {     /* MJD 20010112 */
15796             MJD_OFFSET_DEBUG(
15797                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15798                   "reg_insert",
15799                   __LINE__,
15800                   PL_reg_name[op],
15801                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15802                     ? "Overwriting end of array!\n" : "OK",
15803                   (UV)(src - RExC_emit_start),
15804                   (UV)(dst - RExC_emit_start),
15805                   (UV)RExC_offsets[0]));
15806             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15807             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15808         }
15809 #endif
15810     }
15811
15812
15813     place = opnd;               /* Op node, where operand used to be. */
15814 #ifdef RE_TRACK_PATTERN_OFFSETS
15815     if (RExC_offsets) {         /* MJD */
15816         MJD_OFFSET_DEBUG(
15817               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15818               "reginsert",
15819               __LINE__,
15820               PL_reg_name[op],
15821               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15822               ? "Overwriting end of array!\n" : "OK",
15823               (UV)(place - RExC_emit_start),
15824               (UV)(RExC_parse - RExC_start),
15825               (UV)RExC_offsets[0]));
15826         Set_Node_Offset(place, RExC_parse);
15827         Set_Node_Length(place, 1);
15828     }
15829 #endif
15830     src = NEXTOPER(place);
15831     FILL_ADVANCE_NODE(place, op);
15832     Zero(src, offset, regnode);
15833 }
15834
15835 /*
15836 - regtail - set the next-pointer at the end of a node chain of p to val.
15837 - SEE ALSO: regtail_study
15838 */
15839 /* TODO: All three parms should be const */
15840 STATIC void
15841 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15842                 const regnode *val,U32 depth)
15843 {
15844     regnode *scan;
15845     GET_RE_DEBUG_FLAGS_DECL;
15846
15847     PERL_ARGS_ASSERT_REGTAIL;
15848 #ifndef DEBUGGING
15849     PERL_UNUSED_ARG(depth);
15850 #endif
15851
15852     if (SIZE_ONLY)
15853         return;
15854
15855     /* Find last node. */
15856     scan = p;
15857     for (;;) {
15858         regnode * const temp = regnext(scan);
15859         DEBUG_PARSE_r({
15860             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15861             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15862             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15863                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15864                     (temp == NULL ? "->" : ""),
15865                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15866             );
15867         });
15868         if (temp == NULL)
15869             break;
15870         scan = temp;
15871     }
15872
15873     if (reg_off_by_arg[OP(scan)]) {
15874         ARG_SET(scan, val - scan);
15875     }
15876     else {
15877         NEXT_OFF(scan) = val - scan;
15878     }
15879 }
15880
15881 #ifdef DEBUGGING
15882 /*
15883 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15884 - Look for optimizable sequences at the same time.
15885 - currently only looks for EXACT chains.
15886
15887 This is experimental code. The idea is to use this routine to perform
15888 in place optimizations on branches and groups as they are constructed,
15889 with the long term intention of removing optimization from study_chunk so
15890 that it is purely analytical.
15891
15892 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15893 to control which is which.
15894
15895 */
15896 /* TODO: All four parms should be const */
15897
15898 STATIC U8
15899 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15900                       const regnode *val,U32 depth)
15901 {
15902     regnode *scan;
15903     U8 exact = PSEUDO;
15904 #ifdef EXPERIMENTAL_INPLACESCAN
15905     I32 min = 0;
15906 #endif
15907     GET_RE_DEBUG_FLAGS_DECL;
15908
15909     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15910
15911
15912     if (SIZE_ONLY)
15913         return exact;
15914
15915     /* Find last node. */
15916
15917     scan = p;
15918     for (;;) {
15919         regnode * const temp = regnext(scan);
15920 #ifdef EXPERIMENTAL_INPLACESCAN
15921         if (PL_regkind[OP(scan)] == EXACT) {
15922             bool unfolded_multi_char;   /* Unexamined in this routine */
15923             if (join_exact(pRExC_state, scan, &min,
15924                            &unfolded_multi_char, 1, val, depth+1))
15925                 return EXACT;
15926         }
15927 #endif
15928         if ( exact ) {
15929             switch (OP(scan)) {
15930                 case EXACT:
15931                 case EXACTF:
15932                 case EXACTFA_NO_TRIE:
15933                 case EXACTFA:
15934                 case EXACTFU:
15935                 case EXACTFU_SS:
15936                 case EXACTFL:
15937                         if( exact == PSEUDO )
15938                             exact= OP(scan);
15939                         else if ( exact != OP(scan) )
15940                             exact= 0;
15941                 case NOTHING:
15942                     break;
15943                 default:
15944                     exact= 0;
15945             }
15946         }
15947         DEBUG_PARSE_r({
15948             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15949             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15950             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15951                 SvPV_nolen_const(RExC_mysv),
15952                 REG_NODE_NUM(scan),
15953                 PL_reg_name[exact]);
15954         });
15955         if (temp == NULL)
15956             break;
15957         scan = temp;
15958     }
15959     DEBUG_PARSE_r({
15960         DEBUG_PARSE_MSG("");
15961         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
15962         PerlIO_printf(Perl_debug_log,
15963                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15964                       SvPV_nolen_const(RExC_mysv),
15965                       (IV)REG_NODE_NUM(val),
15966                       (IV)(val - scan)
15967         );
15968     });
15969     if (reg_off_by_arg[OP(scan)]) {
15970         ARG_SET(scan, val - scan);
15971     }
15972     else {
15973         NEXT_OFF(scan) = val - scan;
15974     }
15975
15976     return exact;
15977 }
15978 #endif
15979
15980 /*
15981  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15982  */
15983 #ifdef DEBUGGING
15984
15985 static void
15986 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15987 {
15988     int bit;
15989     int set=0;
15990
15991     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15992
15993     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15994         if (flags & (1<<bit)) {
15995             if (!set++ && lead)
15996                 PerlIO_printf(Perl_debug_log, "%s",lead);
15997             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15998         }
15999     }
16000     if (lead)  {
16001         if (set)
16002             PerlIO_printf(Perl_debug_log, "\n");
16003         else
16004             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16005     }
16006 }
16007
16008 static void
16009 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16010 {
16011     int bit;
16012     int set=0;
16013     regex_charset cs;
16014
16015     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16016
16017     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16018         if (flags & (1<<bit)) {
16019             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
16020                 continue;
16021             }
16022             if (!set++ && lead)
16023                 PerlIO_printf(Perl_debug_log, "%s",lead);
16024             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16025         }
16026     }
16027     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16028             if (!set++ && lead) {
16029                 PerlIO_printf(Perl_debug_log, "%s",lead);
16030             }
16031             switch (cs) {
16032                 case REGEX_UNICODE_CHARSET:
16033                     PerlIO_printf(Perl_debug_log, "UNICODE");
16034                     break;
16035                 case REGEX_LOCALE_CHARSET:
16036                     PerlIO_printf(Perl_debug_log, "LOCALE");
16037                     break;
16038                 case REGEX_ASCII_RESTRICTED_CHARSET:
16039                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16040                     break;
16041                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16042                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16043                     break;
16044                 default:
16045                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16046                     break;
16047             }
16048     }
16049     if (lead)  {
16050         if (set)
16051             PerlIO_printf(Perl_debug_log, "\n");
16052         else
16053             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16054     }
16055 }
16056 #endif
16057
16058 void
16059 Perl_regdump(pTHX_ const regexp *r)
16060 {
16061 #ifdef DEBUGGING
16062     SV * const sv = sv_newmortal();
16063     SV *dsv= sv_newmortal();
16064     RXi_GET_DECL(r,ri);
16065     GET_RE_DEBUG_FLAGS_DECL;
16066
16067     PERL_ARGS_ASSERT_REGDUMP;
16068
16069     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16070
16071     /* Header fields of interest. */
16072     if (r->anchored_substr) {
16073         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16074             RE_SV_DUMPLEN(r->anchored_substr), 30);
16075         PerlIO_printf(Perl_debug_log,
16076                       "anchored %s%s at %"IVdf" ",
16077                       s, RE_SV_TAIL(r->anchored_substr),
16078                       (IV)r->anchored_offset);
16079     } else if (r->anchored_utf8) {
16080         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16081             RE_SV_DUMPLEN(r->anchored_utf8), 30);
16082         PerlIO_printf(Perl_debug_log,
16083                       "anchored utf8 %s%s at %"IVdf" ",
16084                       s, RE_SV_TAIL(r->anchored_utf8),
16085                       (IV)r->anchored_offset);
16086     }
16087     if (r->float_substr) {
16088         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16089             RE_SV_DUMPLEN(r->float_substr), 30);
16090         PerlIO_printf(Perl_debug_log,
16091                       "floating %s%s at %"IVdf"..%"UVuf" ",
16092                       s, RE_SV_TAIL(r->float_substr),
16093                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16094     } else if (r->float_utf8) {
16095         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16096             RE_SV_DUMPLEN(r->float_utf8), 30);
16097         PerlIO_printf(Perl_debug_log,
16098                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16099                       s, RE_SV_TAIL(r->float_utf8),
16100                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16101     }
16102     if (r->check_substr || r->check_utf8)
16103         PerlIO_printf(Perl_debug_log,
16104                       (const char *)
16105                       (r->check_substr == r->float_substr
16106                        && r->check_utf8 == r->float_utf8
16107                        ? "(checking floating" : "(checking anchored"));
16108     if (r->intflags & PREGf_NOSCAN)
16109         PerlIO_printf(Perl_debug_log, " noscan");
16110     if (r->extflags & RXf_CHECK_ALL)
16111         PerlIO_printf(Perl_debug_log, " isall");
16112     if (r->check_substr || r->check_utf8)
16113         PerlIO_printf(Perl_debug_log, ") ");
16114
16115     if (ri->regstclass) {
16116         regprop(r, sv, ri->regstclass, NULL, NULL);
16117         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16118     }
16119     if (r->intflags & PREGf_ANCH) {
16120         PerlIO_printf(Perl_debug_log, "anchored");
16121         if (r->intflags & PREGf_ANCH_MBOL)
16122             PerlIO_printf(Perl_debug_log, "(MBOL)");
16123         if (r->intflags & PREGf_ANCH_SBOL)
16124             PerlIO_printf(Perl_debug_log, "(SBOL)");
16125         if (r->intflags & PREGf_ANCH_GPOS)
16126             PerlIO_printf(Perl_debug_log, "(GPOS)");
16127         PerlIO_putc(Perl_debug_log, ' ');
16128     }
16129     if (r->intflags & PREGf_GPOS_SEEN)
16130         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16131     if (r->intflags & PREGf_SKIP)
16132         PerlIO_printf(Perl_debug_log, "plus ");
16133     if (r->intflags & PREGf_IMPLICIT)
16134         PerlIO_printf(Perl_debug_log, "implicit ");
16135     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16136     if (r->extflags & RXf_EVAL_SEEN)
16137         PerlIO_printf(Perl_debug_log, "with eval ");
16138     PerlIO_printf(Perl_debug_log, "\n");
16139     DEBUG_FLAGS_r({
16140         regdump_extflags("r->extflags: ",r->extflags);
16141         regdump_intflags("r->intflags: ",r->intflags);
16142     });
16143 #else
16144     PERL_ARGS_ASSERT_REGDUMP;
16145     PERL_UNUSED_CONTEXT;
16146     PERL_UNUSED_ARG(r);
16147 #endif  /* DEBUGGING */
16148 }
16149
16150 /*
16151 - regprop - printable representation of opcode, with run time support
16152 */
16153
16154 void
16155 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16156 {
16157 #ifdef DEBUGGING
16158     int k;
16159
16160     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16161     static const char * const anyofs[] = {
16162 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16163     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
16164     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
16165     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
16166     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
16167     || _CC_VERTSPACE != 16
16168   #error Need to adjust order of anyofs[]
16169 #endif
16170         "\\w",
16171         "\\W",
16172         "\\d",
16173         "\\D",
16174         "[:alpha:]",
16175         "[:^alpha:]",
16176         "[:lower:]",
16177         "[:^lower:]",
16178         "[:upper:]",
16179         "[:^upper:]",
16180         "[:punct:]",
16181         "[:^punct:]",
16182         "[:print:]",
16183         "[:^print:]",
16184         "[:alnum:]",
16185         "[:^alnum:]",
16186         "[:graph:]",
16187         "[:^graph:]",
16188         "[:cased:]",
16189         "[:^cased:]",
16190         "\\s",
16191         "\\S",
16192         "[:blank:]",
16193         "[:^blank:]",
16194         "[:xdigit:]",
16195         "[:^xdigit:]",
16196         "[:space:]",
16197         "[:^space:]",
16198         "[:cntrl:]",
16199         "[:^cntrl:]",
16200         "[:ascii:]",
16201         "[:^ascii:]",
16202         "\\v",
16203         "\\V"
16204     };
16205     RXi_GET_DECL(prog,progi);
16206     GET_RE_DEBUG_FLAGS_DECL;
16207
16208     PERL_ARGS_ASSERT_REGPROP;
16209
16210     sv_setpvn(sv, "", 0);
16211
16212     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
16213         /* It would be nice to FAIL() here, but this may be called from
16214            regexec.c, and it would be hard to supply pRExC_state. */
16215         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16216                                               (int)OP(o), (int)REGNODE_MAX);
16217     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16218
16219     k = PL_regkind[OP(o)];
16220
16221     if (k == EXACT) {
16222         sv_catpvs(sv, " ");
16223         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16224          * is a crude hack but it may be the best for now since
16225          * we have no flag "this EXACTish node was UTF-8"
16226          * --jhi */
16227         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16228                   PERL_PV_ESCAPE_UNI_DETECT |
16229                   PERL_PV_ESCAPE_NONASCII   |
16230                   PERL_PV_PRETTY_ELLIPSES   |
16231                   PERL_PV_PRETTY_LTGT       |
16232                   PERL_PV_PRETTY_NOCLEAR
16233                   );
16234     } else if (k == TRIE) {
16235         /* print the details of the trie in dumpuntil instead, as
16236          * progi->data isn't available here */
16237         const char op = OP(o);
16238         const U32 n = ARG(o);
16239         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16240                (reg_ac_data *)progi->data->data[n] :
16241                NULL;
16242         const reg_trie_data * const trie
16243             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16244
16245         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16246         DEBUG_TRIE_COMPILE_r(
16247           Perl_sv_catpvf(aTHX_ sv,
16248             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16249             (UV)trie->startstate,
16250             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16251             (UV)trie->wordcount,
16252             (UV)trie->minlen,
16253             (UV)trie->maxlen,
16254             (UV)TRIE_CHARCOUNT(trie),
16255             (UV)trie->uniquecharcount
16256           );
16257         );
16258         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16259             sv_catpvs(sv, "[");
16260             (void) put_charclass_bitmap_innards(sv,
16261                                                 (IS_ANYOF_TRIE(op))
16262                                                  ? ANYOF_BITMAP(o)
16263                                                  : TRIE_BITMAP(trie),
16264                                                 NULL);
16265             sv_catpvs(sv, "]");
16266         }
16267
16268     } else if (k == CURLY) {
16269         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16270             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16271         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16272     }
16273     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
16274         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16275     else if (k == REF || k == OPEN || k == CLOSE
16276              || k == GROUPP || OP(o)==ACCEPT)
16277     {
16278         AV *name_list= NULL;
16279         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
16280         if ( RXp_PAREN_NAMES(prog) ) {
16281             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16282         } else if ( pRExC_state ) {
16283             name_list= RExC_paren_name_list;
16284         }
16285         if (name_list) {
16286             if ( k != REF || (OP(o) < NREF)) {
16287                 SV **name= av_fetch(name_list, ARG(o), 0 );
16288                 if (name)
16289                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16290             }
16291             else {
16292                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16293                 I32 *nums=(I32*)SvPVX(sv_dat);
16294                 SV **name= av_fetch(name_list, nums[0], 0 );
16295                 I32 n;
16296                 if (name) {
16297                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
16298                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16299                                     (n ? "," : ""), (IV)nums[n]);
16300                     }
16301                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16302                 }
16303             }
16304         }
16305         if ( k == REF && reginfo) {
16306             U32 n = ARG(o);  /* which paren pair */
16307             I32 ln = prog->offs[n].start;
16308             if (prog->lastparen < n || ln == -1)
16309                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16310             else if (ln == prog->offs[n].end)
16311                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16312             else {
16313                 const char *s = reginfo->strbeg + ln;
16314                 Perl_sv_catpvf(aTHX_ sv, ": ");
16315                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16316                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16317             }
16318         }
16319     } else if (k == GOSUB) {
16320         AV *name_list= NULL;
16321         if ( RXp_PAREN_NAMES(prog) ) {
16322             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16323         } else if ( pRExC_state ) {
16324             name_list= RExC_paren_name_list;
16325         }
16326
16327         /* Paren and offset */
16328         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16329         if (name_list) {
16330             SV **name= av_fetch(name_list, ARG(o), 0 );
16331             if (name)
16332                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16333         }
16334     }
16335     else if (k == VERB) {
16336         if (!o->flags)
16337             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16338                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16339     } else if (k == LOGICAL)
16340         /* 2: embedded, otherwise 1 */
16341         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16342     else if (k == ANYOF) {
16343         const U8 flags = ANYOF_FLAGS(o);
16344         int do_sep = 0;
16345         SV* bitmap_invlist;  /* Will hold what the bit map contains */
16346
16347
16348         if (flags & ANYOF_LOCALE_FLAGS)
16349             sv_catpvs(sv, "{loc}");
16350         if (flags & ANYOF_LOC_FOLD)
16351             sv_catpvs(sv, "{i}");
16352         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16353         if (flags & ANYOF_INVERT)
16354             sv_catpvs(sv, "^");
16355
16356         /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16357          * */
16358         do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16359                                                             &bitmap_invlist);
16360
16361         /* output any special charclass tests (used entirely under use
16362          * locale) * */
16363         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16364             int i;
16365             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16366                 if (ANYOF_POSIXL_TEST(o,i)) {
16367                     sv_catpv(sv, anyofs[i]);
16368                     do_sep = 1;
16369                 }
16370             }
16371         }
16372
16373         if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16374                       |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16375                       |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16376                       |ANYOF_LOC_FOLD)))
16377         {
16378             if (do_sep) {
16379                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16380                 if (flags & ANYOF_INVERT)
16381                     /*make sure the invert info is in each */
16382                     sv_catpvs(sv, "^");
16383             }
16384
16385             if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16386                 sv_catpvs(sv, "{non-utf8-latin1-all}");
16387             }
16388
16389             /* output information about the unicode matching */
16390             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16391                 sv_catpvs(sv, "{above_bitmap_all}");
16392             else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16393                 SV *lv; /* Set if there is something outside the bit map. */
16394                 bool byte_output = FALSE;   /* If something in the bitmap has
16395                                                been output */
16396                 SV *only_utf8_locale;
16397
16398                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
16399                  * is used to guarantee that nothing in the bitmap gets
16400                  * returned */
16401                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16402                                                     &lv, &only_utf8_locale,
16403                                                     bitmap_invlist);
16404                 if (lv && lv != &PL_sv_undef) {
16405                     char *s = savesvpv(lv);
16406                     char * const origs = s;
16407
16408                     while (*s && *s != '\n')
16409                         s++;
16410
16411                     if (*s == '\n') {
16412                         const char * const t = ++s;
16413
16414                         if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16415                             sv_catpvs(sv, "{outside bitmap}");
16416                         }
16417                         else {
16418                             sv_catpvs(sv, "{utf8}");
16419                         }
16420
16421                         if (byte_output) {
16422                             sv_catpvs(sv, " ");
16423                         }
16424
16425                         while (*s) {
16426                             if (*s == '\n') {
16427
16428                                 /* Truncate very long output */
16429                                 if (s - origs > 256) {
16430                                     Perl_sv_catpvf(aTHX_ sv,
16431                                                 "%.*s...",
16432                                                 (int) (s - origs - 1),
16433                                                 t);
16434                                     goto out_dump;
16435                                 }
16436                                 *s = ' ';
16437                             }
16438                             else if (*s == '\t') {
16439                                 *s = '-';
16440                             }
16441                             s++;
16442                         }
16443                         if (s[-1] == ' ')
16444                             s[-1] = 0;
16445
16446                         sv_catpv(sv, t);
16447                     }
16448
16449                 out_dump:
16450
16451                     Safefree(origs);
16452                     SvREFCNT_dec_NN(lv);
16453                 }
16454
16455                 if ((flags & ANYOF_LOC_FOLD)
16456                      && only_utf8_locale
16457                      && only_utf8_locale != &PL_sv_undef)
16458                 {
16459                     UV start, end;
16460                     int max_entries = 256;
16461
16462                     sv_catpvs(sv, "{utf8 locale}");
16463                     invlist_iterinit(only_utf8_locale);
16464                     while (invlist_iternext(only_utf8_locale,
16465                                             &start, &end)) {
16466                         put_range(sv, start, end, FALSE);
16467                         max_entries --;
16468                         if (max_entries < 0) {
16469                             sv_catpvs(sv, "...");
16470                             break;
16471                         }
16472                     }
16473                     invlist_iterfinish(only_utf8_locale);
16474                 }
16475             }
16476         }
16477         SvREFCNT_dec(bitmap_invlist);
16478
16479
16480         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16481     }
16482     else if (k == POSIXD || k == NPOSIXD) {
16483         U8 index = FLAGS(o) * 2;
16484         if (index < C_ARRAY_LENGTH(anyofs)) {
16485             if (*anyofs[index] != '[')  {
16486                 sv_catpv(sv, "[");
16487             }
16488             sv_catpv(sv, anyofs[index]);
16489             if (*anyofs[index] != '[')  {
16490                 sv_catpv(sv, "]");
16491             }
16492         }
16493         else {
16494             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16495         }
16496     }
16497     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16498         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16499     else if (OP(o) == SBOL)
16500         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16501 #else
16502     PERL_UNUSED_CONTEXT;
16503     PERL_UNUSED_ARG(sv);
16504     PERL_UNUSED_ARG(o);
16505     PERL_UNUSED_ARG(prog);
16506     PERL_UNUSED_ARG(reginfo);
16507     PERL_UNUSED_ARG(pRExC_state);
16508 #endif  /* DEBUGGING */
16509 }
16510
16511
16512
16513 SV *
16514 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16515 {                               /* Assume that RE_INTUIT is set */
16516     struct regexp *const prog = ReANY(r);
16517     GET_RE_DEBUG_FLAGS_DECL;
16518
16519     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16520     PERL_UNUSED_CONTEXT;
16521
16522     DEBUG_COMPILE_r(
16523         {
16524             const char * const s = SvPV_nolen_const(prog->check_substr
16525                       ? prog->check_substr : prog->check_utf8);
16526
16527             if (!PL_colorset) reginitcolors();
16528             PerlIO_printf(Perl_debug_log,
16529                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16530                       PL_colors[4],
16531                       prog->check_substr ? "" : "utf8 ",
16532                       PL_colors[5],PL_colors[0],
16533                       s,
16534                       PL_colors[1],
16535                       (strlen(s) > 60 ? "..." : ""));
16536         } );
16537
16538     return prog->check_substr ? prog->check_substr : prog->check_utf8;
16539 }
16540
16541 /*
16542    pregfree()
16543
16544    handles refcounting and freeing the perl core regexp structure. When
16545    it is necessary to actually free the structure the first thing it
16546    does is call the 'free' method of the regexp_engine associated to
16547    the regexp, allowing the handling of the void *pprivate; member
16548    first. (This routine is not overridable by extensions, which is why
16549    the extensions free is called first.)
16550
16551    See regdupe and regdupe_internal if you change anything here.
16552 */
16553 #ifndef PERL_IN_XSUB_RE
16554 void
16555 Perl_pregfree(pTHX_ REGEXP *r)
16556 {
16557     SvREFCNT_dec(r);
16558 }
16559
16560 void
16561 Perl_pregfree2(pTHX_ REGEXP *rx)
16562 {
16563     struct regexp *const r = ReANY(rx);
16564     GET_RE_DEBUG_FLAGS_DECL;
16565
16566     PERL_ARGS_ASSERT_PREGFREE2;
16567
16568     if (r->mother_re) {
16569         ReREFCNT_dec(r->mother_re);
16570     } else {
16571         CALLREGFREE_PVT(rx); /* free the private data */
16572         SvREFCNT_dec(RXp_PAREN_NAMES(r));
16573         Safefree(r->xpv_len_u.xpvlenu_pv);
16574     }
16575     if (r->substrs) {
16576         SvREFCNT_dec(r->anchored_substr);
16577         SvREFCNT_dec(r->anchored_utf8);
16578         SvREFCNT_dec(r->float_substr);
16579         SvREFCNT_dec(r->float_utf8);
16580         Safefree(r->substrs);
16581     }
16582     RX_MATCH_COPY_FREE(rx);
16583 #ifdef PERL_ANY_COW
16584     SvREFCNT_dec(r->saved_copy);
16585 #endif
16586     Safefree(r->offs);
16587     SvREFCNT_dec(r->qr_anoncv);
16588     rx->sv_u.svu_rx = 0;
16589 }
16590
16591 /*  reg_temp_copy()
16592
16593     This is a hacky workaround to the structural issue of match results
16594     being stored in the regexp structure which is in turn stored in
16595     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16596     could be PL_curpm in multiple contexts, and could require multiple
16597     result sets being associated with the pattern simultaneously, such
16598     as when doing a recursive match with (??{$qr})
16599
16600     The solution is to make a lightweight copy of the regexp structure
16601     when a qr// is returned from the code executed by (??{$qr}) this
16602     lightweight copy doesn't actually own any of its data except for
16603     the starp/end and the actual regexp structure itself.
16604
16605 */
16606
16607
16608 REGEXP *
16609 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16610 {
16611     struct regexp *ret;
16612     struct regexp *const r = ReANY(rx);
16613     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16614
16615     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16616
16617     if (!ret_x)
16618         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16619     else {
16620         SvOK_off((SV *)ret_x);
16621         if (islv) {
16622             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16623                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16624                made both spots point to the same regexp body.) */
16625             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16626             assert(!SvPVX(ret_x));
16627             ret_x->sv_u.svu_rx = temp->sv_any;
16628             temp->sv_any = NULL;
16629             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16630             SvREFCNT_dec_NN(temp);
16631             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16632                ing below will not set it. */
16633             SvCUR_set(ret_x, SvCUR(rx));
16634         }
16635     }
16636     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16637        sv_force_normal(sv) is called.  */
16638     SvFAKE_on(ret_x);
16639     ret = ReANY(ret_x);
16640
16641     SvFLAGS(ret_x) |= SvUTF8(rx);
16642     /* We share the same string buffer as the original regexp, on which we
16643        hold a reference count, incremented when mother_re is set below.
16644        The string pointer is copied here, being part of the regexp struct.
16645      */
16646     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16647            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16648     if (r->offs) {
16649         const I32 npar = r->nparens+1;
16650         Newx(ret->offs, npar, regexp_paren_pair);
16651         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16652     }
16653     if (r->substrs) {
16654         Newx(ret->substrs, 1, struct reg_substr_data);
16655         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16656
16657         SvREFCNT_inc_void(ret->anchored_substr);
16658         SvREFCNT_inc_void(ret->anchored_utf8);
16659         SvREFCNT_inc_void(ret->float_substr);
16660         SvREFCNT_inc_void(ret->float_utf8);
16661
16662         /* check_substr and check_utf8, if non-NULL, point to either their
16663            anchored or float namesakes, and don't hold a second reference.  */
16664     }
16665     RX_MATCH_COPIED_off(ret_x);
16666 #ifdef PERL_ANY_COW
16667     ret->saved_copy = NULL;
16668 #endif
16669     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16670     SvREFCNT_inc_void(ret->qr_anoncv);
16671
16672     return ret_x;
16673 }
16674 #endif
16675
16676 /* regfree_internal()
16677
16678    Free the private data in a regexp. This is overloadable by
16679    extensions. Perl takes care of the regexp structure in pregfree(),
16680    this covers the *pprivate pointer which technically perl doesn't
16681    know about, however of course we have to handle the
16682    regexp_internal structure when no extension is in use.
16683
16684    Note this is called before freeing anything in the regexp
16685    structure.
16686  */
16687
16688 void
16689 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16690 {
16691     struct regexp *const r = ReANY(rx);
16692     RXi_GET_DECL(r,ri);
16693     GET_RE_DEBUG_FLAGS_DECL;
16694
16695     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16696
16697     DEBUG_COMPILE_r({
16698         if (!PL_colorset)
16699             reginitcolors();
16700         {
16701             SV *dsv= sv_newmortal();
16702             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16703                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16704             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16705                 PL_colors[4],PL_colors[5],s);
16706         }
16707     });
16708 #ifdef RE_TRACK_PATTERN_OFFSETS
16709     if (ri->u.offsets)
16710         Safefree(ri->u.offsets);             /* 20010421 MJD */
16711 #endif
16712     if (ri->code_blocks) {
16713         int n;
16714         for (n = 0; n < ri->num_code_blocks; n++)
16715             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16716         Safefree(ri->code_blocks);
16717     }
16718
16719     if (ri->data) {
16720         int n = ri->data->count;
16721
16722         while (--n >= 0) {
16723           /* If you add a ->what type here, update the comment in regcomp.h */
16724             switch (ri->data->what[n]) {
16725             case 'a':
16726             case 'r':
16727             case 's':
16728             case 'S':
16729             case 'u':
16730                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16731                 break;
16732             case 'f':
16733                 Safefree(ri->data->data[n]);
16734                 break;
16735             case 'l':
16736             case 'L':
16737                 break;
16738             case 'T':
16739                 { /* Aho Corasick add-on structure for a trie node.
16740                      Used in stclass optimization only */
16741                     U32 refcount;
16742                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16743 #ifdef USE_ITHREADS
16744                     dVAR;
16745 #endif
16746                     OP_REFCNT_LOCK;
16747                     refcount = --aho->refcount;
16748                     OP_REFCNT_UNLOCK;
16749                     if ( !refcount ) {
16750                         PerlMemShared_free(aho->states);
16751                         PerlMemShared_free(aho->fail);
16752                          /* do this last!!!! */
16753                         PerlMemShared_free(ri->data->data[n]);
16754                         /* we should only ever get called once, so
16755                          * assert as much, and also guard the free
16756                          * which /might/ happen twice. At the least
16757                          * it will make code anlyzers happy and it
16758                          * doesn't cost much. - Yves */
16759                         assert(ri->regstclass);
16760                         if (ri->regstclass) {
16761                             PerlMemShared_free(ri->regstclass);
16762                             ri->regstclass = 0;
16763                         }
16764                     }
16765                 }
16766                 break;
16767             case 't':
16768                 {
16769                     /* trie structure. */
16770                     U32 refcount;
16771                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16772 #ifdef USE_ITHREADS
16773                     dVAR;
16774 #endif
16775                     OP_REFCNT_LOCK;
16776                     refcount = --trie->refcount;
16777                     OP_REFCNT_UNLOCK;
16778                     if ( !refcount ) {
16779                         PerlMemShared_free(trie->charmap);
16780                         PerlMemShared_free(trie->states);
16781                         PerlMemShared_free(trie->trans);
16782                         if (trie->bitmap)
16783                             PerlMemShared_free(trie->bitmap);
16784                         if (trie->jump)
16785                             PerlMemShared_free(trie->jump);
16786                         PerlMemShared_free(trie->wordinfo);
16787                         /* do this last!!!! */
16788                         PerlMemShared_free(ri->data->data[n]);
16789                     }
16790                 }
16791                 break;
16792             default:
16793                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16794                                                     ri->data->what[n]);
16795             }
16796         }
16797         Safefree(ri->data->what);
16798         Safefree(ri->data);
16799     }
16800
16801     Safefree(ri);
16802 }
16803
16804 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16805 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16806 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16807
16808 /*
16809    re_dup - duplicate a regexp.
16810
16811    This routine is expected to clone a given regexp structure. It is only
16812    compiled under USE_ITHREADS.
16813
16814    After all of the core data stored in struct regexp is duplicated
16815    the regexp_engine.dupe method is used to copy any private data
16816    stored in the *pprivate pointer. This allows extensions to handle
16817    any duplication it needs to do.
16818
16819    See pregfree() and regfree_internal() if you change anything here.
16820 */
16821 #if defined(USE_ITHREADS)
16822 #ifndef PERL_IN_XSUB_RE
16823 void
16824 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16825 {
16826     dVAR;
16827     I32 npar;
16828     const struct regexp *r = ReANY(sstr);
16829     struct regexp *ret = ReANY(dstr);
16830
16831     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16832
16833     npar = r->nparens+1;
16834     Newx(ret->offs, npar, regexp_paren_pair);
16835     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16836
16837     if (ret->substrs) {
16838         /* Do it this way to avoid reading from *r after the StructCopy().
16839            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16840            cache, it doesn't matter.  */
16841         const bool anchored = r->check_substr
16842             ? r->check_substr == r->anchored_substr
16843             : r->check_utf8 == r->anchored_utf8;
16844         Newx(ret->substrs, 1, struct reg_substr_data);
16845         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16846
16847         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16848         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16849         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16850         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16851
16852         /* check_substr and check_utf8, if non-NULL, point to either their
16853            anchored or float namesakes, and don't hold a second reference.  */
16854
16855         if (ret->check_substr) {
16856             if (anchored) {
16857                 assert(r->check_utf8 == r->anchored_utf8);
16858                 ret->check_substr = ret->anchored_substr;
16859                 ret->check_utf8 = ret->anchored_utf8;
16860             } else {
16861                 assert(r->check_substr == r->float_substr);
16862                 assert(r->check_utf8 == r->float_utf8);
16863                 ret->check_substr = ret->float_substr;
16864                 ret->check_utf8 = ret->float_utf8;
16865             }
16866         } else if (ret->check_utf8) {
16867             if (anchored) {
16868                 ret->check_utf8 = ret->anchored_utf8;
16869             } else {
16870                 ret->check_utf8 = ret->float_utf8;
16871             }
16872         }
16873     }
16874
16875     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16876     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16877
16878     if (ret->pprivate)
16879         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16880
16881     if (RX_MATCH_COPIED(dstr))
16882         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16883     else
16884         ret->subbeg = NULL;
16885 #ifdef PERL_ANY_COW
16886     ret->saved_copy = NULL;
16887 #endif
16888
16889     /* Whether mother_re be set or no, we need to copy the string.  We
16890        cannot refrain from copying it when the storage points directly to
16891        our mother regexp, because that's
16892                1: a buffer in a different thread
16893                2: something we no longer hold a reference on
16894                so we need to copy it locally.  */
16895     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16896     ret->mother_re   = NULL;
16897 }
16898 #endif /* PERL_IN_XSUB_RE */
16899
16900 /*
16901    regdupe_internal()
16902
16903    This is the internal complement to regdupe() which is used to copy
16904    the structure pointed to by the *pprivate pointer in the regexp.
16905    This is the core version of the extension overridable cloning hook.
16906    The regexp structure being duplicated will be copied by perl prior
16907    to this and will be provided as the regexp *r argument, however
16908    with the /old/ structures pprivate pointer value. Thus this routine
16909    may override any copying normally done by perl.
16910
16911    It returns a pointer to the new regexp_internal structure.
16912 */
16913
16914 void *
16915 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16916 {
16917     dVAR;
16918     struct regexp *const r = ReANY(rx);
16919     regexp_internal *reti;
16920     int len;
16921     RXi_GET_DECL(r,ri);
16922
16923     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16924
16925     len = ProgLen(ri);
16926
16927     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16928           char, regexp_internal);
16929     Copy(ri->program, reti->program, len+1, regnode);
16930
16931     reti->num_code_blocks = ri->num_code_blocks;
16932     if (ri->code_blocks) {
16933         int n;
16934         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16935                 struct reg_code_block);
16936         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16937                 struct reg_code_block);
16938         for (n = 0; n < ri->num_code_blocks; n++)
16939              reti->code_blocks[n].src_regex = (REGEXP*)
16940                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16941     }
16942     else
16943         reti->code_blocks = NULL;
16944
16945     reti->regstclass = NULL;
16946
16947     if (ri->data) {
16948         struct reg_data *d;
16949         const int count = ri->data->count;
16950         int i;
16951
16952         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16953                 char, struct reg_data);
16954         Newx(d->what, count, U8);
16955
16956         d->count = count;
16957         for (i = 0; i < count; i++) {
16958             d->what[i] = ri->data->what[i];
16959             switch (d->what[i]) {
16960                 /* see also regcomp.h and regfree_internal() */
16961             case 'a': /* actually an AV, but the dup function is identical.  */
16962             case 'r':
16963             case 's':
16964             case 'S':
16965             case 'u': /* actually an HV, but the dup function is identical.  */
16966                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16967                 break;
16968             case 'f':
16969                 /* This is cheating. */
16970                 Newx(d->data[i], 1, regnode_ssc);
16971                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16972                 reti->regstclass = (regnode*)d->data[i];
16973                 break;
16974             case 'T':
16975                 /* Trie stclasses are readonly and can thus be shared
16976                  * without duplication. We free the stclass in pregfree
16977                  * when the corresponding reg_ac_data struct is freed.
16978                  */
16979                 reti->regstclass= ri->regstclass;
16980                 /* FALLTHROUGH */
16981             case 't':
16982                 OP_REFCNT_LOCK;
16983                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16984                 OP_REFCNT_UNLOCK;
16985                 /* FALLTHROUGH */
16986             case 'l':
16987             case 'L':
16988                 d->data[i] = ri->data->data[i];
16989                 break;
16990             default:
16991                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16992                                                            ri->data->what[i]);
16993             }
16994         }
16995
16996         reti->data = d;
16997     }
16998     else
16999         reti->data = NULL;
17000
17001     reti->name_list_idx = ri->name_list_idx;
17002
17003 #ifdef RE_TRACK_PATTERN_OFFSETS
17004     if (ri->u.offsets) {
17005         Newx(reti->u.offsets, 2*len+1, U32);
17006         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17007     }
17008 #else
17009     SetProgLen(reti,len);
17010 #endif
17011
17012     return (void*)reti;
17013 }
17014
17015 #endif    /* USE_ITHREADS */
17016
17017 #ifndef PERL_IN_XSUB_RE
17018
17019 /*
17020  - regnext - dig the "next" pointer out of a node
17021  */
17022 regnode *
17023 Perl_regnext(pTHX_ regnode *p)
17024 {
17025     I32 offset;
17026
17027     if (!p)
17028         return(NULL);
17029
17030     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
17031         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17032                                                 (int)OP(p), (int)REGNODE_MAX);
17033     }
17034
17035     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17036     if (offset == 0)
17037         return(NULL);
17038
17039     return(p+offset);
17040 }
17041 #endif
17042
17043 STATIC void
17044 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17045 {
17046     va_list args;
17047     STRLEN l1 = strlen(pat1);
17048     STRLEN l2 = strlen(pat2);
17049     char buf[512];
17050     SV *msv;
17051     const char *message;
17052
17053     PERL_ARGS_ASSERT_RE_CROAK2;
17054
17055     if (l1 > 510)
17056         l1 = 510;
17057     if (l1 + l2 > 510)
17058         l2 = 510 - l1;
17059     Copy(pat1, buf, l1 , char);
17060     Copy(pat2, buf + l1, l2 , char);
17061     buf[l1 + l2] = '\n';
17062     buf[l1 + l2 + 1] = '\0';
17063     va_start(args, pat2);
17064     msv = vmess(buf, &args);
17065     va_end(args);
17066     message = SvPV_const(msv,l1);
17067     if (l1 > 512)
17068         l1 = 512;
17069     Copy(message, buf, l1 , char);
17070     /* l1-1 to avoid \n */
17071     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17072 }
17073
17074 #ifdef DEBUGGING
17075 /* Certain characters are output as a sequence with the first being a
17076  * backslash. */
17077 #define isBACKSLASHED_PUNCT(c)                                              \
17078                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17079
17080 STATIC void
17081 S_put_code_point(pTHX_ SV *sv, UV c)
17082 {
17083     PERL_ARGS_ASSERT_PUT_CODE_POINT;
17084
17085     if (c > 255) {
17086         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17087     }
17088     else if (isPRINT(c)) {
17089         const char string = (char) c;
17090         if (isBACKSLASHED_PUNCT(c))
17091             sv_catpvs(sv, "\\");
17092         sv_catpvn(sv, &string, 1);
17093     }
17094     else {
17095         const char * const mnemonic = cntrl_to_mnemonic((char) c);
17096         if (mnemonic) {
17097             Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17098         }
17099         else {
17100             Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17101         }
17102     }
17103 }
17104
17105 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17106
17107 STATIC void
17108 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17109 {
17110     /* Appends to 'sv' a displayable version of the range of code points from
17111      * 'start' to 'end'.  It assumes that only ASCII printables are displayable
17112      * as-is (though some of these will be escaped by put_code_point()). */
17113
17114     const unsigned int min_range_count = 3;
17115
17116     assert(start <= end);
17117
17118     PERL_ARGS_ASSERT_PUT_RANGE;
17119
17120     while (start <= end) {
17121         UV this_end;
17122         const char * format;
17123
17124         if (end - start < min_range_count) {
17125
17126             /* Individual chars in short ranges */
17127             for (; start <= end; start++) {
17128                 put_code_point(sv, start);
17129             }
17130             break;
17131         }
17132
17133         /* If permitted by the input options, and there is a possibility that
17134          * this range contains a printable literal, look to see if there is
17135          * one.  */
17136         if (allow_literals && start <= MAX_PRINT_A) {
17137
17138             /* If the range begin isn't an ASCII printable, effectively split
17139              * the range into two parts:
17140              *  1) the portion before the first such printable,
17141              *  2) the rest
17142              * and output them separately. */
17143             if (! isPRINT_A(start)) {
17144                 UV temp_end = start + 1;
17145
17146                 /* There is no point looking beyond the final possible
17147                  * printable, in MAX_PRINT_A */
17148                 UV max = MIN(end, MAX_PRINT_A);
17149
17150                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17151                     temp_end++;
17152                 }
17153
17154                 /* Here, temp_end points to one beyond the first printable if
17155                  * found, or to one beyond 'max' if not.  If none found, make
17156                  * sure that we use the entire range */
17157                 if (temp_end > MAX_PRINT_A) {
17158                     temp_end = end + 1;
17159                 }
17160
17161                 /* Output the first part of the split range, the part that
17162                  * doesn't have printables, with no looking for literals
17163                  * (otherwise we would infinitely recurse) */
17164                 put_range(sv, start, temp_end - 1, FALSE);
17165
17166                 /* The 2nd part of the range (if any) starts here. */
17167                 start = temp_end;
17168
17169                 /* We continue instead of dropping down because even if the 2nd
17170                  * part is non-empty, it could be so short that we want to
17171                  * output it specially, as tested for at the top of this loop.
17172                  * */
17173                 continue;
17174             }
17175
17176             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
17177              * output a sub-range of just the digits or letters, then process
17178              * the remaining portion as usual. */
17179             if (isALPHANUMERIC_A(start)) {
17180                 UV mask = (isDIGIT_A(start))
17181                            ? _CC_DIGIT
17182                              : isUPPER_A(start)
17183                                ? _CC_UPPER
17184                                : _CC_LOWER;
17185                 UV temp_end = start + 1;
17186
17187                 /* Find the end of the sub-range that includes just the
17188                  * characters in the same class as the first character in it */
17189                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17190                     temp_end++;
17191                 }
17192                 temp_end--;
17193
17194                 /* For short ranges, don't duplicate the code above to output
17195                  * them; just call recursively */
17196                 if (temp_end - start < min_range_count) {
17197                     put_range(sv, start, temp_end, FALSE);
17198                 }
17199                 else {  /* Output as a range */
17200                     put_code_point(sv, start);
17201                     sv_catpvs(sv, "-");
17202                     put_code_point(sv, temp_end);
17203                 }
17204                 start = temp_end + 1;
17205                 continue;
17206             }
17207
17208             /* We output any other printables as individual characters */
17209             if (isPUNCT_A(start) || isSPACE_A(start)) {
17210                 while (start <= end && (isPUNCT_A(start)
17211                                         || isSPACE_A(start)))
17212                 {
17213                     put_code_point(sv, start);
17214                     start++;
17215                 }
17216                 continue;
17217             }
17218         } /* End of looking for literals */
17219
17220         /* Here is not to output as a literal.  Some control characters have
17221          * mnemonic names.  Split off any of those at the beginning and end of
17222          * the range to print mnemonically.  It isn't possible for many of
17223          * these to be in a row, so this won't overwhelm with output */
17224         while (isMNEMONIC_CNTRL(start) && start <= end) {
17225             put_code_point(sv, start);
17226             start++;
17227         }
17228         if (start < end && isMNEMONIC_CNTRL(end)) {
17229
17230             /* Here, the final character in the range has a mnemonic name.
17231              * Work backwards from the end to find the final non-mnemonic */
17232             UV temp_end = end - 1;
17233             while (isMNEMONIC_CNTRL(temp_end)) {
17234                 temp_end--;
17235             }
17236
17237             /* And separately output the range that doesn't have mnemonics */
17238             put_range(sv, start, temp_end, FALSE);
17239
17240             /* Then output the mnemonic trailing controls */
17241             start = temp_end + 1;
17242             while (start <= end) {
17243                 put_code_point(sv, start);
17244                 start++;
17245             }
17246             break;
17247         }
17248
17249         /* As a final resort, output the range or subrange as hex. */
17250
17251         this_end = (end < NUM_ANYOF_CODE_POINTS)
17252                     ? end
17253                     : NUM_ANYOF_CODE_POINTS - 1;
17254         format = (this_end < 256)
17255                  ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17256                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17257         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17258         break;
17259     }
17260 }
17261
17262 STATIC bool
17263 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17264 {
17265     /* Appends to 'sv' a displayable version of the innards of the bracketed
17266      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
17267      * output anything, and bitmap_invlist, if not NULL, will point to an
17268      * inversion list of what is in the bit map */
17269
17270     int i;
17271     UV start, end;
17272     unsigned int punct_count = 0;
17273     SV* invlist = NULL;
17274     SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
17275     bool allow_literals = TRUE;
17276
17277     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17278
17279     invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17280
17281     /* Worst case is exactly every-other code point is in the list */
17282     *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17283
17284     /* Convert the bit map to an inversion list, keeping track of how many
17285      * ASCII puncts are set, including an extra amount for the backslashed
17286      * ones.  */
17287     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17288         if (BITMAP_TEST(bitmap, i)) {
17289             *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17290             if (isPUNCT_A(i)) {
17291                 punct_count++;
17292                 if isBACKSLASHED_PUNCT(i) {
17293                     punct_count++;
17294                 }
17295             }
17296         }
17297     }
17298
17299     /* Nothing to output */
17300     if (_invlist_len(*invlist_ptr) == 0) {
17301         SvREFCNT_dec(invlist);
17302         return FALSE;
17303     }
17304
17305     /* Generally, it is more readable if printable characters are output as
17306      * literals, but if a range (nearly) spans all of them, it's best to output
17307      * it as a single range.  This code will use a single range if all but 2
17308      * printables are in it */
17309     invlist_iterinit(*invlist_ptr);
17310     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17311
17312         /* If range starts beyond final printable, it doesn't have any in it */
17313         if (start > MAX_PRINT_A) {
17314             break;
17315         }
17316
17317         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
17318          * all but two, the range must start and end no later than 2 from
17319          * either end */
17320         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17321             if (end > MAX_PRINT_A) {
17322                 end = MAX_PRINT_A;
17323             }
17324             if (start < ' ') {
17325                 start = ' ';
17326             }
17327             if (end - start >= MAX_PRINT_A - ' ' - 2) {
17328                 allow_literals = FALSE;
17329             }
17330             break;
17331         }
17332     }
17333     invlist_iterfinish(*invlist_ptr);
17334
17335     /* The legibility of the output depends mostly on how many punctuation
17336      * characters are output.  There are 32 possible ASCII ones, and some have
17337      * an additional backslash, bringing it to currently 36, so if any more
17338      * than 18 are to be output, we can instead output it as its complement,
17339      * yielding fewer puncts, and making it more legible.  But give some weight
17340      * to the fact that outputting it as a complement is less legible than a
17341      * straight output, so don't complement unless we are somewhat over the 18
17342      * mark */
17343     if (allow_literals && punct_count > 22) {
17344         sv_catpvs(sv, "^");
17345
17346         /* Add everything remaining to the list, so when we invert it just
17347          * below, it will be excluded */
17348         _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17349         _invlist_invert(*invlist_ptr);
17350     }
17351
17352     /* Here we have figured things out.  Output each range */
17353     invlist_iterinit(*invlist_ptr);
17354     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17355         if (start >= NUM_ANYOF_CODE_POINTS) {
17356             break;
17357         }
17358         put_range(sv, start, end, allow_literals);
17359     }
17360     invlist_iterfinish(*invlist_ptr);
17361
17362     return TRUE;
17363 }
17364
17365 #define CLEAR_OPTSTART \
17366     if (optstart) STMT_START {                                               \
17367         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
17368                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17369         optstart=NULL;                                                       \
17370     } STMT_END
17371
17372 #define DUMPUNTIL(b,e)                                                       \
17373                     CLEAR_OPTSTART;                                          \
17374                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17375
17376 STATIC const regnode *
17377 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17378             const regnode *last, const regnode *plast,
17379             SV* sv, I32 indent, U32 depth)
17380 {
17381     U8 op = PSEUDO;     /* Arbitrary non-END op. */
17382     const regnode *next;
17383     const regnode *optstart= NULL;
17384
17385     RXi_GET_DECL(r,ri);
17386     GET_RE_DEBUG_FLAGS_DECL;
17387
17388     PERL_ARGS_ASSERT_DUMPUNTIL;
17389
17390 #ifdef DEBUG_DUMPUNTIL
17391     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17392         last ? last-start : 0,plast ? plast-start : 0);
17393 #endif
17394
17395     if (plast && plast < last)
17396         last= plast;
17397
17398     while (PL_regkind[op] != END && (!last || node < last)) {
17399         assert(node);
17400         /* While that wasn't END last time... */
17401         NODE_ALIGN(node);
17402         op = OP(node);
17403         if (op == CLOSE || op == WHILEM)
17404             indent--;
17405         next = regnext((regnode *)node);
17406
17407         /* Where, what. */
17408         if (OP(node) == OPTIMIZED) {
17409             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17410                 optstart = node;
17411             else
17412                 goto after_print;
17413         } else
17414             CLEAR_OPTSTART;
17415
17416         regprop(r, sv, node, NULL, NULL);
17417         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17418                       (int)(2*indent + 1), "", SvPVX_const(sv));
17419
17420         if (OP(node) != OPTIMIZED) {
17421             if (next == NULL)           /* Next ptr. */
17422                 PerlIO_printf(Perl_debug_log, " (0)");
17423             else if (PL_regkind[(U8)op] == BRANCH
17424                      && PL_regkind[OP(next)] != BRANCH )
17425                 PerlIO_printf(Perl_debug_log, " (FAIL)");
17426             else
17427                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17428             (void)PerlIO_putc(Perl_debug_log, '\n');
17429         }
17430
17431       after_print:
17432         if (PL_regkind[(U8)op] == BRANCHJ) {
17433             assert(next);
17434             {
17435                 const regnode *nnode = (OP(next) == LONGJMP
17436                                        ? regnext((regnode *)next)
17437                                        : next);
17438                 if (last && nnode > last)
17439                     nnode = last;
17440                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17441             }
17442         }
17443         else if (PL_regkind[(U8)op] == BRANCH) {
17444             assert(next);
17445             DUMPUNTIL(NEXTOPER(node), next);
17446         }
17447         else if ( PL_regkind[(U8)op]  == TRIE ) {
17448             const regnode *this_trie = node;
17449             const char op = OP(node);
17450             const U32 n = ARG(node);
17451             const reg_ac_data * const ac = op>=AHOCORASICK ?
17452                (reg_ac_data *)ri->data->data[n] :
17453                NULL;
17454             const reg_trie_data * const trie =
17455                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17456 #ifdef DEBUGGING
17457             AV *const trie_words
17458                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17459 #endif
17460             const regnode *nextbranch= NULL;
17461             I32 word_idx;
17462             sv_setpvs(sv, "");
17463             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17464                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17465
17466                 PerlIO_printf(Perl_debug_log, "%*s%s ",
17467                    (int)(2*(indent+3)), "",
17468                     elem_ptr
17469                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17470                                 SvCUR(*elem_ptr), 60,
17471                                 PL_colors[0], PL_colors[1],
17472                                 (SvUTF8(*elem_ptr)
17473                                  ? PERL_PV_ESCAPE_UNI
17474                                  : 0)
17475                                 | PERL_PV_PRETTY_ELLIPSES
17476                                 | PERL_PV_PRETTY_LTGT
17477                             )
17478                     : "???"
17479                 );
17480                 if (trie->jump) {
17481                     U16 dist= trie->jump[word_idx+1];
17482                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17483                                (UV)((dist ? this_trie + dist : next) - start));
17484                     if (dist) {
17485                         if (!nextbranch)
17486                             nextbranch= this_trie + trie->jump[0];
17487                         DUMPUNTIL(this_trie + dist, nextbranch);
17488                     }
17489                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17490                         nextbranch= regnext((regnode *)nextbranch);
17491                 } else {
17492                     PerlIO_printf(Perl_debug_log, "\n");
17493                 }
17494             }
17495             if (last && next > last)
17496                 node= last;
17497             else
17498                 node= next;
17499         }
17500         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
17501             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17502                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17503         }
17504         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17505             assert(next);
17506             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17507         }
17508         else if ( op == PLUS || op == STAR) {
17509             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17510         }
17511         else if (PL_regkind[(U8)op] == ANYOF) {
17512             /* arglen 1 + class block */
17513             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17514                           ? ANYOF_POSIXL_SKIP
17515                           : ANYOF_SKIP);
17516             node = NEXTOPER(node);
17517         }
17518         else if (PL_regkind[(U8)op] == EXACT) {
17519             /* Literal string, where present. */
17520             node += NODE_SZ_STR(node) - 1;
17521             node = NEXTOPER(node);
17522         }
17523         else {
17524             node = NEXTOPER(node);
17525             node += regarglen[(U8)op];
17526         }
17527         if (op == CURLYX || op == OPEN)
17528             indent++;
17529     }
17530     CLEAR_OPTSTART;
17531 #ifdef DEBUG_DUMPUNTIL
17532     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17533 #endif
17534     return node;
17535 }
17536
17537 #endif  /* DEBUGGING */
17538
17539 /*
17540  * Local variables:
17541  * c-indentation-style: bsd
17542  * c-basic-offset: 4
17543  * indent-tabs-mode: nil
17544  * End:
17545  *
17546  * ex: set ts=8 sts=4 sw=4 et:
17547  */