regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 #ifndef MIN
 106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 107 #endif
 108
 109 /* this is a chain of data about sub patterns we are processing that
 110    need to be handled separately/specially in study_chunk. Its so
 111    we can simulate recursion without losing state.  */
 112 struct scan_frame;
 113 typedef struct scan_frame {
 114     regnode *last_regnode;      /* last node to process in this frame */
 115     regnode *next_regnode;      /* next node to process when last is reached */
 116     U32 prev_recursed_depth;
 117     I32 stopparen;              /* what stopparen do we use */
 118     U32 is_top_frame;           /* what flags do we use? */
 119
 120     struct scan_frame *this_prev_frame; /* this previous frame */
 121     struct scan_frame *prev_frame;      /* previous frame */
 122     struct scan_frame *next_frame;      /* next frame */
 123 } scan_frame;
 124
 125 struct RExC_state_t {
 126     U32         flags;                  /* RXf_* are we folding, multilining? */
 127     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 128     char        *precomp;               /* uncompiled string. */
 129     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 130     regexp      *rx;                    /* perl core regexp structure */
 131     regexp_internal     *rxi;           /* internal data for regexp object
 132                                            pprivate field */
 133     char        *start;                 /* Start of input for compile */
 134     char        *end;                   /* End of input for compile */
 135     char        *parse;                 /* Input-scan pointer. */
 136     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 137     regnode     *emit_start;            /* Start of emitted-code area */
 138     regnode     *emit_bound;            /* First regnode outside of the
 139                                            allocated space */
 140     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 141                                            implies compiling, so don't emit */
 142     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 143                                            large enough for the largest
 144                                            non-EXACTish node, so can use it as
 145                                            scratch in pass1 */
 146     I32         naughty;                /* How bad is this pattern? */
 147     I32         sawback;                /* Did we see \1, ...? */
 148     U32         seen;
 149     SSize_t     size;                   /* Code size. */
 150     I32                npar;            /* Capture buffer count, (OPEN) plus
 151                                            one. ("par" 0 is the whole
 152                                            pattern)*/
 153     I32         nestroot;               /* root parens we are in - used by
 154                                            accept */
 155     I32         extralen;
 156     I32         seen_zerolen;
 157     regnode     **open_parens;          /* pointers to open parens */
 158     regnode     **close_parens;         /* pointers to close parens */
 159     regnode     *opend;                 /* END node in program */
 160     I32         utf8;           /* whether the pattern is utf8 or not */
 161     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 162                                 /* XXX use this for future optimisation of case
 163                                  * where pattern must be upgraded to utf8. */
 164     I32         uni_semantics;  /* If a d charset modifier should use unicode
 165                                    rules, even if the pattern is not in
 166                                    utf8 */
 167     HV          *paren_names;           /* Paren names */
 168
 169     regnode     **recurse;              /* Recurse regops */
 170     I32         recurse_count;          /* Number of recurse regops */
 171     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 172                                            through */
 173     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 174     I32         in_lookbehind;
 175     I32         contains_locale;
 176     I32         contains_i;
 177     I32         override_recoding;
 178     I32         in_multi_char_class;
 179     struct reg_code_block *code_blocks; /* positions of literal (?{})
 180                                             within pattern */
 181     int         num_code_blocks;        /* size of code_blocks[] */
 182     int         code_index;             /* next code_blocks[] slot */
 183     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 184     scan_frame *frame_head;
 185     scan_frame *frame_last;
 186     U32         frame_count;
 187 #ifdef ADD_TO_REGEXEC
 188     char        *starttry;              /* -Dr: where regtry was called. */
 189 #define RExC_starttry   (pRExC_state->starttry)
 190 #endif
 191     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 192 #ifdef DEBUGGING
 193     const char  *lastparse;
 194     I32         lastnum;
 195     AV          *paren_name_list;       /* idx -> name */
 196     U32         study_chunk_recursed_count;
 197     SV          *mysv1;
 198     SV          *mysv2;
 199 #define RExC_lastparse  (pRExC_state->lastparse)
 200 #define RExC_lastnum    (pRExC_state->lastnum)
 201 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 202 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 203 #define RExC_mysv       (pRExC_state->mysv1)
 204 #define RExC_mysv1      (pRExC_state->mysv1)
 205 #define RExC_mysv2      (pRExC_state->mysv2)
 206
 207 #endif
 208 };
 209
 210 #define RExC_flags      (pRExC_state->flags)
 211 #define RExC_pm_flags   (pRExC_state->pm_flags)
 212 #define RExC_precomp    (pRExC_state->precomp)
 213 #define RExC_rx_sv      (pRExC_state->rx_sv)
 214 #define RExC_rx         (pRExC_state->rx)
 215 #define RExC_rxi        (pRExC_state->rxi)
 216 #define RExC_start      (pRExC_state->start)
 217 #define RExC_end        (pRExC_state->end)
 218 #define RExC_parse      (pRExC_state->parse)
 219 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 220 #ifdef RE_TRACK_PATTERN_OFFSETS
 221 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 222                                                          others */
 223 #endif
 224 #define RExC_emit       (pRExC_state->emit)
 225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 226 #define RExC_emit_start (pRExC_state->emit_start)
 227 #define RExC_emit_bound (pRExC_state->emit_bound)
 228 #define RExC_naughty    (pRExC_state->naughty)
 229 #define RExC_sawback    (pRExC_state->sawback)
 230 #define RExC_seen       (pRExC_state->seen)
 231 #define RExC_size       (pRExC_state->size)
 232 #define RExC_maxlen        (pRExC_state->maxlen)
 233 #define RExC_npar       (pRExC_state->npar)
 234 #define RExC_nestroot   (pRExC_state->nestroot)
 235 #define RExC_extralen   (pRExC_state->extralen)
 236 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 237 #define RExC_utf8       (pRExC_state->utf8)
 238 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 239 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 240 #define RExC_open_parens        (pRExC_state->open_parens)
 241 #define RExC_close_parens       (pRExC_state->close_parens)
 242 #define RExC_opend      (pRExC_state->opend)
 243 #define RExC_paren_names        (pRExC_state->paren_names)
 244 #define RExC_recurse    (pRExC_state->recurse)
 245 #define RExC_recurse_count      (pRExC_state->recurse_count)
 246 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 247 #define RExC_study_chunk_recursed_bytes  \
 248                                    (pRExC_state->study_chunk_recursed_bytes)
 249 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 250 #define RExC_contains_locale    (pRExC_state->contains_locale)
 251 #define RExC_contains_i (pRExC_state->contains_i)
 252 #define RExC_override_recoding (pRExC_state->override_recoding)
 253 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 254 #define RExC_frame_head (pRExC_state->frame_head)
 255 #define RExC_frame_last (pRExC_state->frame_last)
 256 #define RExC_frame_count (pRExC_state->frame_count)
 257
 258
 259 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 260 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 261         ((*s) == '{' && regcurly(s)))
 262
 263 /*
 264  * Flags to be passed up and down.
 265  */
 266 #define WORST           0       /* Worst case. */
 267 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 268
 269 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 270  * character.  (There needs to be a case: in the switch statement in regexec.c
 271  * for any node marked SIMPLE.)  Note that this is not the same thing as
 272  * REGNODE_SIMPLE */
 273 #define SIMPLE          0x02
 274 #define SPSTART         0x04    /* Starts with * or + */
 275 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 276 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 277 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 278
 279 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 280
 281 /* whether trie related optimizations are enabled */
 282 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 283 #define TRIE_STUDY_OPT
 284 #define FULL_TRIE_STUDY
 285 #define TRIE_STCLASS
 286 #endif
 287
 288
 289
 290 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 291 #define PBITVAL(paren) (1 << ((paren) & 7))
 292 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 293 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 294 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 295
 296 #define REQUIRE_UTF8    STMT_START {                                       \
 297                                      if (!UTF) {                           \
 298                                          *flagp = RESTART_UTF8;            \
 299                                          return NULL;                      \
 300                                      }                                     \
 301                         } STMT_END
 302
 303 /* This converts the named class defined in regcomp.h to its equivalent class
 304  * number defined in handy.h. */
 305 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 306 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 307
 308 #define _invlist_union_complement_2nd(a, b, output) \
 309                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 310 #define _invlist_intersection_complement_2nd(a, b, output) \
 311                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 312
 313 /* About scan_data_t.
 314
 315   During optimisation we recurse through the regexp program performing
 316   various inplace (keyhole style) optimisations. In addition study_chunk
 317   and scan_commit populate this data structure with information about
 318   what strings MUST appear in the pattern. We look for the longest
 319   string that must appear at a fixed location, and we look for the
 320   longest string that may appear at a floating location. So for instance
 321   in the pattern:
 322
 323     /FOO[xX]A.*B[xX]BAR/
 324
 325   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 326   strings (because they follow a .* construct). study_chunk will identify
 327   both FOO and BAR as being the longest fixed and floating strings respectively.
 328
 329   The strings can be composites, for instance
 330
 331      /(f)(o)(o)/
 332
 333   will result in a composite fixed substring 'foo'.
 334
 335   For each string some basic information is maintained:
 336
 337   - offset or min_offset
 338     This is the position the string must appear at, or not before.
 339     It also implicitly (when combined with minlenp) tells us how many
 340     characters must match before the string we are searching for.
 341     Likewise when combined with minlenp and the length of the string it
 342     tells us how many characters must appear after the string we have
 343     found.
 344
 345   - max_offset
 346     Only used for floating strings. This is the rightmost point that
 347     the string can appear at. If set to SSize_t_MAX it indicates that the
 348     string can occur infinitely far to the right.
 349
 350   - minlenp
 351     A pointer to the minimum number of characters of the pattern that the
 352     string was found inside. This is important as in the case of positive
 353     lookahead or positive lookbehind we can have multiple patterns
 354     involved. Consider
 355
 356     /(?=FOO).*F/
 357
 358     The minimum length of the pattern overall is 3, the minimum length
 359     of the lookahead part is 3, but the minimum length of the part that
 360     will actually match is 1. So 'FOO's minimum length is 3, but the
 361     minimum length for the F is 1. This is important as the minimum length
 362     is used to determine offsets in front of and behind the string being
 363     looked for.  Since strings can be composites this is the length of the
 364     pattern at the time it was committed with a scan_commit. Note that
 365     the length is calculated by study_chunk, so that the minimum lengths
 366     are not known until the full pattern has been compiled, thus the
 367     pointer to the value.
 368
 369   - lookbehind
 370
 371     In the case of lookbehind the string being searched for can be
 372     offset past the start point of the final matching string.
 373     If this value was just blithely removed from the min_offset it would
 374     invalidate some of the calculations for how many chars must match
 375     before or after (as they are derived from min_offset and minlen and
 376     the length of the string being searched for).
 377     When the final pattern is compiled and the data is moved from the
 378     scan_data_t structure into the regexp structure the information
 379     about lookbehind is factored in, with the information that would
 380     have been lost precalculated in the end_shift field for the
 381     associated string.
 382
 383   The fields pos_min and pos_delta are used to store the minimum offset
 384   and the delta to the maximum offset at the current point in the pattern.
 385
 386 */
 387
 388 typedef struct scan_data_t {
 389     /*I32 len_min;      unused */
 390     /*I32 len_delta;    unused */
 391     SSize_t pos_min;
 392     SSize_t pos_delta;
 393     SV *last_found;
 394     SSize_t last_end;       /* min value, <0 unless valid. */
 395     SSize_t last_start_min;
 396     SSize_t last_start_max;
 397     SV **longest;           /* Either &l_fixed, or &l_float. */
 398     SV *longest_fixed;      /* longest fixed string found in pattern */
 399     SSize_t offset_fixed;   /* offset where it starts */
 400     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 401     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 402     SV *longest_float;      /* longest floating string found in pattern */
 403     SSize_t offset_float_min; /* earliest point in string it can appear */
 404     SSize_t offset_float_max; /* latest point in string it can appear */
 405     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 406     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 407     I32 flags;
 408     I32 whilem_c;
 409     SSize_t *last_closep;
 410     regnode_ssc *start_class;
 411 } scan_data_t;
 412
 413 /*
 414  * Forward declarations for pregcomp()'s friends.
 415  */
 416
 417 static const scan_data_t zero_scan_data =
 418   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 419
 420 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 421 #define SF_BEFORE_SEOL          0x0001
 422 #define SF_BEFORE_MEOL          0x0002
 423 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 424 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 425
 426 #define SF_FIX_SHIFT_EOL        (+2)
 427 #define SF_FL_SHIFT_EOL         (+4)
 428
 429 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 430 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 431
 432 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 433 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 434 #define SF_IS_INF               0x0040
 435 #define SF_HAS_PAR              0x0080
 436 #define SF_IN_PAR               0x0100
 437 #define SF_HAS_EVAL             0x0200
 438 #define SCF_DO_SUBSTR           0x0400
 439 #define SCF_DO_STCLASS_AND      0x0800
 440 #define SCF_DO_STCLASS_OR       0x1000
 441 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 442 #define SCF_WHILEM_VISITED_POS  0x2000
 443
 444 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 445 #define SCF_SEEN_ACCEPT         0x8000
 446 #define SCF_TRIE_DOING_RESTUDY 0x10000
 447 #define SCF_IN_DEFINE          0x20000
 448
 449
 450
 451
 452 #define UTF cBOOL(RExC_utf8)
 453
 454 /* The enums for all these are ordered so things work out correctly */
 455 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 456 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 457                                                      == REGEX_DEPENDS_CHARSET)
 458 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 459 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 460                                                      >= REGEX_UNICODE_CHARSET)
 461 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 462                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 463 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 464                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 465 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 466                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 467
 468 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 469
 470 /* For programs that want to be strictly Unicode compatible by dying if any
 471  * attempt is made to match a non-Unicode code point against a Unicode
 472  * property.  */
 473 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 474
 475 #define OOB_NAMEDCLASS          -1
 476
 477 /* There is no code point that is out-of-bounds, so this is problematic.  But
 478  * its only current use is to initialize a variable that is always set before
 479  * looked at. */
 480 #define OOB_UNICODE             0xDEADBEEF
 481
 482 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 483 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 484
 485
 486 /* length of regex to show in messages that don't mark a position within */
 487 #define RegexLengthToShowInErrorMessages 127
 488
 489 /*
 490  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 491  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 492  * op/pragma/warn/regcomp.
 493  */
 494 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 495 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 496
 497 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 498                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 499
 500 #define REPORT_LOCATION_ARGS(offset)            \
 501                 UTF8fARG(UTF, offset, RExC_precomp), \
 502                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 503
 504 /*
 505  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 506  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 507  * "...".
 508  */
 509 #define _FAIL(code) STMT_START {                                        \
 510     const char *ellipses = "";                                          \
 511     IV len = RExC_end - RExC_precomp;                                   \
 512                                                                         \
 513     if (!SIZE_ONLY)                                                     \
 514         SAVEFREESV(RExC_rx_sv);                                         \
 515     if (len > RegexLengthToShowInErrorMessages) {                       \
 516         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 517         len = RegexLengthToShowInErrorMessages - 10;                    \
 518         ellipses = "...";                                               \
 519     }                                                                   \
 520     code;                                                               \
 521 } STMT_END
 522
 523 #define FAIL(msg) _FAIL(                            \
 524     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 525             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 526
 527 #define FAIL2(msg,arg) _FAIL(                       \
 528     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 529             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 530
 531 /*
 532  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 533  */
 534 #define Simple_vFAIL(m) STMT_START {                                    \
 535     const IV offset =                                                   \
 536         (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
 537     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 538             m, REPORT_LOCATION_ARGS(offset));   \
 539 } STMT_END
 540
 541 /*
 542  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 543  */
 544 #define vFAIL(m) STMT_START {                           \
 545     if (!SIZE_ONLY)                                     \
 546         SAVEFREESV(RExC_rx_sv);                         \
 547     Simple_vFAIL(m);                                    \
 548 } STMT_END
 549
 550 /*
 551  * Like Simple_vFAIL(), but accepts two arguments.
 552  */
 553 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 554     const IV offset = RExC_parse - RExC_precomp;                        \
 555     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 556                       REPORT_LOCATION_ARGS(offset));    \
 557 } STMT_END
 558
 559 /*
 560  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 561  */
 562 #define vFAIL2(m,a1) STMT_START {                       \
 563     if (!SIZE_ONLY)                                     \
 564         SAVEFREESV(RExC_rx_sv);                         \
 565     Simple_vFAIL2(m, a1);                               \
 566 } STMT_END
 567
 568
 569 /*
 570  * Like Simple_vFAIL(), but accepts three arguments.
 571  */
 572 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 573     const IV offset = RExC_parse - RExC_precomp;                \
 574     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 575             REPORT_LOCATION_ARGS(offset));      \
 576 } STMT_END
 577
 578 /*
 579  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 580  */
 581 #define vFAIL3(m,a1,a2) STMT_START {                    \
 582     if (!SIZE_ONLY)                                     \
 583         SAVEFREESV(RExC_rx_sv);                         \
 584     Simple_vFAIL3(m, a1, a2);                           \
 585 } STMT_END
 586
 587 /*
 588  * Like Simple_vFAIL(), but accepts four arguments.
 589  */
 590 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 591     const IV offset = RExC_parse - RExC_precomp;                \
 592     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 593             REPORT_LOCATION_ARGS(offset));      \
 594 } STMT_END
 595
 596 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 597     if (!SIZE_ONLY)                                     \
 598         SAVEFREESV(RExC_rx_sv);                         \
 599     Simple_vFAIL4(m, a1, a2, a3);                       \
 600 } STMT_END
 601
 602 /* A specialized version of vFAIL2 that works with UTF8f */
 603 #define vFAIL2utf8f(m, a1) STMT_START { \
 604     const IV offset = RExC_parse - RExC_precomp;   \
 605     if (!SIZE_ONLY)                                \
 606         SAVEFREESV(RExC_rx_sv);                    \
 607     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 608             REPORT_LOCATION_ARGS(offset));         \
 609 } STMT_END
 610
 611 /* These have asserts in them because of [perl #122671] Many warnings in
 612  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 613  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 614  * would get output again.  So they should be output in pass2, and these
 615  * asserts make sure new warnings follow that paradigm. */
 616
 617 /* m is not necessarily a "literal string", in this macro */
 618 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 619     const IV offset = loc - RExC_precomp;                               \
 620     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 621             m, REPORT_LOCATION_ARGS(offset));       \
 622 } STMT_END
 623
 624 #define ckWARNreg(loc,m) STMT_START {                                   \
 625     const IV offset = loc - RExC_precomp;                               \
 626     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 627             REPORT_LOCATION_ARGS(offset));              \
 628 } STMT_END
 629
 630 #define vWARN_dep(loc, m) STMT_START {                                  \
 631     const IV offset = loc - RExC_precomp;                               \
 632     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,    \
 633             REPORT_LOCATION_ARGS(offset));              \
 634 } STMT_END
 635
 636 #define ckWARNdep(loc,m) STMT_START {                                   \
 637     const IV offset = loc - RExC_precomp;                               \
 638     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                  \
 639             m REPORT_LOCATION,                                          \
 640             REPORT_LOCATION_ARGS(offset));              \
 641 } STMT_END
 642
 643 #define ckWARNregdep(loc,m) STMT_START {                                \
 644     const IV offset = loc - RExC_precomp;                               \
 645     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
 646             m REPORT_LOCATION,                                          \
 647             REPORT_LOCATION_ARGS(offset));              \
 648 } STMT_END
 649
 650 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 651     const IV offset = loc - RExC_precomp;                               \
 652     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                      \
 653             m REPORT_LOCATION,                                          \
 654             a1, REPORT_LOCATION_ARGS(offset));  \
 655 } STMT_END
 656
 657 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 658     const IV offset = loc - RExC_precomp;                               \
 659     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 660             a1, REPORT_LOCATION_ARGS(offset));  \
 661 } STMT_END
 662
 663 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 664     const IV offset = loc - RExC_precomp;                               \
 665     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 666             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 667 } STMT_END
 668
 669 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 670     const IV offset = loc - RExC_precomp;                               \
 671     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 672             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 673 } STMT_END
 674
 675 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 676     const IV offset = loc - RExC_precomp;                               \
 677     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 678             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 679 } STMT_END
 680
 681 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 682     const IV offset = loc - RExC_precomp;                               \
 683     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 684             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 685 } STMT_END
 686
 687 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 688     const IV offset = loc - RExC_precomp;                               \
 689     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 690             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 691 } STMT_END
 692
 693
 694 /* Allow for side effects in s */
 695 #define REGC(c,s) STMT_START {                  \
 696     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 697 } STMT_END
 698
 699 /* Macros for recording node offsets.   20001227 mjd@plover.com
 700  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 701  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 702  * Element 0 holds the number n.
 703  * Position is 1 indexed.
 704  */
 705 #ifndef RE_TRACK_PATTERN_OFFSETS
 706 #define Set_Node_Offset_To_R(node,byte)
 707 #define Set_Node_Offset(node,byte)
 708 #define Set_Cur_Node_Offset
 709 #define Set_Node_Length_To_R(node,len)
 710 #define Set_Node_Length(node,len)
 711 #define Set_Node_Cur_Length(node,start)
 712 #define Node_Offset(n)
 713 #define Node_Length(n)
 714 #define Set_Node_Offset_Length(node,offset,len)
 715 #define ProgLen(ri) ri->u.proglen
 716 #define SetProgLen(ri,x) ri->u.proglen = x
 717 #else
 718 #define ProgLen(ri) ri->u.offsets[0]
 719 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 720 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 721     if (! SIZE_ONLY) {                                                  \
 722         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 723                     __LINE__, (int)(node), (int)(byte)));               \
 724         if((node) < 0) {                                                \
 725             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 726                                          (int)(node));                  \
 727         } else {                                                        \
 728             RExC_offsets[2*(node)-1] = (byte);                          \
 729         }                                                               \
 730     }                                                                   \
 731 } STMT_END
 732
 733 #define Set_Node_Offset(node,byte) \
 734     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 735 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 736
 737 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 738     if (! SIZE_ONLY) {                                                  \
 739         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 740                 __LINE__, (int)(node), (int)(len)));                    \
 741         if((node) < 0) {                                                \
 742             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 743                                          (int)(node));                  \
 744         } else {                                                        \
 745             RExC_offsets[2*(node)] = (len);                             \
 746         }                                                               \
 747     }                                                                   \
 748 } STMT_END
 749
 750 #define Set_Node_Length(node,len) \
 751     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 752 #define Set_Node_Cur_Length(node, start)                \
 753     Set_Node_Length(node, RExC_parse - start)
 754
 755 /* Get offsets and lengths */
 756 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 757 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 758
 759 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 760     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 761     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 762 } STMT_END
 763 #endif
 764
 765 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 766 #define EXPERIMENTAL_INPLACESCAN
 767 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 768
 769 #define DEBUG_RExC_seen() \
 770         DEBUG_OPTIMISE_MORE_r({                                             \
 771             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 772                                                                             \
 773             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 774                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 775                                                                             \
 776             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 777                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 778                                                                             \
 779             if (RExC_seen & REG_GPOS_SEEN)                                  \
 780                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 781                                                                             \
 782             if (RExC_seen & REG_CANY_SEEN)                                  \
 783                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 784                                                                             \
 785             if (RExC_seen & REG_RECURSE_SEEN)                               \
 786                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 787                                                                             \
 788             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 789                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 790                                                                             \
 791             if (RExC_seen & REG_VERBARG_SEEN)                               \
 792                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 793                                                                             \
 794             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 795                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 796                                                                             \
 797             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 798                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 799                                                                             \
 800             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 801                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 802                                                                             \
 803             if (RExC_seen & REG_GOSTART_SEEN)                               \
 804                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 805                                                                             \
 806             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 807                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 808                                                                             \
 809             PerlIO_printf(Perl_debug_log,"\n");                             \
 810         });
 811
 812 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 813   if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
 814
 815 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 816     if ( ( flags ) ) {                                                      \
 817         PerlIO_printf(Perl_debug_log, "%s", open_str);                      \
 818         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 819         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 820         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 821         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 822         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 823         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 824         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 825         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 826         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 827         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 828         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 829         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 830         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 831         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 832         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 833         PerlIO_printf(Perl_debug_log, "%s", close_str);                     \
 834     }
 835
 836
 837 #define DEBUG_STUDYDATA(str,data,depth)                              \
 838 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 839     PerlIO_printf(Perl_debug_log,                                    \
 840         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 841         " Flags: 0x%"UVXf,                                           \
 842         (int)(depth)*2, "",                                          \
 843         (IV)((data)->pos_min),                                       \
 844         (IV)((data)->pos_delta),                                     \
 845         (UV)((data)->flags)                                          \
 846     );                                                               \
 847     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
 848     PerlIO_printf(Perl_debug_log,                                    \
 849         " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
 850         (IV)((data)->whilem_c),                                      \
 851         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 852         is_inf ? "INF " : ""                                         \
 853     );                                                               \
 854     if ((data)->last_found)                                          \
 855         PerlIO_printf(Perl_debug_log,                                \
 856             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 857             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 858             SvPVX_const((data)->last_found),                         \
 859             (IV)((data)->last_end),                                  \
 860             (IV)((data)->last_start_min),                            \
 861             (IV)((data)->last_start_max),                            \
 862             ((data)->longest &&                                      \
 863              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 864             SvPVX_const((data)->longest_fixed),                      \
 865             (IV)((data)->offset_fixed),                              \
 866             ((data)->longest &&                                      \
 867              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 868             SvPVX_const((data)->longest_float),                      \
 869             (IV)((data)->offset_float_min),                          \
 870             (IV)((data)->offset_float_max)                           \
 871         );                                                           \
 872     PerlIO_printf(Perl_debug_log,"\n");                              \
 873 });
 874
 875 #ifdef DEBUGGING
 876
 877 /* is c a control character for which we have a mnemonic? */
 878 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 879
 880 STATIC const char *
 881 S_cntrl_to_mnemonic(const U8 c)
 882 {
 883     /* Returns the mnemonic string that represents character 'c', if one
 884      * exists; NULL otherwise.  The only ones that exist for the purposes of
 885      * this routine are a few control characters */
 886
 887     switch (c) {
 888         case '\a':       return "\\a";
 889         case '\b':       return "\\b";
 890         case ESC_NATIVE: return "\\e";
 891         case '\f':       return "\\f";
 892         case '\n':       return "\\n";
 893         case '\r':       return "\\r";
 894         case '\t':       return "\\t";
 895     }
 896
 897     return NULL;
 898 }
 899
 900 #endif
 901
 902 /* Mark that we cannot extend a found fixed substring at this point.
 903    Update the longest found anchored substring and the longest found
 904    floating substrings if needed. */
 905
 906 STATIC void
 907 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 908                     SSize_t *minlenp, int is_inf)
 909 {
 910     const STRLEN l = CHR_SVLEN(data->last_found);
 911     const STRLEN old_l = CHR_SVLEN(*data->longest);
 912     GET_RE_DEBUG_FLAGS_DECL;
 913
 914     PERL_ARGS_ASSERT_SCAN_COMMIT;
 915
 916     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 917         SvSetMagicSV(*data->longest, data->last_found);
 918         if (*data->longest == data->longest_fixed) {
 919             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 920             if (data->flags & SF_BEFORE_EOL)
 921                 data->flags
 922                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 923             else
 924                 data->flags &= ~SF_FIX_BEFORE_EOL;
 925             data->minlen_fixed=minlenp;
 926             data->lookbehind_fixed=0;
 927         }
 928         else { /* *data->longest == data->longest_float */
 929             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 930             data->offset_float_max = (l
 931                                       ? data->last_start_max
 932                                       : (data->pos_delta == SSize_t_MAX
 933                                          ? SSize_t_MAX
 934                                          : data->pos_min + data->pos_delta));
 935             if (is_inf
 936                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 937                 data->offset_float_max = SSize_t_MAX;
 938             if (data->flags & SF_BEFORE_EOL)
 939                 data->flags
 940                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 941             else
 942                 data->flags &= ~SF_FL_BEFORE_EOL;
 943             data->minlen_float=minlenp;
 944             data->lookbehind_float=0;
 945         }
 946     }
 947     SvCUR_set(data->last_found, 0);
 948     {
 949         SV * const sv = data->last_found;
 950         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 951             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 952             if (mg)
 953                 mg->mg_len = 0;
 954         }
 955     }
 956     data->last_end = -1;
 957     data->flags &= ~SF_BEFORE_EOL;
 958     DEBUG_STUDYDATA("commit: ",data,0);
 959 }
 960
 961 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 962  * list that describes which code points it matches */
 963
 964 STATIC void
 965 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 966 {
 967     /* Set the SSC 'ssc' to match an empty string or any code point */
 968
 969     PERL_ARGS_ASSERT_SSC_ANYTHING;
 970
 971     assert(is_ANYOF_SYNTHETIC(ssc));
 972
 973     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 974     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 975     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 976 }
 977
 978 STATIC int
 979 S_ssc_is_anything(const regnode_ssc *ssc)
 980 {
 981     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 982      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 983      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 984      * in any way, so there's no point in using it */
 985
 986     UV start, end;
 987     bool ret;
 988
 989     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 990
 991     assert(is_ANYOF_SYNTHETIC(ssc));
 992
 993     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
 994         return FALSE;
 995     }
 996
 997     /* See if the list consists solely of the range 0 - Infinity */
 998     invlist_iterinit(ssc->invlist);
 999     ret = invlist_iternext(ssc->invlist, &start, &end)
1000           && start == 0
1001           && end == UV_MAX;
1002
1003     invlist_iterfinish(ssc->invlist);
1004
1005     if (ret) {
1006         return TRUE;
1007     }
1008
1009     /* If e.g., both \w and \W are set, matches everything */
1010     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1011         int i;
1012         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1013             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1014                 return TRUE;
1015             }
1016         }
1017     }
1018
1019     return FALSE;
1020 }
1021
1022 STATIC void
1023 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1024 {
1025     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1026      * string, any code point, or any posix class under locale */
1027
1028     PERL_ARGS_ASSERT_SSC_INIT;
1029
1030     Zero(ssc, 1, regnode_ssc);
1031     set_ANYOF_SYNTHETIC(ssc);
1032     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1033     ssc_anything(ssc);
1034
1035     /* If any portion of the regex is to operate under locale rules,
1036      * initialization includes it.  The reason this isn't done for all regexes
1037      * is that the optimizer was written under the assumption that locale was
1038      * all-or-nothing.  Given the complexity and lack of documentation in the
1039      * optimizer, and that there are inadequate test cases for locale, many
1040      * parts of it may not work properly, it is safest to avoid locale unless
1041      * necessary. */
1042     if (RExC_contains_locale) {
1043         ANYOF_POSIXL_SETALL(ssc);
1044     }
1045     else {
1046         ANYOF_POSIXL_ZERO(ssc);
1047     }
1048 }
1049
1050 STATIC int
1051 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1052                         const regnode_ssc *ssc)
1053 {
1054     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1055      * to the list of code points matched, and locale posix classes; hence does
1056      * not check its flags) */
1057
1058     UV start, end;
1059     bool ret;
1060
1061     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1062
1063     assert(is_ANYOF_SYNTHETIC(ssc));
1064
1065     invlist_iterinit(ssc->invlist);
1066     ret = invlist_iternext(ssc->invlist, &start, &end)
1067           && start == 0
1068           && end == UV_MAX;
1069
1070     invlist_iterfinish(ssc->invlist);
1071
1072     if (! ret) {
1073         return FALSE;
1074     }
1075
1076     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1077         return FALSE;
1078     }
1079
1080     return TRUE;
1081 }
1082
1083 STATIC SV*
1084 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1085                                const regnode_charclass* const node)
1086 {
1087     /* Returns a mortal inversion list defining which code points are matched
1088      * by 'node', which is of type ANYOF.  Handles complementing the result if
1089      * appropriate.  If some code points aren't knowable at this time, the
1090      * returned list must, and will, contain every code point that is a
1091      * possibility. */
1092
1093     SV* invlist = sv_2mortal(_new_invlist(0));
1094     SV* only_utf8_locale_invlist = NULL;
1095     unsigned int i;
1096     const U32 n = ARG(node);
1097     bool new_node_has_latin1 = FALSE;
1098
1099     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1100
1101     /* Look at the data structure created by S_set_ANYOF_arg() */
1102     if (n != ANYOF_ONLY_HAS_BITMAP) {
1103         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1104         AV * const av = MUTABLE_AV(SvRV(rv));
1105         SV **const ary = AvARRAY(av);
1106         assert(RExC_rxi->data->what[n] == 's');
1107
1108         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1109             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1110         }
1111         else if (ary[0] && ary[0] != &PL_sv_undef) {
1112
1113             /* Here, no compile-time swash, and there are things that won't be
1114              * known until runtime -- we have to assume it could be anything */
1115             return _add_range_to_invlist(invlist, 0, UV_MAX);
1116         }
1117         else if (ary[3] && ary[3] != &PL_sv_undef) {
1118
1119             /* Here no compile-time swash, and no run-time only data.  Use the
1120              * node's inversion list */
1121             invlist = sv_2mortal(invlist_clone(ary[3]));
1122         }
1123
1124         /* Get the code points valid only under UTF-8 locales */
1125         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1126             && ary[2] && ary[2] != &PL_sv_undef)
1127         {
1128             only_utf8_locale_invlist = ary[2];
1129         }
1130     }
1131
1132     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1133      * code points, and an inversion list for the others, but if there are code
1134      * points that should match only conditionally on the target string being
1135      * UTF-8, those are placed in the inversion list, and not the bitmap.
1136      * Since there are circumstances under which they could match, they are
1137      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1138      * to exclude them here, so that when we invert below, the end result
1139      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1140      * have to do this here before we add the unconditionally matched code
1141      * points */
1142     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1143         _invlist_intersection_complement_2nd(invlist,
1144                                              PL_UpperLatin1,
1145                                              &invlist);
1146     }
1147
1148     /* Add in the points from the bit map */
1149     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1150         if (ANYOF_BITMAP_TEST(node, i)) {
1151             invlist = add_cp_to_invlist(invlist, i);
1152             new_node_has_latin1 = TRUE;
1153         }
1154     }
1155
1156     /* If this can match all upper Latin1 code points, have to add them
1157      * as well */
1158     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1159         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1160     }
1161
1162     /* Similarly for these */
1163     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1164         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1165     }
1166
1167     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168         _invlist_invert(invlist);
1169     }
1170     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1171
1172         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1173          * locale.  We can skip this if there are no 0-255 at all. */
1174         _invlist_union(invlist, PL_Latin1, &invlist);
1175     }
1176
1177     /* Similarly add the UTF-8 locale possible matches.  These have to be
1178      * deferred until after the non-UTF-8 locale ones are taken care of just
1179      * above, or it leads to wrong results under ANYOF_INVERT */
1180     if (only_utf8_locale_invlist) {
1181         _invlist_union_maybe_complement_2nd(invlist,
1182                                             only_utf8_locale_invlist,
1183                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1184                                             &invlist);
1185     }
1186
1187     return invlist;
1188 }
1189
1190 /* These two functions currently do the exact same thing */
1191 #define ssc_init_zero           ssc_init
1192
1193 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1194 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1195
1196 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1197  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1198  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1199
1200 STATIC void
1201 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1202                 const regnode_charclass *and_with)
1203 {
1204     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1205      * another SSC or a regular ANYOF class.  Can create false positives. */
1206
1207     SV* anded_cp_list;
1208     U8  anded_flags;
1209
1210     PERL_ARGS_ASSERT_SSC_AND;
1211
1212     assert(is_ANYOF_SYNTHETIC(ssc));
1213
1214     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1215      * the code point inversion list and just the relevant flags */
1216     if (is_ANYOF_SYNTHETIC(and_with)) {
1217         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1218         anded_flags = ANYOF_FLAGS(and_with);
1219
1220         /* XXX This is a kludge around what appears to be deficiencies in the
1221          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1222          * there are paths through the optimizer where it doesn't get weeded
1223          * out when it should.  And if we don't make some extra provision for
1224          * it like the code just below, it doesn't get added when it should.
1225          * This solution is to add it only when AND'ing, which is here, and
1226          * only when what is being AND'ed is the pristine, original node
1227          * matching anything.  Thus it is like adding it to ssc_anything() but
1228          * only when the result is to be AND'ed.  Probably the same solution
1229          * could be adopted for the same problem we have with /l matching,
1230          * which is solved differently in S_ssc_init(), and that would lead to
1231          * fewer false positives than that solution has.  But if this solution
1232          * creates bugs, the consequences are only that a warning isn't raised
1233          * that should be; while the consequences for having /l bugs is
1234          * incorrect matches */
1235         if (ssc_is_anything((regnode_ssc *)and_with)) {
1236             anded_flags |= ANYOF_WARN_SUPER;
1237         }
1238     }
1239     else {
1240         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1241         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1242     }
1243
1244     ANYOF_FLAGS(ssc) &= anded_flags;
1245
1246     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1247      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1248      * 'and_with' may be inverted.  When not inverted, we have the situation of
1249      * computing:
1250      *  (C1 | P1) & (C2 | P2)
1251      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1252      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1253      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1254      *                    <=  ((C1 & C2) | P1 | P2)
1255      * Alternatively, the last few steps could be:
1256      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1257      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1258      *                    <=  (C1 | C2 | (P1 & P2))
1259      * We favor the second approach if either P1 or P2 is non-empty.  This is
1260      * because these components are a barrier to doing optimizations, as what
1261      * they match cannot be known until the moment of matching as they are
1262      * dependent on the current locale, 'AND"ing them likely will reduce or
1263      * eliminate them.
1264      * But we can do better if we know that C1,P1 are in their initial state (a
1265      * frequent occurrence), each matching everything:
1266      *  (<everything>) & (C2 | P2) =  C2 | P2
1267      * Similarly, if C2,P2 are in their initial state (again a frequent
1268      * occurrence), the result is a no-op
1269      *  (C1 | P1) & (<everything>) =  C1 | P1
1270      *
1271      * Inverted, we have
1272      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1273      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1274      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1275      * */
1276
1277     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1278         && ! is_ANYOF_SYNTHETIC(and_with))
1279     {
1280         unsigned int i;
1281
1282         ssc_intersection(ssc,
1283                          anded_cp_list,
1284                          FALSE /* Has already been inverted */
1285                          );
1286
1287         /* If either P1 or P2 is empty, the intersection will be also; can skip
1288          * the loop */
1289         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1290             ANYOF_POSIXL_ZERO(ssc);
1291         }
1292         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1293
1294             /* Note that the Posix class component P from 'and_with' actually
1295              * looks like:
1296              *      P = Pa | Pb | ... | Pn
1297              * where each component is one posix class, such as in [\w\s].
1298              * Thus
1299              *      ~P = ~(Pa | Pb | ... | Pn)
1300              *         = ~Pa & ~Pb & ... & ~Pn
1301              *        <= ~Pa | ~Pb | ... | ~Pn
1302              * The last is something we can easily calculate, but unfortunately
1303              * is likely to have many false positives.  We could do better
1304              * in some (but certainly not all) instances if two classes in
1305              * P have known relationships.  For example
1306              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1307              * So
1308              *      :lower: & :print: = :lower:
1309              * And similarly for classes that must be disjoint.  For example,
1310              * since \s and \w can have no elements in common based on rules in
1311              * the POSIX standard,
1312              *      \w & ^\S = nothing
1313              * Unfortunately, some vendor locales do not meet the Posix
1314              * standard, in particular almost everything by Microsoft.
1315              * The loop below just changes e.g., \w into \W and vice versa */
1316
1317             regnode_charclass_posixl temp;
1318             int add = 1;    /* To calculate the index of the complement */
1319
1320             ANYOF_POSIXL_ZERO(&temp);
1321             for (i = 0; i < ANYOF_MAX; i++) {
1322                 assert(i % 2 != 0
1323                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1324                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1325
1326                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1327                     ANYOF_POSIXL_SET(&temp, i + add);
1328                 }
1329                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1330             }
1331             ANYOF_POSIXL_AND(&temp, ssc);
1332
1333         } /* else ssc already has no posixes */
1334     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1335          in its initial state */
1336     else if (! is_ANYOF_SYNTHETIC(and_with)
1337              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1338     {
1339         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1340          * copy it over 'ssc' */
1341         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1342             if (is_ANYOF_SYNTHETIC(and_with)) {
1343                 StructCopy(and_with, ssc, regnode_ssc);
1344             }
1345             else {
1346                 ssc->invlist = anded_cp_list;
1347                 ANYOF_POSIXL_ZERO(ssc);
1348                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1349                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1350                 }
1351             }
1352         }
1353         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1354                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1355         {
1356             /* One or the other of P1, P2 is non-empty. */
1357             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1358                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1359             }
1360             ssc_union(ssc, anded_cp_list, FALSE);
1361         }
1362         else { /* P1 = P2 = empty */
1363             ssc_intersection(ssc, anded_cp_list, FALSE);
1364         }
1365     }
1366 }
1367
1368 STATIC void
1369 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1370                const regnode_charclass *or_with)
1371 {
1372     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1373      * another SSC or a regular ANYOF class.  Can create false positives if
1374      * 'or_with' is to be inverted. */
1375
1376     SV* ored_cp_list;
1377     U8 ored_flags;
1378
1379     PERL_ARGS_ASSERT_SSC_OR;
1380
1381     assert(is_ANYOF_SYNTHETIC(ssc));
1382
1383     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1384      * the code point inversion list and just the relevant flags */
1385     if (is_ANYOF_SYNTHETIC(or_with)) {
1386         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1387         ored_flags = ANYOF_FLAGS(or_with);
1388     }
1389     else {
1390         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1391         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1392     }
1393
1394     ANYOF_FLAGS(ssc) |= ored_flags;
1395
1396     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1397      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1398      * 'or_with' may be inverted.  When not inverted, we have the simple
1399      * situation of computing:
1400      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1401      * If P1|P2 yields a situation with both a class and its complement are
1402      * set, like having both \w and \W, this matches all code points, and we
1403      * can delete these from the P component of the ssc going forward.  XXX We
1404      * might be able to delete all the P components, but I (khw) am not certain
1405      * about this, and it is better to be safe.
1406      *
1407      * Inverted, we have
1408      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1409      *                         <=  (C1 | P1) | ~C2
1410      *                         <=  (C1 | ~C2) | P1
1411      * (which results in actually simpler code than the non-inverted case)
1412      * */
1413
1414     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1415         && ! is_ANYOF_SYNTHETIC(or_with))
1416     {
1417         /* We ignore P2, leaving P1 going forward */
1418     }   /* else  Not inverted */
1419     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1420         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1421         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1422             unsigned int i;
1423             for (i = 0; i < ANYOF_MAX; i += 2) {
1424                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1425                 {
1426                     ssc_match_all_cp(ssc);
1427                     ANYOF_POSIXL_CLEAR(ssc, i);
1428                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1429                 }
1430             }
1431         }
1432     }
1433
1434     ssc_union(ssc,
1435               ored_cp_list,
1436               FALSE /* Already has been inverted */
1437               );
1438 }
1439
1440 PERL_STATIC_INLINE void
1441 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1442 {
1443     PERL_ARGS_ASSERT_SSC_UNION;
1444
1445     assert(is_ANYOF_SYNTHETIC(ssc));
1446
1447     _invlist_union_maybe_complement_2nd(ssc->invlist,
1448                                         invlist,
1449                                         invert2nd,
1450                                         &ssc->invlist);
1451 }
1452
1453 PERL_STATIC_INLINE void
1454 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1455                          SV* const invlist,
1456                          const bool invert2nd)
1457 {
1458     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1459
1460     assert(is_ANYOF_SYNTHETIC(ssc));
1461
1462     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1463                                                invlist,
1464                                                invert2nd,
1465                                                &ssc->invlist);
1466 }
1467
1468 PERL_STATIC_INLINE void
1469 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1470 {
1471     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1472
1473     assert(is_ANYOF_SYNTHETIC(ssc));
1474
1475     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1476 }
1477
1478 PERL_STATIC_INLINE void
1479 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1480 {
1481     /* AND just the single code point 'cp' into the SSC 'ssc' */
1482
1483     SV* cp_list = _new_invlist(2);
1484
1485     PERL_ARGS_ASSERT_SSC_CP_AND;
1486
1487     assert(is_ANYOF_SYNTHETIC(ssc));
1488
1489     cp_list = add_cp_to_invlist(cp_list, cp);
1490     ssc_intersection(ssc, cp_list,
1491                      FALSE /* Not inverted */
1492                      );
1493     SvREFCNT_dec_NN(cp_list);
1494 }
1495
1496 PERL_STATIC_INLINE void
1497 S_ssc_clear_locale(regnode_ssc *ssc)
1498 {
1499     /* Set the SSC 'ssc' to not match any locale things */
1500     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1501
1502     assert(is_ANYOF_SYNTHETIC(ssc));
1503
1504     ANYOF_POSIXL_ZERO(ssc);
1505     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1506 }
1507
1508 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1509
1510 STATIC bool
1511 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1512 {
1513     /* The synthetic start class is used to hopefully quickly winnow down
1514      * places where a pattern could start a match in the target string.  If it
1515      * doesn't really narrow things down that much, there isn't much point to
1516      * having the overhead of using it.  This function uses some very crude
1517      * heuristics to decide if to use the ssc or not.
1518      *
1519      * It returns TRUE if 'ssc' rules out more than half what it considers to
1520      * be the "likely" possible matches, but of course it doesn't know what the
1521      * actual things being matched are going to be; these are only guesses
1522      *
1523      * For /l matches, it assumes that the only likely matches are going to be
1524      *      in the 0-255 range, uniformly distributed, so half of that is 127
1525      * For /a and /d matches, it assumes that the likely matches will be just
1526      *      the ASCII range, so half of that is 63
1527      * For /u and there isn't anything matching above the Latin1 range, it
1528      *      assumes that that is the only range likely to be matched, and uses
1529      *      half that as the cut-off: 127.  If anything matches above Latin1,
1530      *      it assumes that all of Unicode could match (uniformly), except for
1531      *      non-Unicode code points and things in the General Category "Other"
1532      *      (unassigned, private use, surrogates, controls and formats).  This
1533      *      is a much large number. */
1534
1535     const U32 max_match = (LOC)
1536                           ? 127
1537                           : (! UNI_SEMANTICS)
1538                             ? 63
1539                             : (invlist_highest(ssc->invlist) < 256)
1540                               ? 127
1541                               : ((NON_OTHER_COUNT + 1) / 2) - 1;
1542     U32 count = 0;      /* Running total of number of code points matched by
1543                            'ssc' */
1544     UV start, end;      /* Start and end points of current range in inversion
1545                            list */
1546
1547     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1548
1549     invlist_iterinit(ssc->invlist);
1550     while (invlist_iternext(ssc->invlist, &start, &end)) {
1551
1552         /* /u is the only thing that we expect to match above 255; so if not /u
1553          * and even if there are matches above 255, ignore them.  This catches
1554          * things like \d under /d which does match the digits above 255, but
1555          * since the pattern is /d, it is not likely to be expecting them */
1556         if (! UNI_SEMANTICS) {
1557             if (start > 255) {
1558                 break;
1559             }
1560             end = MIN(end, 255);
1561         }
1562         count += end - start + 1;
1563         if (count > max_match) {
1564             invlist_iterfinish(ssc->invlist);
1565             return FALSE;
1566         }
1567     }
1568
1569     return TRUE;
1570 }
1571
1572
1573 STATIC void
1574 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1575 {
1576     /* The inversion list in the SSC is marked mortal; now we need a more
1577      * permanent copy, which is stored the same way that is done in a regular
1578      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1579      * map */
1580
1581     SV* invlist = invlist_clone(ssc->invlist);
1582
1583     PERL_ARGS_ASSERT_SSC_FINALIZE;
1584
1585     assert(is_ANYOF_SYNTHETIC(ssc));
1586
1587     /* The code in this file assumes that all but these flags aren't relevant
1588      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1589      * by the time we reach here */
1590     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1591
1592     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1593
1594     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1595                                 NULL, NULL, NULL, FALSE);
1596
1597     /* Make sure is clone-safe */
1598     ssc->invlist = NULL;
1599
1600     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1601         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1602     }
1603
1604     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1605 }
1606
1607 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1608 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1609 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1610 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1611                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1612                                : 0 )
1613
1614
1615 #ifdef DEBUGGING
1616 /*
1617    dump_trie(trie,widecharmap,revcharmap)
1618    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1619    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1620
1621    These routines dump out a trie in a somewhat readable format.
1622    The _interim_ variants are used for debugging the interim
1623    tables that are used to generate the final compressed
1624    representation which is what dump_trie expects.
1625
1626    Part of the reason for their existence is to provide a form
1627    of documentation as to how the different representations function.
1628
1629 */
1630
1631 /*
1632   Dumps the final compressed table form of the trie to Perl_debug_log.
1633   Used for debugging make_trie().
1634 */
1635
1636 STATIC void
1637 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1638             AV *revcharmap, U32 depth)
1639 {
1640     U32 state;
1641     SV *sv=sv_newmortal();
1642     int colwidth= widecharmap ? 6 : 4;
1643     U16 word;
1644     GET_RE_DEBUG_FLAGS_DECL;
1645
1646     PERL_ARGS_ASSERT_DUMP_TRIE;
1647
1648     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1649         (int)depth * 2 + 2,"",
1650         "Match","Base","Ofs" );
1651
1652     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1653         SV ** const tmp = av_fetch( revcharmap, state, 0);
1654         if ( tmp ) {
1655             PerlIO_printf( Perl_debug_log, "%*s",
1656                 colwidth,
1657                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1658                             PL_colors[0], PL_colors[1],
1659                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1660                             PERL_PV_ESCAPE_FIRSTCHAR
1661                 )
1662             );
1663         }
1664     }
1665     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1666         (int)depth * 2 + 2,"");
1667
1668     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1669         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1670     PerlIO_printf( Perl_debug_log, "\n");
1671
1672     for( state = 1 ; state < trie->statecount ; state++ ) {
1673         const U32 base = trie->states[ state ].trans.base;
1674
1675         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1676                                        (int)depth * 2 + 2,"", (UV)state);
1677
1678         if ( trie->states[ state ].wordnum ) {
1679             PerlIO_printf( Perl_debug_log, " W%4X",
1680                                            trie->states[ state ].wordnum );
1681         } else {
1682             PerlIO_printf( Perl_debug_log, "%6s", "" );
1683         }
1684
1685         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1686
1687         if ( base ) {
1688             U32 ofs = 0;
1689
1690             while( ( base + ofs  < trie->uniquecharcount ) ||
1691                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1692                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1693                                                                     != state))
1694                     ofs++;
1695
1696             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1697
1698             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1699                 if ( ( base + ofs >= trie->uniquecharcount )
1700                         && ( base + ofs - trie->uniquecharcount
1701                                                         < trie->lasttrans )
1702                         && trie->trans[ base + ofs
1703                                     - trie->uniquecharcount ].check == state )
1704                 {
1705                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1706                     colwidth,
1707                     (UV)trie->trans[ base + ofs
1708                                              - trie->uniquecharcount ].next );
1709                 } else {
1710                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1711                 }
1712             }
1713
1714             PerlIO_printf( Perl_debug_log, "]");
1715
1716         }
1717         PerlIO_printf( Perl_debug_log, "\n" );
1718     }
1719     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1720                                 (int)depth*2, "");
1721     for (word=1; word <= trie->wordcount; word++) {
1722         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1723             (int)word, (int)(trie->wordinfo[word].prev),
1724             (int)(trie->wordinfo[word].len));
1725     }
1726     PerlIO_printf(Perl_debug_log, "\n" );
1727 }
1728 /*
1729   Dumps a fully constructed but uncompressed trie in list form.
1730   List tries normally only are used for construction when the number of
1731   possible chars (trie->uniquecharcount) is very high.
1732   Used for debugging make_trie().
1733 */
1734 STATIC void
1735 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1736                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1737                          U32 depth)
1738 {
1739     U32 state;
1740     SV *sv=sv_newmortal();
1741     int colwidth= widecharmap ? 6 : 4;
1742     GET_RE_DEBUG_FLAGS_DECL;
1743
1744     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1745
1746     /* print out the table precompression.  */
1747     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1748         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1749         "------:-----+-----------------\n" );
1750
1751     for( state=1 ; state < next_alloc ; state ++ ) {
1752         U16 charid;
1753
1754         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1755             (int)depth * 2 + 2,"", (UV)state  );
1756         if ( ! trie->states[ state ].wordnum ) {
1757             PerlIO_printf( Perl_debug_log, "%5s| ","");
1758         } else {
1759             PerlIO_printf( Perl_debug_log, "W%4x| ",
1760                 trie->states[ state ].wordnum
1761             );
1762         }
1763         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1764             SV ** const tmp = av_fetch( revcharmap,
1765                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1766             if ( tmp ) {
1767                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1768                     colwidth,
1769                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1770                               colwidth,
1771                               PL_colors[0], PL_colors[1],
1772                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1773                               | PERL_PV_ESCAPE_FIRSTCHAR
1774                     ) ,
1775                     TRIE_LIST_ITEM(state,charid).forid,
1776                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1777                 );
1778                 if (!(charid % 10))
1779                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1780                         (int)((depth * 2) + 14), "");
1781             }
1782         }
1783         PerlIO_printf( Perl_debug_log, "\n");
1784     }
1785 }
1786
1787 /*
1788   Dumps a fully constructed but uncompressed trie in table form.
1789   This is the normal DFA style state transition table, with a few
1790   twists to facilitate compression later.
1791   Used for debugging make_trie().
1792 */
1793 STATIC void
1794 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1795                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1796                           U32 depth)
1797 {
1798     U32 state;
1799     U16 charid;
1800     SV *sv=sv_newmortal();
1801     int colwidth= widecharmap ? 6 : 4;
1802     GET_RE_DEBUG_FLAGS_DECL;
1803
1804     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1805
1806     /*
1807        print out the table precompression so that we can do a visual check
1808        that they are identical.
1809      */
1810
1811     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1812
1813     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1814         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1815         if ( tmp ) {
1816             PerlIO_printf( Perl_debug_log, "%*s",
1817                 colwidth,
1818                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1819                             PL_colors[0], PL_colors[1],
1820                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1821                             PERL_PV_ESCAPE_FIRSTCHAR
1822                 )
1823             );
1824         }
1825     }
1826
1827     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1828
1829     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1830         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1831     }
1832
1833     PerlIO_printf( Perl_debug_log, "\n" );
1834
1835     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1836
1837         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1838             (int)depth * 2 + 2,"",
1839             (UV)TRIE_NODENUM( state ) );
1840
1841         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1842             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1843             if (v)
1844                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1845             else
1846                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1847         }
1848         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1849             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1850                                             (UV)trie->trans[ state ].check );
1851         } else {
1852             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1853                                             (UV)trie->trans[ state ].check,
1854             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1855         }
1856     }
1857 }
1858
1859 #endif
1860
1861
1862 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1863   startbranch: the first branch in the whole branch sequence
1864   first      : start branch of sequence of branch-exact nodes.
1865                May be the same as startbranch
1866   last       : Thing following the last branch.
1867                May be the same as tail.
1868   tail       : item following the branch sequence
1869   count      : words in the sequence
1870   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1871   depth      : indent depth
1872
1873 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1874
1875 A trie is an N'ary tree where the branches are determined by digital
1876 decomposition of the key. IE, at the root node you look up the 1st character and
1877 follow that branch repeat until you find the end of the branches. Nodes can be
1878 marked as "accepting" meaning they represent a complete word. Eg:
1879
1880   /he|she|his|hers/
1881
1882 would convert into the following structure. Numbers represent states, letters
1883 following numbers represent valid transitions on the letter from that state, if
1884 the number is in square brackets it represents an accepting state, otherwise it
1885 will be in parenthesis.
1886
1887       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1888       |    |
1889       |   (2)
1890       |    |
1891      (1)   +-i->(6)-+-s->[7]
1892       |
1893       +-s->(3)-+-h->(4)-+-e->[5]
1894
1895       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1896
1897 This shows that when matching against the string 'hers' we will begin at state 1
1898 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1899 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1900 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1901 single traverse. We store a mapping from accepting to state to which word was
1902 matched, and then when we have multiple possibilities we try to complete the
1903 rest of the regex in the order in which they occured in the alternation.
1904
1905 The only prior NFA like behaviour that would be changed by the TRIE support is
1906 the silent ignoring of duplicate alternations which are of the form:
1907
1908  / (DUPE|DUPE) X? (?{ ... }) Y /x
1909
1910 Thus EVAL blocks following a trie may be called a different number of times with
1911 and without the optimisation. With the optimisations dupes will be silently
1912 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1913 the following demonstrates:
1914
1915  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1916
1917 which prints out 'word' three times, but
1918
1919  'words'=~/(word|word|word)(?{ print $1 })S/
1920
1921 which doesnt print it out at all. This is due to other optimisations kicking in.
1922
1923 Example of what happens on a structural level:
1924
1925 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1926
1927    1: CURLYM[1] {1,32767}(18)
1928    5:   BRANCH(8)
1929    6:     EXACT <ac>(16)
1930    8:   BRANCH(11)
1931    9:     EXACT <ad>(16)
1932   11:   BRANCH(14)
1933   12:     EXACT <ab>(16)
1934   16:   SUCCEED(0)
1935   17:   NOTHING(18)
1936   18: END(0)
1937
1938 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1939 and should turn into:
1940
1941    1: CURLYM[1] {1,32767}(18)
1942    5:   TRIE(16)
1943         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1944           <ac>
1945           <ad>
1946           <ab>
1947   16:   SUCCEED(0)
1948   17:   NOTHING(18)
1949   18: END(0)
1950
1951 Cases where tail != last would be like /(?foo|bar)baz/:
1952
1953    1: BRANCH(4)
1954    2:   EXACT <foo>(8)
1955    4: BRANCH(7)
1956    5:   EXACT <bar>(8)
1957    7: TAIL(8)
1958    8: EXACT <baz>(10)
1959   10: END(0)
1960
1961 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1962 and would end up looking like:
1963
1964     1: TRIE(8)
1965       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1966         <foo>
1967         <bar>
1968    7: TAIL(8)
1969    8: EXACT <baz>(10)
1970   10: END(0)
1971
1972     d = uvchr_to_utf8_flags(d, uv, 0);
1973
1974 is the recommended Unicode-aware way of saying
1975
1976     *(d++) = uv;
1977 */
1978
1979 #define TRIE_STORE_REVCHAR(val)                                            \
1980     STMT_START {                                                           \
1981         if (UTF) {                                                         \
1982             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1983             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1984             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1985             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1986             SvPOK_on(zlopp);                                               \
1987             SvUTF8_on(zlopp);                                              \
1988             av_push(revcharmap, zlopp);                                    \
1989         } else {                                                           \
1990             char ooooff = (char)val;                                           \
1991             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1992         }                                                                  \
1993         } STMT_END
1994
1995 /* This gets the next character from the input, folding it if not already
1996  * folded. */
1997 #define TRIE_READ_CHAR STMT_START {                                           \
1998     wordlen++;                                                                \
1999     if ( UTF ) {                                                              \
2000         /* if it is UTF then it is either already folded, or does not need    \
2001          * folding */                                                         \
2002         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2003     }                                                                         \
2004     else if (folder == PL_fold_latin1) {                                      \
2005         /* This folder implies Unicode rules, which in the range expressible  \
2006          *  by not UTF is the lower case, with the two exceptions, one of     \
2007          *  which should have been taken care of before calling this */       \
2008         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2009         uvc = toLOWER_L1(*uc);                                                \
2010         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2011         len = 1;                                                              \
2012     } else {                                                                  \
2013         /* raw data, will be folded later if needed */                        \
2014         uvc = (U32)*uc;                                                       \
2015         len = 1;                                                              \
2016     }                                                                         \
2017 } STMT_END
2018
2019
2020
2021 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2022     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2023         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2024         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2025     }                                                           \
2026     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2027     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2028     TRIE_LIST_CUR( state )++;                                   \
2029 } STMT_END
2030
2031 #define TRIE_LIST_NEW(state) STMT_START {                       \
2032     Newxz( trie->states[ state ].trans.list,               \
2033         4, reg_trie_trans_le );                                 \
2034      TRIE_LIST_CUR( state ) = 1;                                \
2035      TRIE_LIST_LEN( state ) = 4;                                \
2036 } STMT_END
2037
2038 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2039     U16 dupe= trie->states[ state ].wordnum;                    \
2040     regnode * const noper_next = regnext( noper );              \
2041                                                                 \
2042     DEBUG_r({                                                   \
2043         /* store the word for dumping */                        \
2044         SV* tmp;                                                \
2045         if (OP(noper) != NOTHING)                               \
2046             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2047         else                                                    \
2048             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2049         av_push( trie_words, tmp );                             \
2050     });                                                         \
2051                                                                 \
2052     curword++;                                                  \
2053     trie->wordinfo[curword].prev   = 0;                         \
2054     trie->wordinfo[curword].len    = wordlen;                   \
2055     trie->wordinfo[curword].accept = state;                     \
2056                                                                 \
2057     if ( noper_next < tail ) {                                  \
2058         if (!trie->jump)                                        \
2059             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2060                                                  sizeof(U16) ); \
2061         trie->jump[curword] = (U16)(noper_next - convert);      \
2062         if (!jumper)                                            \
2063             jumper = noper_next;                                \
2064         if (!nextbranch)                                        \
2065             nextbranch= regnext(cur);                           \
2066     }                                                           \
2067                                                                 \
2068     if ( dupe ) {                                               \
2069         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2070         /* chain, so that when the bits of chain are later    */\
2071         /* linked together, the dups appear in the chain      */\
2072         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2073         trie->wordinfo[dupe].prev = curword;                    \
2074     } else {                                                    \
2075         /* we haven't inserted this word yet.                */ \
2076         trie->states[ state ].wordnum = curword;                \
2077     }                                                           \
2078 } STMT_END
2079
2080
2081 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2082      ( ( base + charid >=  ucharcount                                   \
2083          && base + charid < ubound                                      \
2084          && state == trie->trans[ base - ucharcount + charid ].check    \
2085          && trie->trans[ base - ucharcount + charid ].next )            \
2086            ? trie->trans[ base - ucharcount + charid ].next             \
2087            : ( state==1 ? special : 0 )                                 \
2088       )
2089
2090 #define MADE_TRIE       1
2091 #define MADE_JUMP_TRIE  2
2092 #define MADE_EXACT_TRIE 4
2093
2094 STATIC I32
2095 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2096                   regnode *first, regnode *last, regnode *tail,
2097                   U32 word_count, U32 flags, U32 depth)
2098 {
2099     /* first pass, loop through and scan words */
2100     reg_trie_data *trie;
2101     HV *widecharmap = NULL;
2102     AV *revcharmap = newAV();
2103     regnode *cur;
2104     STRLEN len = 0;
2105     UV uvc = 0;
2106     U16 curword = 0;
2107     U32 next_alloc = 0;
2108     regnode *jumper = NULL;
2109     regnode *nextbranch = NULL;
2110     regnode *convert = NULL;
2111     U32 *prev_states; /* temp array mapping each state to previous one */
2112     /* we just use folder as a flag in utf8 */
2113     const U8 * folder = NULL;
2114
2115 #ifdef DEBUGGING
2116     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2117     AV *trie_words = NULL;
2118     /* along with revcharmap, this only used during construction but both are
2119      * useful during debugging so we store them in the struct when debugging.
2120      */
2121 #else
2122     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2123     STRLEN trie_charcount=0;
2124 #endif
2125     SV *re_trie_maxbuff;
2126     GET_RE_DEBUG_FLAGS_DECL;
2127
2128     PERL_ARGS_ASSERT_MAKE_TRIE;
2129 #ifndef DEBUGGING
2130     PERL_UNUSED_ARG(depth);
2131 #endif
2132
2133     switch (flags) {
2134         case EXACT: break;
2135         case EXACTFA:
2136         case EXACTFU_SS:
2137         case EXACTFU: folder = PL_fold_latin1; break;
2138         case EXACTF:  folder = PL_fold; break;
2139         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2140     }
2141
2142     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2143     trie->refcount = 1;
2144     trie->startstate = 1;
2145     trie->wordcount = word_count;
2146     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2147     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2148     if (flags == EXACT)
2149         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2150     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2151                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2152
2153     DEBUG_r({
2154         trie_words = newAV();
2155     });
2156
2157     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2158     assert(re_trie_maxbuff);
2159     if (!SvIOK(re_trie_maxbuff)) {
2160         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2161     }
2162     DEBUG_TRIE_COMPILE_r({
2163         PerlIO_printf( Perl_debug_log,
2164           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2165           (int)depth * 2 + 2, "",
2166           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2167           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2168     });
2169
2170    /* Find the node we are going to overwrite */
2171     if ( first == startbranch && OP( last ) != BRANCH ) {
2172         /* whole branch chain */
2173         convert = first;
2174     } else {
2175         /* branch sub-chain */
2176         convert = NEXTOPER( first );
2177     }
2178
2179     /*  -- First loop and Setup --
2180
2181        We first traverse the branches and scan each word to determine if it
2182        contains widechars, and how many unique chars there are, this is
2183        important as we have to build a table with at least as many columns as we
2184        have unique chars.
2185
2186        We use an array of integers to represent the character codes 0..255
2187        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2188        the native representation of the character value as the key and IV's for
2189        the coded index.
2190
2191        *TODO* If we keep track of how many times each character is used we can
2192        remap the columns so that the table compression later on is more
2193        efficient in terms of memory by ensuring the most common value is in the
2194        middle and the least common are on the outside.  IMO this would be better
2195        than a most to least common mapping as theres a decent chance the most
2196        common letter will share a node with the least common, meaning the node
2197        will not be compressible. With a middle is most common approach the worst
2198        case is when we have the least common nodes twice.
2199
2200      */
2201
2202     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2203         regnode *noper = NEXTOPER( cur );
2204         const U8 *uc = (U8*)STRING( noper );
2205         const U8 *e  = uc + STR_LEN( noper );
2206         int foldlen = 0;
2207         U32 wordlen      = 0;         /* required init */
2208         STRLEN minchars = 0;
2209         STRLEN maxchars = 0;
2210         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2211                                                bitmap?*/
2212
2213         if (OP(noper) == NOTHING) {
2214             regnode *noper_next= regnext(noper);
2215             if (noper_next != tail && OP(noper_next) == flags) {
2216                 noper = noper_next;
2217                 uc= (U8*)STRING(noper);
2218                 e= uc + STR_LEN(noper);
2219                 trie->minlen= STR_LEN(noper);
2220             } else {
2221                 trie->minlen= 0;
2222                 continue;
2223             }
2224         }
2225
2226         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2227             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2228                                           regardless of encoding */
2229             if (OP( noper ) == EXACTFU_SS) {
2230                 /* false positives are ok, so just set this */
2231                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2232             }
2233         }
2234         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2235                                            branch */
2236             TRIE_CHARCOUNT(trie)++;
2237             TRIE_READ_CHAR;
2238
2239             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2240              * is in effect.  Under /i, this character can match itself, or
2241              * anything that folds to it.  If not under /i, it can match just
2242              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2243              * all fold to k, and all are single characters.   But some folds
2244              * expand to more than one character, so for example LATIN SMALL
2245              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2246              * the string beginning at 'uc' is 'ffi', it could be matched by
2247              * three characters, or just by the one ligature character. (It
2248              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2249              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2250              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2251              * match.)  The trie needs to know the minimum and maximum number
2252              * of characters that could match so that it can use size alone to
2253              * quickly reject many match attempts.  The max is simple: it is
2254              * the number of folded characters in this branch (since a fold is
2255              * never shorter than what folds to it. */
2256
2257             maxchars++;
2258
2259             /* And the min is equal to the max if not under /i (indicated by
2260              * 'folder' being NULL), or there are no multi-character folds.  If
2261              * there is a multi-character fold, the min is incremented just
2262              * once, for the character that folds to the sequence.  Each
2263              * character in the sequence needs to be added to the list below of
2264              * characters in the trie, but we count only the first towards the
2265              * min number of characters needed.  This is done through the
2266              * variable 'foldlen', which is returned by the macros that look
2267              * for these sequences as the number of bytes the sequence
2268              * occupies.  Each time through the loop, we decrement 'foldlen' by
2269              * how many bytes the current char occupies.  Only when it reaches
2270              * 0 do we increment 'minchars' or look for another multi-character
2271              * sequence. */
2272             if (folder == NULL) {
2273                 minchars++;
2274             }
2275             else if (foldlen > 0) {
2276                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2277             }
2278             else {
2279                 minchars++;
2280
2281                 /* See if *uc is the beginning of a multi-character fold.  If
2282                  * so, we decrement the length remaining to look at, to account
2283                  * for the current character this iteration.  (We can use 'uc'
2284                  * instead of the fold returned by TRIE_READ_CHAR because for
2285                  * non-UTF, the latin1_safe macro is smart enough to account
2286                  * for all the unfolded characters, and because for UTF, the
2287                  * string will already have been folded earlier in the
2288                  * compilation process */
2289                 if (UTF) {
2290                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2291                         foldlen -= UTF8SKIP(uc);
2292                     }
2293                 }
2294                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2295                     foldlen--;
2296                 }
2297             }
2298
2299             /* The current character (and any potential folds) should be added
2300              * to the possible matching characters for this position in this
2301              * branch */
2302             if ( uvc < 256 ) {
2303                 if ( folder ) {
2304                     U8 folded= folder[ (U8) uvc ];
2305                     if ( !trie->charmap[ folded ] ) {
2306                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2307                         TRIE_STORE_REVCHAR( folded );
2308                     }
2309                 }
2310                 if ( !trie->charmap[ uvc ] ) {
2311                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2312                     TRIE_STORE_REVCHAR( uvc );
2313                 }
2314                 if ( set_bit ) {
2315                     /* store the codepoint in the bitmap, and its folded
2316                      * equivalent. */
2317                     TRIE_BITMAP_SET(trie, uvc);
2318
2319                     /* store the folded codepoint */
2320                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2321
2322                     if ( !UTF ) {
2323                         /* store first byte of utf8 representation of
2324                            variant codepoints */
2325                         if (! UVCHR_IS_INVARIANT(uvc)) {
2326                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2327                         }
2328                     }
2329                     set_bit = 0; /* We've done our bit :-) */
2330                 }
2331             } else {
2332
2333                 /* XXX We could come up with the list of code points that fold
2334                  * to this using PL_utf8_foldclosures, except not for
2335                  * multi-char folds, as there may be multiple combinations
2336                  * there that could work, which needs to wait until runtime to
2337                  * resolve (The comment about LIGATURE FFI above is such an
2338                  * example */
2339
2340                 SV** svpp;
2341                 if ( !widecharmap )
2342                     widecharmap = newHV();
2343
2344                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2345
2346                 if ( !svpp )
2347                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2348
2349                 if ( !SvTRUE( *svpp ) ) {
2350                     sv_setiv( *svpp, ++trie->uniquecharcount );
2351                     TRIE_STORE_REVCHAR(uvc);
2352                 }
2353             }
2354         } /* end loop through characters in this branch of the trie */
2355
2356         /* We take the min and max for this branch and combine to find the min
2357          * and max for all branches processed so far */
2358         if( cur == first ) {
2359             trie->minlen = minchars;
2360             trie->maxlen = maxchars;
2361         } else if (minchars < trie->minlen) {
2362             trie->minlen = minchars;
2363         } else if (maxchars > trie->maxlen) {
2364             trie->maxlen = maxchars;
2365         }
2366     } /* end first pass */
2367     DEBUG_TRIE_COMPILE_r(
2368         PerlIO_printf( Perl_debug_log,
2369                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2370                 (int)depth * 2 + 2,"",
2371                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2372                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2373                 (int)trie->minlen, (int)trie->maxlen )
2374     );
2375
2376     /*
2377         We now know what we are dealing with in terms of unique chars and
2378         string sizes so we can calculate how much memory a naive
2379         representation using a flat table  will take. If it's over a reasonable
2380         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2381         conservative but potentially much slower representation using an array
2382         of lists.
2383
2384         At the end we convert both representations into the same compressed
2385         form that will be used in regexec.c for matching with. The latter
2386         is a form that cannot be used to construct with but has memory
2387         properties similar to the list form and access properties similar
2388         to the table form making it both suitable for fast searches and
2389         small enough that its feasable to store for the duration of a program.
2390
2391         See the comment in the code where the compressed table is produced
2392         inplace from the flat tabe representation for an explanation of how
2393         the compression works.
2394
2395     */
2396
2397
2398     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2399     prev_states[1] = 0;
2400
2401     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2402                                                     > SvIV(re_trie_maxbuff) )
2403     {
2404         /*
2405             Second Pass -- Array Of Lists Representation
2406
2407             Each state will be represented by a list of charid:state records
2408             (reg_trie_trans_le) the first such element holds the CUR and LEN
2409             points of the allocated array. (See defines above).
2410
2411             We build the initial structure using the lists, and then convert
2412             it into the compressed table form which allows faster lookups
2413             (but cant be modified once converted).
2414         */
2415
2416         STRLEN transcount = 1;
2417
2418         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2419             "%*sCompiling trie using list compiler\n",
2420             (int)depth * 2 + 2, ""));
2421
2422         trie->states = (reg_trie_state *)
2423             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2424                                   sizeof(reg_trie_state) );
2425         TRIE_LIST_NEW(1);
2426         next_alloc = 2;
2427
2428         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2429
2430             regnode *noper   = NEXTOPER( cur );
2431             U8 *uc           = (U8*)STRING( noper );
2432             const U8 *e      = uc + STR_LEN( noper );
2433             U32 state        = 1;         /* required init */
2434             U16 charid       = 0;         /* sanity init */
2435             U32 wordlen      = 0;         /* required init */
2436
2437             if (OP(noper) == NOTHING) {
2438                 regnode *noper_next= regnext(noper);
2439                 if (noper_next != tail && OP(noper_next) == flags) {
2440                     noper = noper_next;
2441                     uc= (U8*)STRING(noper);
2442                     e= uc + STR_LEN(noper);
2443                 }
2444             }
2445
2446             if (OP(noper) != NOTHING) {
2447                 for ( ; uc < e ; uc += len ) {
2448
2449                     TRIE_READ_CHAR;
2450
2451                     if ( uvc < 256 ) {
2452                         charid = trie->charmap[ uvc ];
2453                     } else {
2454                         SV** const svpp = hv_fetch( widecharmap,
2455                                                     (char*)&uvc,
2456                                                     sizeof( UV ),
2457                                                     0);
2458                         if ( !svpp ) {
2459                             charid = 0;
2460                         } else {
2461                             charid=(U16)SvIV( *svpp );
2462                         }
2463                     }
2464                     /* charid is now 0 if we dont know the char read, or
2465                      * nonzero if we do */
2466                     if ( charid ) {
2467
2468                         U16 check;
2469                         U32 newstate = 0;
2470
2471                         charid--;
2472                         if ( !trie->states[ state ].trans.list ) {
2473                             TRIE_LIST_NEW( state );
2474                         }
2475                         for ( check = 1;
2476                               check <= TRIE_LIST_USED( state );
2477                               check++ )
2478                         {
2479                             if ( TRIE_LIST_ITEM( state, check ).forid
2480                                                                     == charid )
2481                             {
2482                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2483                                 break;
2484                             }
2485                         }
2486                         if ( ! newstate ) {
2487                             newstate = next_alloc++;
2488                             prev_states[newstate] = state;
2489                             TRIE_LIST_PUSH( state, charid, newstate );
2490                             transcount++;
2491                         }
2492                         state = newstate;
2493                     } else {
2494                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2495                     }
2496                 }
2497             }
2498             TRIE_HANDLE_WORD(state);
2499
2500         } /* end second pass */
2501
2502         /* next alloc is the NEXT state to be allocated */
2503         trie->statecount = next_alloc;
2504         trie->states = (reg_trie_state *)
2505             PerlMemShared_realloc( trie->states,
2506                                    next_alloc
2507                                    * sizeof(reg_trie_state) );
2508
2509         /* and now dump it out before we compress it */
2510         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2511                                                          revcharmap, next_alloc,
2512                                                          depth+1)
2513         );
2514
2515         trie->trans = (reg_trie_trans *)
2516             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2517         {
2518             U32 state;
2519             U32 tp = 0;
2520             U32 zp = 0;
2521
2522
2523             for( state=1 ; state < next_alloc ; state ++ ) {
2524                 U32 base=0;
2525
2526                 /*
2527                 DEBUG_TRIE_COMPILE_MORE_r(
2528                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2529                 );
2530                 */
2531
2532                 if (trie->states[state].trans.list) {
2533                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2534                     U16 maxid=minid;
2535                     U16 idx;
2536
2537                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2538                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2539                         if ( forid < minid ) {
2540                             minid=forid;
2541                         } else if ( forid > maxid ) {
2542                             maxid=forid;
2543                         }
2544                     }
2545                     if ( transcount < tp + maxid - minid + 1) {
2546                         transcount *= 2;
2547                         trie->trans = (reg_trie_trans *)
2548                             PerlMemShared_realloc( trie->trans,
2549                                                      transcount
2550                                                      * sizeof(reg_trie_trans) );
2551                         Zero( trie->trans + (transcount / 2),
2552                               transcount / 2,
2553                               reg_trie_trans );
2554                     }
2555                     base = trie->uniquecharcount + tp - minid;
2556                     if ( maxid == minid ) {
2557                         U32 set = 0;
2558                         for ( ; zp < tp ; zp++ ) {
2559                             if ( ! trie->trans[ zp ].next ) {
2560                                 base = trie->uniquecharcount + zp - minid;
2561                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2562                                                                    1).newstate;
2563                                 trie->trans[ zp ].check = state;
2564                                 set = 1;
2565                                 break;
2566                             }
2567                         }
2568                         if ( !set ) {
2569                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2570                                                                    1).newstate;
2571                             trie->trans[ tp ].check = state;
2572                             tp++;
2573                             zp = tp;
2574                         }
2575                     } else {
2576                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2577                             const U32 tid = base
2578                                            - trie->uniquecharcount
2579                                            + TRIE_LIST_ITEM( state, idx ).forid;
2580                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2581                                                                 idx ).newstate;
2582                             trie->trans[ tid ].check = state;
2583                         }
2584                         tp += ( maxid - minid + 1 );
2585                     }
2586                     Safefree(trie->states[ state ].trans.list);
2587                 }
2588                 /*
2589                 DEBUG_TRIE_COMPILE_MORE_r(
2590                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2591                 );
2592                 */
2593                 trie->states[ state ].trans.base=base;
2594             }
2595             trie->lasttrans = tp + 1;
2596         }
2597     } else {
2598         /*
2599            Second Pass -- Flat Table Representation.
2600
2601            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2602            each.  We know that we will need Charcount+1 trans at most to store
2603            the data (one row per char at worst case) So we preallocate both
2604            structures assuming worst case.
2605
2606            We then construct the trie using only the .next slots of the entry
2607            structs.
2608
2609            We use the .check field of the first entry of the node temporarily
2610            to make compression both faster and easier by keeping track of how
2611            many non zero fields are in the node.
2612
2613            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2614            transition.
2615
2616            There are two terms at use here: state as a TRIE_NODEIDX() which is
2617            a number representing the first entry of the node, and state as a
2618            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2619            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2620            if there are 2 entrys per node. eg:
2621
2622              A B       A B
2623           1. 2 4    1. 3 7
2624           2. 0 3    3. 0 5
2625           3. 0 0    5. 0 0
2626           4. 0 0    7. 0 0
2627
2628            The table is internally in the right hand, idx form. However as we
2629            also have to deal with the states array which is indexed by nodenum
2630            we have to use TRIE_NODENUM() to convert.
2631
2632         */
2633         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2634             "%*sCompiling trie using table compiler\n",
2635             (int)depth * 2 + 2, ""));
2636
2637         trie->trans = (reg_trie_trans *)
2638             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2639                                   * trie->uniquecharcount + 1,
2640                                   sizeof(reg_trie_trans) );
2641         trie->states = (reg_trie_state *)
2642             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2643                                   sizeof(reg_trie_state) );
2644         next_alloc = trie->uniquecharcount + 1;
2645
2646
2647         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2648
2649             regnode *noper   = NEXTOPER( cur );
2650             const U8 *uc     = (U8*)STRING( noper );
2651             const U8 *e      = uc + STR_LEN( noper );
2652
2653             U32 state        = 1;         /* required init */
2654
2655             U16 charid       = 0;         /* sanity init */
2656             U32 accept_state = 0;         /* sanity init */
2657
2658             U32 wordlen      = 0;         /* required init */
2659
2660             if (OP(noper) == NOTHING) {
2661                 regnode *noper_next= regnext(noper);
2662                 if (noper_next != tail && OP(noper_next) == flags) {
2663                     noper = noper_next;
2664                     uc= (U8*)STRING(noper);
2665                     e= uc + STR_LEN(noper);
2666                 }
2667             }
2668
2669             if ( OP(noper) != NOTHING ) {
2670                 for ( ; uc < e ; uc += len ) {
2671
2672                     TRIE_READ_CHAR;
2673
2674                     if ( uvc < 256 ) {
2675                         charid = trie->charmap[ uvc ];
2676                     } else {
2677                         SV* const * const svpp = hv_fetch( widecharmap,
2678                                                            (char*)&uvc,
2679                                                            sizeof( UV ),
2680                                                            0);
2681                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2682                     }
2683                     if ( charid ) {
2684                         charid--;
2685                         if ( !trie->trans[ state + charid ].next ) {
2686                             trie->trans[ state + charid ].next = next_alloc;
2687                             trie->trans[ state ].check++;
2688                             prev_states[TRIE_NODENUM(next_alloc)]
2689                                     = TRIE_NODENUM(state);
2690                             next_alloc += trie->uniquecharcount;
2691                         }
2692                         state = trie->trans[ state + charid ].next;
2693                     } else {
2694                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2695                     }
2696                     /* charid is now 0 if we dont know the char read, or
2697                      * nonzero if we do */
2698                 }
2699             }
2700             accept_state = TRIE_NODENUM( state );
2701             TRIE_HANDLE_WORD(accept_state);
2702
2703         } /* end second pass */
2704
2705         /* and now dump it out before we compress it */
2706         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2707                                                           revcharmap,
2708                                                           next_alloc, depth+1));
2709
2710         {
2711         /*
2712            * Inplace compress the table.*
2713
2714            For sparse data sets the table constructed by the trie algorithm will
2715            be mostly 0/FAIL transitions or to put it another way mostly empty.
2716            (Note that leaf nodes will not contain any transitions.)
2717
2718            This algorithm compresses the tables by eliminating most such
2719            transitions, at the cost of a modest bit of extra work during lookup:
2720
2721            - Each states[] entry contains a .base field which indicates the
2722            index in the state[] array wheres its transition data is stored.
2723
2724            - If .base is 0 there are no valid transitions from that node.
2725
2726            - If .base is nonzero then charid is added to it to find an entry in
2727            the trans array.
2728
2729            -If trans[states[state].base+charid].check!=state then the
2730            transition is taken to be a 0/Fail transition. Thus if there are fail
2731            transitions at the front of the node then the .base offset will point
2732            somewhere inside the previous nodes data (or maybe even into a node
2733            even earlier), but the .check field determines if the transition is
2734            valid.
2735
2736            XXX - wrong maybe?
2737            The following process inplace converts the table to the compressed
2738            table: We first do not compress the root node 1,and mark all its
2739            .check pointers as 1 and set its .base pointer as 1 as well. This
2740            allows us to do a DFA construction from the compressed table later,
2741            and ensures that any .base pointers we calculate later are greater
2742            than 0.
2743
2744            - We set 'pos' to indicate the first entry of the second node.
2745
2746            - We then iterate over the columns of the node, finding the first and
2747            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2748            and set the .check pointers accordingly, and advance pos
2749            appropriately and repreat for the next node. Note that when we copy
2750            the next pointers we have to convert them from the original
2751            NODEIDX form to NODENUM form as the former is not valid post
2752            compression.
2753
2754            - If a node has no transitions used we mark its base as 0 and do not
2755            advance the pos pointer.
2756
2757            - If a node only has one transition we use a second pointer into the
2758            structure to fill in allocated fail transitions from other states.
2759            This pointer is independent of the main pointer and scans forward
2760            looking for null transitions that are allocated to a state. When it
2761            finds one it writes the single transition into the "hole".  If the
2762            pointer doesnt find one the single transition is appended as normal.
2763
2764            - Once compressed we can Renew/realloc the structures to release the
2765            excess space.
2766
2767            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2768            specifically Fig 3.47 and the associated pseudocode.
2769
2770            demq
2771         */
2772         const U32 laststate = TRIE_NODENUM( next_alloc );
2773         U32 state, charid;
2774         U32 pos = 0, zp=0;
2775         trie->statecount = laststate;
2776
2777         for ( state = 1 ; state < laststate ; state++ ) {
2778             U8 flag = 0;
2779             const U32 stateidx = TRIE_NODEIDX( state );
2780             const U32 o_used = trie->trans[ stateidx ].check;
2781             U32 used = trie->trans[ stateidx ].check;
2782             trie->trans[ stateidx ].check = 0;
2783
2784             for ( charid = 0;
2785                   used && charid < trie->uniquecharcount;
2786                   charid++ )
2787             {
2788                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2789                     if ( trie->trans[ stateidx + charid ].next ) {
2790                         if (o_used == 1) {
2791                             for ( ; zp < pos ; zp++ ) {
2792                                 if ( ! trie->trans[ zp ].next ) {
2793                                     break;
2794                                 }
2795                             }
2796                             trie->states[ state ].trans.base
2797                                                     = zp
2798                                                       + trie->uniquecharcount
2799                                                       - charid ;
2800                             trie->trans[ zp ].next
2801                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2802                                                              + charid ].next );
2803                             trie->trans[ zp ].check = state;
2804                             if ( ++zp > pos ) pos = zp;
2805                             break;
2806                         }
2807                         used--;
2808                     }
2809                     if ( !flag ) {
2810                         flag = 1;
2811                         trie->states[ state ].trans.base
2812                                        = pos + trie->uniquecharcount - charid ;
2813                     }
2814                     trie->trans[ pos ].next
2815                         = SAFE_TRIE_NODENUM(
2816                                        trie->trans[ stateidx + charid ].next );
2817                     trie->trans[ pos ].check = state;
2818                     pos++;
2819                 }
2820             }
2821         }
2822         trie->lasttrans = pos + 1;
2823         trie->states = (reg_trie_state *)
2824             PerlMemShared_realloc( trie->states, laststate
2825                                    * sizeof(reg_trie_state) );
2826         DEBUG_TRIE_COMPILE_MORE_r(
2827             PerlIO_printf( Perl_debug_log,
2828                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2829                 (int)depth * 2 + 2,"",
2830                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2831                        + 1 ),
2832                 (IV)next_alloc,
2833                 (IV)pos,
2834                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2835             );
2836
2837         } /* end table compress */
2838     }
2839     DEBUG_TRIE_COMPILE_MORE_r(
2840             PerlIO_printf(Perl_debug_log,
2841                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2842                 (int)depth * 2 + 2, "",
2843                 (UV)trie->statecount,
2844                 (UV)trie->lasttrans)
2845     );
2846     /* resize the trans array to remove unused space */
2847     trie->trans = (reg_trie_trans *)
2848         PerlMemShared_realloc( trie->trans, trie->lasttrans
2849                                * sizeof(reg_trie_trans) );
2850
2851     {   /* Modify the program and insert the new TRIE node */
2852         U8 nodetype =(U8)(flags & 0xFF);
2853         char *str=NULL;
2854
2855 #ifdef DEBUGGING
2856         regnode *optimize = NULL;
2857 #ifdef RE_TRACK_PATTERN_OFFSETS
2858
2859         U32 mjd_offset = 0;
2860         U32 mjd_nodelen = 0;
2861 #endif /* RE_TRACK_PATTERN_OFFSETS */
2862 #endif /* DEBUGGING */
2863         /*
2864            This means we convert either the first branch or the first Exact,
2865            depending on whether the thing following (in 'last') is a branch
2866            or not and whther first is the startbranch (ie is it a sub part of
2867            the alternation or is it the whole thing.)
2868            Assuming its a sub part we convert the EXACT otherwise we convert
2869            the whole branch sequence, including the first.
2870          */
2871         /* Find the node we are going to overwrite */
2872         if ( first != startbranch || OP( last ) == BRANCH ) {
2873             /* branch sub-chain */
2874             NEXT_OFF( first ) = (U16)(last - first);
2875 #ifdef RE_TRACK_PATTERN_OFFSETS
2876             DEBUG_r({
2877                 mjd_offset= Node_Offset((convert));
2878                 mjd_nodelen= Node_Length((convert));
2879             });
2880 #endif
2881             /* whole branch chain */
2882         }
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2884         else {
2885             DEBUG_r({
2886                 const  regnode *nop = NEXTOPER( convert );
2887                 mjd_offset= Node_Offset((nop));
2888                 mjd_nodelen= Node_Length((nop));
2889             });
2890         }
2891         DEBUG_OPTIMISE_r(
2892             PerlIO_printf(Perl_debug_log,
2893                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2894                 (int)depth * 2 + 2, "",
2895                 (UV)mjd_offset, (UV)mjd_nodelen)
2896         );
2897 #endif
2898         /* But first we check to see if there is a common prefix we can
2899            split out as an EXACT and put in front of the TRIE node.  */
2900         trie->startstate= 1;
2901         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2902             U32 state;
2903             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2904                 U32 ofs = 0;
2905                 I32 idx = -1;
2906                 U32 count = 0;
2907                 const U32 base = trie->states[ state ].trans.base;
2908
2909                 if ( trie->states[state].wordnum )
2910                         count = 1;
2911
2912                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2913                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2914                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2915                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2916                     {
2917                         if ( ++count > 1 ) {
2918                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2919                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2920                             if ( state == 1 ) break;
2921                             if ( count == 2 ) {
2922                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2923                                 DEBUG_OPTIMISE_r(
2924                                     PerlIO_printf(Perl_debug_log,
2925                                         "%*sNew Start State=%"UVuf" Class: [",
2926                                         (int)depth * 2 + 2, "",
2927                                         (UV)state));
2928                                 if (idx >= 0) {
2929                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2930                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2931
2932                                     TRIE_BITMAP_SET(trie,*ch);
2933                                     if ( folder )
2934                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2935                                     DEBUG_OPTIMISE_r(
2936                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2937                                     );
2938                                 }
2939                             }
2940                             TRIE_BITMAP_SET(trie,*ch);
2941                             if ( folder )
2942                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2943                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2944                         }
2945                         idx = ofs;
2946                     }
2947                 }
2948                 if ( count == 1 ) {
2949                     SV **tmp = av_fetch( revcharmap, idx, 0);
2950                     STRLEN len;
2951                     char *ch = SvPV( *tmp, len );
2952                     DEBUG_OPTIMISE_r({
2953                         SV *sv=sv_newmortal();
2954                         PerlIO_printf( Perl_debug_log,
2955                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2956                             (int)depth * 2 + 2, "",
2957                             (UV)state, (UV)idx,
2958                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2959                                 PL_colors[0], PL_colors[1],
2960                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2961                                 PERL_PV_ESCAPE_FIRSTCHAR
2962                             )
2963                         );
2964                     });
2965                     if ( state==1 ) {
2966                         OP( convert ) = nodetype;
2967                         str=STRING(convert);
2968                         STR_LEN(convert)=0;
2969                     }
2970                     STR_LEN(convert) += len;
2971                     while (len--)
2972                         *str++ = *ch++;
2973                 } else {
2974 #ifdef DEBUGGING
2975                     if (state>1)
2976                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2977 #endif
2978                     break;
2979                 }
2980             }
2981             trie->prefixlen = (state-1);
2982             if (str) {
2983                 regnode *n = convert+NODE_SZ_STR(convert);
2984                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2985                 trie->startstate = state;
2986                 trie->minlen -= (state - 1);
2987                 trie->maxlen -= (state - 1);
2988 #ifdef DEBUGGING
2989                /* At least the UNICOS C compiler choked on this
2990                 * being argument to DEBUG_r(), so let's just have
2991                 * it right here. */
2992                if (
2993 #ifdef PERL_EXT_RE_BUILD
2994                    1
2995 #else
2996                    DEBUG_r_TEST
2997 #endif
2998                    ) {
2999                    regnode *fix = convert;
3000                    U32 word = trie->wordcount;
3001                    mjd_nodelen++;
3002                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3003                    while( ++fix < n ) {
3004                        Set_Node_Offset_Length(fix, 0, 0);
3005                    }
3006                    while (word--) {
3007                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3008                        if (tmp) {
3009                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3010                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3011                            else
3012                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3013                        }
3014                    }
3015                }
3016 #endif
3017                 if (trie->maxlen) {
3018                     convert = n;
3019                 } else {
3020                     NEXT_OFF(convert) = (U16)(tail - convert);
3021                     DEBUG_r(optimize= n);
3022                 }
3023             }
3024         }
3025         if (!jumper)
3026             jumper = last;
3027         if ( trie->maxlen ) {
3028             NEXT_OFF( convert ) = (U16)(tail - convert);
3029             ARG_SET( convert, data_slot );
3030             /* Store the offset to the first unabsorbed branch in
3031                jump[0], which is otherwise unused by the jump logic.
3032                We use this when dumping a trie and during optimisation. */
3033             if (trie->jump)
3034                 trie->jump[0] = (U16)(nextbranch - convert);
3035
3036             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3037              *   and there is a bitmap
3038              *   and the first "jump target" node we found leaves enough room
3039              * then convert the TRIE node into a TRIEC node, with the bitmap
3040              * embedded inline in the opcode - this is hypothetically faster.
3041              */
3042             if ( !trie->states[trie->startstate].wordnum
3043                  && trie->bitmap
3044                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3045             {
3046                 OP( convert ) = TRIEC;
3047                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3048                 PerlMemShared_free(trie->bitmap);
3049                 trie->bitmap= NULL;
3050             } else
3051                 OP( convert ) = TRIE;
3052
3053             /* store the type in the flags */
3054             convert->flags = nodetype;
3055             DEBUG_r({
3056             optimize = convert
3057                       + NODE_STEP_REGNODE
3058                       + regarglen[ OP( convert ) ];
3059             });
3060             /* XXX We really should free up the resource in trie now,
3061                    as we won't use them - (which resources?) dmq */
3062         }
3063         /* needed for dumping*/
3064         DEBUG_r(if (optimize) {
3065             regnode *opt = convert;
3066
3067             while ( ++opt < optimize) {
3068                 Set_Node_Offset_Length(opt,0,0);
3069             }
3070             /*
3071                 Try to clean up some of the debris left after the
3072                 optimisation.
3073              */
3074             while( optimize < jumper ) {
3075                 mjd_nodelen += Node_Length((optimize));
3076                 OP( optimize ) = OPTIMIZED;
3077                 Set_Node_Offset_Length(optimize,0,0);
3078                 optimize++;
3079             }
3080             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3081         });
3082     } /* end node insert */
3083
3084     /*  Finish populating the prev field of the wordinfo array.  Walk back
3085      *  from each accept state until we find another accept state, and if
3086      *  so, point the first word's .prev field at the second word. If the
3087      *  second already has a .prev field set, stop now. This will be the
3088      *  case either if we've already processed that word's accept state,
3089      *  or that state had multiple words, and the overspill words were
3090      *  already linked up earlier.
3091      */
3092     {
3093         U16 word;
3094         U32 state;
3095         U16 prev;
3096
3097         for (word=1; word <= trie->wordcount; word++) {
3098             prev = 0;
3099             if (trie->wordinfo[word].prev)
3100                 continue;
3101             state = trie->wordinfo[word].accept;
3102             while (state) {
3103                 state = prev_states[state];
3104                 if (!state)
3105                     break;
3106                 prev = trie->states[state].wordnum;
3107                 if (prev)
3108                     break;
3109             }
3110             trie->wordinfo[word].prev = prev;
3111         }
3112         Safefree(prev_states);
3113     }
3114
3115
3116     /* and now dump out the compressed format */
3117     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3118
3119     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3120 #ifdef DEBUGGING
3121     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3122     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3123 #else
3124     SvREFCNT_dec_NN(revcharmap);
3125 #endif
3126     return trie->jump
3127            ? MADE_JUMP_TRIE
3128            : trie->startstate>1
3129              ? MADE_EXACT_TRIE
3130              : MADE_TRIE;
3131 }
3132
3133 STATIC regnode *
3134 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3135 {
3136 /* The Trie is constructed and compressed now so we can build a fail array if
3137  * it's needed
3138
3139    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3140    3.32 in the
3141    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3142    Ullman 1985/88
3143    ISBN 0-201-10088-6
3144
3145    We find the fail state for each state in the trie, this state is the longest
3146    proper suffix of the current state's 'word' that is also a proper prefix of
3147    another word in our trie. State 1 represents the word '' and is thus the
3148    default fail state. This allows the DFA not to have to restart after its
3149    tried and failed a word at a given point, it simply continues as though it
3150    had been matching the other word in the first place.
3151    Consider
3152       'abcdgu'=~/abcdefg|cdgu/
3153    When we get to 'd' we are still matching the first word, we would encounter
3154    'g' which would fail, which would bring us to the state representing 'd' in
3155    the second word where we would try 'g' and succeed, proceeding to match
3156    'cdgu'.
3157  */
3158  /* add a fail transition */
3159     const U32 trie_offset = ARG(source);
3160     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3161     U32 *q;
3162     const U32 ucharcount = trie->uniquecharcount;
3163     const U32 numstates = trie->statecount;
3164     const U32 ubound = trie->lasttrans + ucharcount;
3165     U32 q_read = 0;
3166     U32 q_write = 0;
3167     U32 charid;
3168     U32 base = trie->states[ 1 ].trans.base;
3169     U32 *fail;
3170     reg_ac_data *aho;
3171     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3172     regnode *stclass;
3173     GET_RE_DEBUG_FLAGS_DECL;
3174
3175     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3176     PERL_UNUSED_CONTEXT;
3177 #ifndef DEBUGGING
3178     PERL_UNUSED_ARG(depth);
3179 #endif
3180
3181     if ( OP(source) == TRIE ) {
3182         struct regnode_1 *op = (struct regnode_1 *)
3183             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3184         StructCopy(source,op,struct regnode_1);
3185         stclass = (regnode *)op;
3186     } else {
3187         struct regnode_charclass *op = (struct regnode_charclass *)
3188             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3189         StructCopy(source,op,struct regnode_charclass);
3190         stclass = (regnode *)op;
3191     }
3192     OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3193
3194     ARG_SET( stclass, data_slot );
3195     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3196     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3197     aho->trie=trie_offset;
3198     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3199     Copy( trie->states, aho->states, numstates, reg_trie_state );
3200     Newxz( q, numstates, U32);
3201     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3202     aho->refcount = 1;
3203     fail = aho->fail;
3204     /* initialize fail[0..1] to be 1 so that we always have
3205        a valid final fail state */
3206     fail[ 0 ] = fail[ 1 ] = 1;
3207
3208     for ( charid = 0; charid < ucharcount ; charid++ ) {
3209         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3210         if ( newstate ) {
3211             q[ q_write ] = newstate;
3212             /* set to point at the root */
3213             fail[ q[ q_write++ ] ]=1;
3214         }
3215     }
3216     while ( q_read < q_write) {
3217         const U32 cur = q[ q_read++ % numstates ];
3218         base = trie->states[ cur ].trans.base;
3219
3220         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3221             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3222             if (ch_state) {
3223                 U32 fail_state = cur;
3224                 U32 fail_base;
3225                 do {
3226                     fail_state = fail[ fail_state ];
3227                     fail_base = aho->states[ fail_state ].trans.base;
3228                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3229
3230                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3231                 fail[ ch_state ] = fail_state;
3232                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3233                 {
3234                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3235                 }
3236                 q[ q_write++ % numstates] = ch_state;
3237             }
3238         }
3239     }
3240     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3241        when we fail in state 1, this allows us to use the
3242        charclass scan to find a valid start char. This is based on the principle
3243        that theres a good chance the string being searched contains lots of stuff
3244        that cant be a start char.
3245      */
3246     fail[ 0 ] = fail[ 1 ] = 0;
3247     DEBUG_TRIE_COMPILE_r({
3248         PerlIO_printf(Perl_debug_log,
3249                       "%*sStclass Failtable (%"UVuf" states): 0",
3250                       (int)(depth * 2), "", (UV)numstates
3251         );
3252         for( q_read=1; q_read<numstates; q_read++ ) {
3253             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3254         }
3255         PerlIO_printf(Perl_debug_log, "\n");
3256     });
3257     Safefree(q);
3258     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3259     return stclass;
3260 }
3261
3262
3263 #define DEBUG_PEEP(str,scan,depth) \
3264     DEBUG_OPTIMISE_r({if (scan){ \
3265        regnode *Next = regnext(scan); \
3266        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3267        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3268            (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3269            Next ? (REG_NODE_NUM(Next)) : 0 ); \
3270        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3271        PerlIO_printf(Perl_debug_log, "\n"); \
3272    }});
3273
3274 /* The below joins as many adjacent EXACTish nodes as possible into a single
3275  * one.  The regop may be changed if the node(s) contain certain sequences that
3276  * require special handling.  The joining is only done if:
3277  * 1) there is room in the current conglomerated node to entirely contain the
3278  *    next one.
3279  * 2) they are the exact same node type
3280  *
3281  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3282  * these get optimized out
3283  *
3284  * If a node is to match under /i (folded), the number of characters it matches
3285  * can be different than its character length if it contains a multi-character
3286  * fold.  *min_subtract is set to the total delta number of characters of the
3287  * input nodes.
3288  *
3289  * And *unfolded_multi_char is set to indicate whether or not the node contains
3290  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3291  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3292  * SMALL LETTER SHARP S, as only if the target string being matched against
3293  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3294  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3295  * whose components are all above the Latin1 range are not run-time locale
3296  * dependent, and have already been folded by the time this function is
3297  * called.)
3298  *
3299  * This is as good a place as any to discuss the design of handling these
3300  * multi-character fold sequences.  It's been wrong in Perl for a very long
3301  * time.  There are three code points in Unicode whose multi-character folds
3302  * were long ago discovered to mess things up.  The previous designs for
3303  * dealing with these involved assigning a special node for them.  This
3304  * approach doesn't always work, as evidenced by this example:
3305  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3306  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3307  * would match just the \xDF, it won't be able to handle the case where a
3308  * successful match would have to cross the node's boundary.  The new approach
3309  * that hopefully generally solves the problem generates an EXACTFU_SS node
3310  * that is "sss" in this case.
3311  *
3312  * It turns out that there are problems with all multi-character folds, and not
3313  * just these three.  Now the code is general, for all such cases.  The
3314  * approach taken is:
3315  * 1)   This routine examines each EXACTFish node that could contain multi-
3316  *      character folded sequences.  Since a single character can fold into
3317  *      such a sequence, the minimum match length for this node is less than
3318  *      the number of characters in the node.  This routine returns in
3319  *      *min_subtract how many characters to subtract from the the actual
3320  *      length of the string to get a real minimum match length; it is 0 if
3321  *      there are no multi-char foldeds.  This delta is used by the caller to
3322  *      adjust the min length of the match, and the delta between min and max,
3323  *      so that the optimizer doesn't reject these possibilities based on size
3324  *      constraints.
3325  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3326  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3327  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3328  *      there is a possible fold length change.  That means that a regular
3329  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3330  *      with length changes, and so can be processed faster.  regexec.c takes
3331  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3332  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3333  *      known until runtime).  This saves effort in regex matching.  However,
3334  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3335  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3336  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3337  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3338  *      possibilities for the non-UTF8 patterns are quite simple, except for
3339  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3340  *      members of a fold-pair, and arrays are set up for all of them so that
3341  *      the other member of the pair can be found quickly.  Code elsewhere in
3342  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3343  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3344  *      described in the next item.
3345  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3346  *      validity of the fold won't be known until runtime, and so must remain
3347  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3348  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3349  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3350  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3351  *      The reason this is a problem is that the optimizer part of regexec.c
3352  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3353  *      that a character in the pattern corresponds to at most a single
3354  *      character in the target string.  (And I do mean character, and not byte
3355  *      here, unlike other parts of the documentation that have never been
3356  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3357  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3358  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3359  *      nodes, violate the assumption, and they are the only instances where it
3360  *      is violated.  I'm reluctant to try to change the assumption, as the
3361  *      code involved is impenetrable to me (khw), so instead the code here
3362  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3363  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3364  *      boolean indicating whether or not the node contains such a fold.  When
3365  *      it is true, the caller sets a flag that later causes the optimizer in
3366  *      this file to not set values for the floating and fixed string lengths,
3367  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3368  *      assumption.  Thus, there is no optimization based on string lengths for
3369  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3370  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3371  *      assumption is wrong only in these cases is that all other non-UTF-8
3372  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3373  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3374  *      EXACTF nodes because we don't know at compile time if it actually
3375  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3376  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3377  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3378  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3379  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3380  *      string would require the pattern to be forced into UTF-8, the overhead
3381  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3382  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3383  *      locale.)
3384  *
3385  *      Similarly, the code that generates tries doesn't currently handle
3386  *      not-already-folded multi-char folds, and it looks like a pain to change
3387  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3388  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3389  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3390  *      using /iaa matching will be doing so almost entirely with ASCII
3391  *      strings, so this should rarely be encountered in practice */
3392
3393 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3394     if (PL_regkind[OP(scan)] == EXACT) \
3395         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3396
3397 STATIC U32
3398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3399                    UV *min_subtract, bool *unfolded_multi_char,
3400                    U32 flags,regnode *val, U32 depth)
3401 {
3402     /* Merge several consecutive EXACTish nodes into one. */
3403     regnode *n = regnext(scan);
3404     U32 stringok = 1;
3405     regnode *next = scan + NODE_SZ_STR(scan);
3406     U32 merged = 0;
3407     U32 stopnow = 0;
3408 #ifdef DEBUGGING
3409     regnode *stop = scan;
3410     GET_RE_DEBUG_FLAGS_DECL;
3411 #else
3412     PERL_UNUSED_ARG(depth);
3413 #endif
3414
3415     PERL_ARGS_ASSERT_JOIN_EXACT;
3416 #ifndef EXPERIMENTAL_INPLACESCAN
3417     PERL_UNUSED_ARG(flags);
3418     PERL_UNUSED_ARG(val);
3419 #endif
3420     DEBUG_PEEP("join",scan,depth);
3421
3422     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3423      * EXACT ones that are mergeable to the current one. */
3424     while (n
3425            && (PL_regkind[OP(n)] == NOTHING
3426                || (stringok && OP(n) == OP(scan)))
3427            && NEXT_OFF(n)
3428            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3429     {
3430
3431         if (OP(n) == TAIL || n > next)
3432             stringok = 0;
3433         if (PL_regkind[OP(n)] == NOTHING) {
3434             DEBUG_PEEP("skip:",n,depth);
3435             NEXT_OFF(scan) += NEXT_OFF(n);
3436             next = n + NODE_STEP_REGNODE;
3437 #ifdef DEBUGGING
3438             if (stringok)
3439                 stop = n;
3440 #endif
3441             n = regnext(n);
3442         }
3443         else if (stringok) {
3444             const unsigned int oldl = STR_LEN(scan);
3445             regnode * const nnext = regnext(n);
3446
3447             /* XXX I (khw) kind of doubt that this works on platforms (should
3448              * Perl ever run on one) where U8_MAX is above 255 because of lots
3449              * of other assumptions */
3450             /* Don't join if the sum can't fit into a single node */
3451             if (oldl + STR_LEN(n) > U8_MAX)
3452                 break;
3453
3454             DEBUG_PEEP("merg",n,depth);
3455             merged++;
3456
3457             NEXT_OFF(scan) += NEXT_OFF(n);
3458             STR_LEN(scan) += STR_LEN(n);
3459             next = n + NODE_SZ_STR(n);
3460             /* Now we can overwrite *n : */
3461             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3462 #ifdef DEBUGGING
3463             stop = next - 1;
3464 #endif
3465             n = nnext;
3466             if (stopnow) break;
3467         }
3468
3469 #ifdef EXPERIMENTAL_INPLACESCAN
3470         if (flags && !NEXT_OFF(n)) {
3471             DEBUG_PEEP("atch", val, depth);
3472             if (reg_off_by_arg[OP(n)]) {
3473                 ARG_SET(n, val - n);
3474             }
3475             else {
3476                 NEXT_OFF(n) = val - n;
3477             }
3478             stopnow = 1;
3479         }
3480 #endif
3481     }
3482
3483     *min_subtract = 0;
3484     *unfolded_multi_char = FALSE;
3485
3486     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3487      * can now analyze for sequences of problematic code points.  (Prior to
3488      * this final joining, sequences could have been split over boundaries, and
3489      * hence missed).  The sequences only happen in folding, hence for any
3490      * non-EXACT EXACTish node */
3491     if (OP(scan) != EXACT) {
3492         U8* s0 = (U8*) STRING(scan);
3493         U8* s = s0;
3494         U8* s_end = s0 + STR_LEN(scan);
3495
3496         int total_count_delta = 0;  /* Total delta number of characters that
3497                                        multi-char folds expand to */
3498
3499         /* One pass is made over the node's string looking for all the
3500          * possibilities.  To avoid some tests in the loop, there are two main
3501          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3502          * non-UTF-8 */
3503         if (UTF) {
3504             U8* folded = NULL;
3505
3506             if (OP(scan) == EXACTFL) {
3507                 U8 *d;
3508
3509                 /* An EXACTFL node would already have been changed to another
3510                  * node type unless there is at least one character in it that
3511                  * is problematic; likely a character whose fold definition
3512                  * won't be known until runtime, and so has yet to be folded.
3513                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3514                  * to handle the UTF-8 case, we need to create a temporary
3515                  * folded copy using UTF-8 locale rules in order to analyze it.
3516                  * This is because our macros that look to see if a sequence is
3517                  * a multi-char fold assume everything is folded (otherwise the
3518                  * tests in those macros would be too complicated and slow).
3519                  * Note that here, the non-problematic folds will have already
3520                  * been done, so we can just copy such characters.  We actually
3521                  * don't completely fold the EXACTFL string.  We skip the
3522                  * unfolded multi-char folds, as that would just create work
3523                  * below to figure out the size they already are */
3524
3525                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3526                 d = folded;
3527                 while (s < s_end) {
3528                     STRLEN s_len = UTF8SKIP(s);
3529                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3530                         Copy(s, d, s_len, U8);
3531                         d += s_len;
3532                     }
3533                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3534                         *unfolded_multi_char = TRUE;
3535                         Copy(s, d, s_len, U8);
3536                         d += s_len;
3537                     }
3538                     else if (isASCII(*s)) {
3539                         *(d++) = toFOLD(*s);
3540                     }
3541                     else {
3542                         STRLEN len;
3543                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3544                         d += len;
3545                     }
3546                     s += s_len;
3547                 }
3548
3549                 /* Point the remainder of the routine to look at our temporary
3550                  * folded copy */
3551                 s = folded;
3552                 s_end = d;
3553             } /* End of creating folded copy of EXACTFL string */
3554
3555             /* Examine the string for a multi-character fold sequence.  UTF-8
3556              * patterns have all characters pre-folded by the time this code is
3557              * executed */
3558             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3559                                      length sequence we are looking for is 2 */
3560             {
3561                 int count = 0;  /* How many characters in a multi-char fold */
3562                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3563                 if (! len) {    /* Not a multi-char fold: get next char */
3564                     s += UTF8SKIP(s);
3565                     continue;
3566                 }
3567
3568                 /* Nodes with 'ss' require special handling, except for
3569                  * EXACTFA-ish for which there is no multi-char fold to this */
3570                 if (len == 2 && *s == 's' && *(s+1) == 's'
3571                     && OP(scan) != EXACTFA
3572                     && OP(scan) != EXACTFA_NO_TRIE)
3573                 {
3574                     count = 2;
3575                     if (OP(scan) != EXACTFL) {
3576                         OP(scan) = EXACTFU_SS;
3577                     }
3578                     s += 2;
3579                 }
3580                 else { /* Here is a generic multi-char fold. */
3581                     U8* multi_end  = s + len;
3582
3583                     /* Count how many characters are in it.  In the case of
3584                      * /aa, no folds which contain ASCII code points are
3585                      * allowed, so check for those, and skip if found. */
3586                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3587                         count = utf8_length(s, multi_end);
3588                         s = multi_end;
3589                     }
3590                     else {
3591                         while (s < multi_end) {
3592                             if (isASCII(*s)) {
3593                                 s++;
3594                                 goto next_iteration;
3595                             }
3596                             else {
3597                                 s += UTF8SKIP(s);
3598                             }
3599                             count++;
3600                         }
3601                     }
3602                 }
3603
3604                 /* The delta is how long the sequence is minus 1 (1 is how long
3605                  * the character that folds to the sequence is) */
3606                 total_count_delta += count - 1;
3607               next_iteration: ;
3608             }
3609
3610             /* We created a temporary folded copy of the string in EXACTFL
3611              * nodes.  Therefore we need to be sure it doesn't go below zero,
3612              * as the real string could be shorter */
3613             if (OP(scan) == EXACTFL) {
3614                 int total_chars = utf8_length((U8*) STRING(scan),
3615                                            (U8*) STRING(scan) + STR_LEN(scan));
3616                 if (total_count_delta > total_chars) {
3617                     total_count_delta = total_chars;
3618                 }
3619             }
3620
3621             *min_subtract += total_count_delta;
3622             Safefree(folded);
3623         }
3624         else if (OP(scan) == EXACTFA) {
3625
3626             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3627              * fold to the ASCII range (and there are no existing ones in the
3628              * upper latin1 range).  But, as outlined in the comments preceding
3629              * this function, we need to flag any occurrences of the sharp s.
3630              * This character forbids trie formation (because of added
3631              * complexity) */
3632             while (s < s_end) {
3633                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3634                     OP(scan) = EXACTFA_NO_TRIE;
3635                     *unfolded_multi_char = TRUE;
3636                     break;
3637                 }
3638                 s++;
3639                 continue;
3640             }
3641         }
3642         else {
3643
3644             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3645              * folds that are all Latin1.  As explained in the comments
3646              * preceding this function, we look also for the sharp s in EXACTF
3647              * and EXACTFL nodes; it can be in the final position.  Otherwise
3648              * we can stop looking 1 byte earlier because have to find at least
3649              * two characters for a multi-fold */
3650             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3651                               ? s_end
3652                               : s_end -1;
3653
3654             while (s < upper) {
3655                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3656                 if (! len) {    /* Not a multi-char fold. */
3657                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3658                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3659                     {
3660                         *unfolded_multi_char = TRUE;
3661                     }
3662                     s++;
3663                     continue;
3664                 }
3665
3666                 if (len == 2
3667                     && isALPHA_FOLD_EQ(*s, 's')
3668                     && isALPHA_FOLD_EQ(*(s+1), 's'))
3669                 {
3670
3671                     /* EXACTF nodes need to know that the minimum length
3672                      * changed so that a sharp s in the string can match this
3673                      * ss in the pattern, but they remain EXACTF nodes, as they
3674                      * won't match this unless the target string is is UTF-8,
3675                      * which we don't know until runtime.  EXACTFL nodes can't
3676                      * transform into EXACTFU nodes */
3677                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3678                         OP(scan) = EXACTFU_SS;
3679                     }
3680                 }
3681
3682                 *min_subtract += len - 1;
3683                 s += len;
3684             }
3685         }
3686     }
3687
3688 #ifdef DEBUGGING
3689     /* Allow dumping but overwriting the collection of skipped
3690      * ops and/or strings with fake optimized ops */
3691     n = scan + NODE_SZ_STR(scan);
3692     while (n <= stop) {
3693         OP(n) = OPTIMIZED;
3694         FLAGS(n) = 0;
3695         NEXT_OFF(n) = 0;
3696         n++;
3697     }
3698 #endif
3699     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3700     return stopnow;
3701 }
3702
3703 /* REx optimizer.  Converts nodes into quicker variants "in place".
3704    Finds fixed substrings.  */
3705
3706 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3707    to the position after last scanned or to NULL. */
3708
3709 #define INIT_AND_WITHP \
3710     assert(!and_withp); \
3711     Newx(and_withp,1, regnode_ssc); \
3712     SAVEFREEPV(and_withp)
3713
3714
3715 static void
3716 S_unwind_scan_frames(pTHX_ const void *p)
3717 {
3718     scan_frame *f= (scan_frame *)p;
3719     do {
3720         scan_frame *n= f->next_frame;
3721         Safefree(f);
3722         f= n;
3723     } while (f);
3724 }
3725
3726
3727 STATIC SSize_t
3728 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3729                         SSize_t *minlenp, SSize_t *deltap,
3730                         regnode *last,
3731                         scan_data_t *data,
3732                         I32 stopparen,
3733                         U32 recursed_depth,
3734                         regnode_ssc *and_withp,
3735                         U32 flags, U32 depth)
3736                         /* scanp: Start here (read-write). */
3737                         /* deltap: Write maxlen-minlen here. */
3738                         /* last: Stop before this one. */
3739                         /* data: string data about the pattern */
3740                         /* stopparen: treat close N as END */
3741                         /* recursed: which subroutines have we recursed into */
3742                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3743 {
3744     /* There must be at least this number of characters to match */
3745     SSize_t min = 0;
3746     I32 pars = 0, code;
3747     regnode *scan = *scanp, *next;
3748     SSize_t delta = 0;
3749     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3750     int is_inf_internal = 0;            /* The studied chunk is infinite */
3751     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3752     scan_data_t data_fake;
3753     SV *re_trie_maxbuff = NULL;
3754     regnode *first_non_open = scan;
3755     SSize_t stopmin = SSize_t_MAX;
3756     scan_frame *frame = NULL;
3757     GET_RE_DEBUG_FLAGS_DECL;
3758
3759     PERL_ARGS_ASSERT_STUDY_CHUNK;
3760
3761
3762     if ( depth == 0 ) {
3763         while (first_non_open && OP(first_non_open) == OPEN)
3764             first_non_open=regnext(first_non_open);
3765     }
3766
3767
3768   fake_study_recurse:
3769     DEBUG_r(
3770         RExC_study_chunk_recursed_count++;
3771     );
3772     DEBUG_OPTIMISE_MORE_r(
3773     {
3774         PerlIO_printf(Perl_debug_log,
3775             "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3776             ((int) depth*2), "", (long)stopparen,
3777             (unsigned long)RExC_study_chunk_recursed_count,
3778             (unsigned long)depth, (unsigned long)recursed_depth,
3779             scan,
3780             last);
3781         if (recursed_depth) {
3782             U32 i;
3783             U32 j;
3784             for ( j = 0 ; j < recursed_depth ; j++ ) {
3785                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3786                     if (
3787                         PAREN_TEST(RExC_study_chunk_recursed +
3788                                    ( j * RExC_study_chunk_recursed_bytes), i )
3789                         && (
3790                             !j ||
3791                             !PAREN_TEST(RExC_study_chunk_recursed +
3792                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3793                         )
3794                     ) {
3795                         PerlIO_printf(Perl_debug_log," %d",i);
3796                         break;
3797                     }
3798                 }
3799                 if ( j + 1 < recursed_depth ) {
3800                     PerlIO_printf(Perl_debug_log, ",");
3801                 }
3802             }
3803         }
3804         PerlIO_printf(Perl_debug_log,"\n");
3805     }
3806     );
3807     while ( scan && OP(scan) != END && scan < last ){
3808         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3809                                    node length to get a real minimum (because
3810                                    the folded version may be shorter) */
3811         bool unfolded_multi_char = FALSE;
3812         /* Peephole optimizer: */
3813         DEBUG_STUDYDATA("Peep:", data, depth);
3814         DEBUG_PEEP("Peep", scan, depth);
3815
3816
3817         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3818          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3819          * by a different invocation of reg() -- Yves
3820          */
3821         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3822
3823         /* Follow the next-chain of the current node and optimize
3824            away all the NOTHINGs from it.  */
3825         if (OP(scan) != CURLYX) {
3826             const int max = (reg_off_by_arg[OP(scan)]
3827                        ? I32_MAX
3828                        /* I32 may be smaller than U16 on CRAYs! */
3829                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3830             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3831             int noff;
3832             regnode *n = scan;
3833
3834             /* Skip NOTHING and LONGJMP. */
3835             while ((n = regnext(n))
3836                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3837                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3838                    && off + noff < max)
3839                 off += noff;
3840             if (reg_off_by_arg[OP(scan)])
3841                 ARG(scan) = off;
3842             else
3843                 NEXT_OFF(scan) = off;
3844         }
3845
3846         /* The principal pseudo-switch.  Cannot be a switch, since we
3847            look into several different things.  */
3848         if ( OP(scan) == DEFINEP ) {
3849             SSize_t minlen = 0;
3850             SSize_t deltanext = 0;
3851             SSize_t fake_last_close = 0;
3852             I32 f = SCF_IN_DEFINE;
3853
3854             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3855             scan = regnext(scan);
3856             assert( OP(scan) == IFTHEN );
3857             DEBUG_PEEP("expect IFTHEN", scan, depth);
3858
3859             data_fake.last_closep= &fake_last_close;
3860             minlen = *minlenp;
3861             next = regnext(scan);
3862             scan = NEXTOPER(NEXTOPER(scan));
3863             DEBUG_PEEP("scan", scan, depth);
3864             DEBUG_PEEP("next", next, depth);
3865
3866             /* we suppose the run is continuous, last=next...
3867              * NOTE we dont use the return here! */
3868             (void)study_chunk(pRExC_state, &scan, &minlen,
3869                               &deltanext, next, &data_fake, stopparen,
3870                               recursed_depth, NULL, f, depth+1);
3871
3872             scan = next;
3873         } else
3874         if (
3875             OP(scan) == BRANCH  ||
3876             OP(scan) == BRANCHJ ||
3877             OP(scan) == IFTHEN
3878         ) {
3879             next = regnext(scan);
3880             code = OP(scan);
3881
3882             /* The op(next)==code check below is to see if we
3883              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3884              * IFTHEN is special as it might not appear in pairs.
3885              * Not sure whether BRANCH-BRANCHJ is possible, regardless
3886              * we dont handle it cleanly. */
3887             if (OP(next) == code || code == IFTHEN) {
3888                 /* NOTE - There is similar code to this block below for
3889                  * handling TRIE nodes on a re-study.  If you change stuff here
3890                  * check there too. */
3891                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3892                 regnode_ssc accum;
3893                 regnode * const startbranch=scan;
3894
3895                 if (flags & SCF_DO_SUBSTR) {
3896                     /* Cannot merge strings after this. */
3897                     scan_commit(pRExC_state, data, minlenp, is_inf);
3898                 }
3899
3900                 if (flags & SCF_DO_STCLASS)
3901                     ssc_init_zero(pRExC_state, &accum);
3902
3903                 while (OP(scan) == code) {
3904                     SSize_t deltanext, minnext, fake;
3905                     I32 f = 0;
3906                     regnode_ssc this_class;
3907
3908                     DEBUG_PEEP("Branch", scan, depth);
3909
3910                     num++;
3911                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3912                     if (data) {
3913                         data_fake.whilem_c = data->whilem_c;
3914                         data_fake.last_closep = data->last_closep;
3915                     }
3916                     else
3917                         data_fake.last_closep = &fake;
3918
3919                     data_fake.pos_delta = delta;
3920                     next = regnext(scan);
3921
3922                     scan = NEXTOPER(scan); /* everything */
3923                     if (code != BRANCH)    /* everything but BRANCH */
3924                         scan = NEXTOPER(scan);
3925
3926                     if (flags & SCF_DO_STCLASS) {
3927                         ssc_init(pRExC_state, &this_class);
3928                         data_fake.start_class = &this_class;
3929                         f = SCF_DO_STCLASS_AND;
3930                     }
3931                     if (flags & SCF_WHILEM_VISITED_POS)
3932                         f |= SCF_WHILEM_VISITED_POS;
3933
3934                     /* we suppose the run is continuous, last=next...*/
3935                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3936                                       &deltanext, next, &data_fake, stopparen,
3937                                       recursed_depth, NULL, f,depth+1);
3938
3939                     if (min1 > minnext)
3940                         min1 = minnext;
3941                     if (deltanext == SSize_t_MAX) {
3942                         is_inf = is_inf_internal = 1;
3943                         max1 = SSize_t_MAX;
3944                     } else if (max1 < minnext + deltanext)
3945                         max1 = minnext + deltanext;
3946                     scan = next;
3947                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3948                         pars++;
3949                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3950                         if ( stopmin > minnext)
3951                             stopmin = min + min1;
3952                         flags &= ~SCF_DO_SUBSTR;
3953                         if (data)
3954                             data->flags |= SCF_SEEN_ACCEPT;
3955                     }
3956                     if (data) {
3957                         if (data_fake.flags & SF_HAS_EVAL)
3958                             data->flags |= SF_HAS_EVAL;
3959                         data->whilem_c = data_fake.whilem_c;
3960                     }
3961                     if (flags & SCF_DO_STCLASS)
3962                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3963                 }
3964                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3965                     min1 = 0;
3966                 if (flags & SCF_DO_SUBSTR) {
3967                     data->pos_min += min1;
3968                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3969                         data->pos_delta = SSize_t_MAX;
3970                     else
3971                         data->pos_delta += max1 - min1;
3972                     if (max1 != min1 || is_inf)
3973                         data->longest = &(data->longest_float);
3974                 }
3975                 min += min1;
3976                 if (delta == SSize_t_MAX
3977                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3978                     delta = SSize_t_MAX;
3979                 else
3980                     delta += max1 - min1;
3981                 if (flags & SCF_DO_STCLASS_OR) {
3982                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3983                     if (min1) {
3984                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3985                         flags &= ~SCF_DO_STCLASS;
3986                     }
3987                 }
3988                 else if (flags & SCF_DO_STCLASS_AND) {
3989                     if (min1) {
3990                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3991                         flags &= ~SCF_DO_STCLASS;
3992                     }
3993                     else {
3994                         /* Switch to OR mode: cache the old value of
3995                          * data->start_class */
3996                         INIT_AND_WITHP;
3997                         StructCopy(data->start_class, and_withp, regnode_ssc);
3998                         flags &= ~SCF_DO_STCLASS_AND;
3999                         StructCopy(&accum, data->start_class, regnode_ssc);
4000                         flags |= SCF_DO_STCLASS_OR;
4001                     }
4002                 }
4003
4004                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4005                         OP( startbranch ) == BRANCH )
4006                 {
4007                 /* demq.
4008
4009                    Assuming this was/is a branch we are dealing with: 'scan'
4010                    now points at the item that follows the branch sequence,
4011                    whatever it is. We now start at the beginning of the
4012                    sequence and look for subsequences of
4013
4014                    BRANCH->EXACT=>x1
4015                    BRANCH->EXACT=>x2
4016                    tail
4017
4018                    which would be constructed from a pattern like
4019                    /A|LIST|OF|WORDS/
4020
4021                    If we can find such a subsequence we need to turn the first
4022                    element into a trie and then add the subsequent branch exact
4023                    strings to the trie.
4024
4025                    We have two cases
4026
4027                      1. patterns where the whole set of branches can be
4028                         converted.
4029
4030                      2. patterns where only a subset can be converted.
4031
4032                    In case 1 we can replace the whole set with a single regop
4033                    for the trie. In case 2 we need to keep the start and end
4034                    branches so
4035
4036                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4037                      becomes BRANCH TRIE; BRANCH X;
4038
4039                   There is an additional case, that being where there is a
4040                   common prefix, which gets split out into an EXACT like node
4041                   preceding the TRIE node.
4042
4043                   If x(1..n)==tail then we can do a simple trie, if not we make
4044                   a "jump" trie, such that when we match the appropriate word
4045                   we "jump" to the appropriate tail node. Essentially we turn
4046                   a nested if into a case structure of sorts.
4047
4048                 */
4049
4050                     int made=0;
4051                     if (!re_trie_maxbuff) {
4052                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4053                         if (!SvIOK(re_trie_maxbuff))
4054                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4055                     }
4056                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4057                         regnode *cur;
4058                         regnode *first = (regnode *)NULL;
4059                         regnode *last = (regnode *)NULL;
4060                         regnode *tail = scan;
4061                         U8 trietype = 0;
4062                         U32 count=0;
4063
4064                         /* var tail is used because there may be a TAIL
4065                            regop in the way. Ie, the exacts will point to the
4066                            thing following the TAIL, but the last branch will
4067                            point at the TAIL. So we advance tail. If we
4068                            have nested (?:) we may have to move through several
4069                            tails.
4070                          */
4071
4072                         while ( OP( tail ) == TAIL ) {
4073                             /* this is the TAIL generated by (?:) */
4074                             tail = regnext( tail );
4075                         }
4076
4077
4078                         DEBUG_TRIE_COMPILE_r({
4079                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4080                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4081                               (int)depth * 2 + 2, "",
4082                               "Looking for TRIE'able sequences. Tail node is: ",
4083                               SvPV_nolen_const( RExC_mysv )
4084                             );
4085                         });
4086
4087                         /*
4088
4089                             Step through the branches
4090                                 cur represents each branch,
4091                                 noper is the first thing to be matched as part
4092                                       of that branch
4093                                 noper_next is the regnext() of that node.
4094
4095                             We normally handle a case like this
4096                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4097                             support building with NOJUMPTRIE, which restricts
4098                             the trie logic to structures like /FOO|BAR/.
4099
4100                             If noper is a trieable nodetype then the branch is
4101                             a possible optimization target. If we are building
4102                             under NOJUMPTRIE then we require that noper_next is
4103                             the same as scan (our current position in the regex
4104                             program).
4105
4106                             Once we have two or more consecutive such branches
4107                             we can create a trie of the EXACT's contents and
4108                             stitch it in place into the program.
4109
4110                             If the sequence represents all of the branches in
4111                             the alternation we replace the entire thing with a
4112                             single TRIE node.
4113
4114                             Otherwise when it is a subsequence we need to
4115                             stitch it in place and replace only the relevant
4116                             branches. This means the first branch has to remain
4117                             as it is used by the alternation logic, and its
4118                             next pointer, and needs to be repointed at the item
4119                             on the branch chain following the last branch we
4120                             have optimized away.
4121
4122                             This could be either a BRANCH, in which case the
4123                             subsequence is internal, or it could be the item
4124                             following the branch sequence in which case the
4125                             subsequence is at the end (which does not
4126                             necessarily mean the first node is the start of the
4127                             alternation).
4128
4129                             TRIE_TYPE(X) is a define which maps the optype to a
4130                             trietype.
4131
4132                                 optype          |  trietype
4133                                 ----------------+-----------
4134                                 NOTHING         | NOTHING
4135                                 EXACT           | EXACT
4136                                 EXACTFU         | EXACTFU
4137                                 EXACTFU_SS      | EXACTFU
4138                                 EXACTFA         | EXACTFA
4139
4140
4141                         */
4142 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
4143                        ( EXACT == (X) )   ? EXACT :        \
4144                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
4145                        ( EXACTFA == (X) ) ? EXACTFA :        \
4146                        0 )
4147
4148                         /* dont use tail as the end marker for this traverse */
4149                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4150                             regnode * const noper = NEXTOPER( cur );
4151                             U8 noper_type = OP( noper );
4152                             U8 noper_trietype = TRIE_TYPE( noper_type );
4153 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4154                             regnode * const noper_next = regnext( noper );
4155                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4156                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4157 #endif
4158
4159                             DEBUG_TRIE_COMPILE_r({
4160                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4161                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4162                                    (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4163
4164                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4165                                 PerlIO_printf( Perl_debug_log, " -> %s",
4166                                     SvPV_nolen_const(RExC_mysv));
4167
4168                                 if ( noper_next ) {
4169                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4170                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4171                                     SvPV_nolen_const(RExC_mysv));
4172                                 }
4173                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4174                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4175                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4176                                 );
4177                             });
4178
4179                             /* Is noper a trieable nodetype that can be merged
4180                              * with the current trie (if there is one)? */
4181                             if ( noper_trietype
4182                                   &&
4183                                   (
4184                                         ( noper_trietype == NOTHING)
4185                                         || ( trietype == NOTHING )
4186                                         || ( trietype == noper_trietype )
4187                                   )
4188 #ifdef NOJUMPTRIE
4189                                   && noper_next == tail
4190 #endif
4191                                   && count < U16_MAX)
4192                             {
4193                                 /* Handle mergable triable node Either we are
4194                                  * the first node in a new trieable sequence,
4195                                  * in which case we do some bookkeeping,
4196                                  * otherwise we update the end pointer. */
4197                                 if ( !first ) {
4198                                     first = cur;
4199                                     if ( noper_trietype == NOTHING ) {
4200 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4201                                         regnode * const noper_next = regnext( noper );
4202                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4203                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4204 #endif
4205
4206                                         if ( noper_next_trietype ) {
4207                                             trietype = noper_next_trietype;
4208                                         } else if (noper_next_type)  {
4209                                             /* a NOTHING regop is 1 regop wide.
4210                                              * We need at least two for a trie
4211                                              * so we can't merge this in */
4212                                             first = NULL;
4213                                         }
4214                                     } else {
4215                                         trietype = noper_trietype;
4216                                     }
4217                                 } else {
4218                                     if ( trietype == NOTHING )
4219                                         trietype = noper_trietype;
4220                                     last = cur;
4221                                 }
4222                                 if (first)
4223                                     count++;
4224                             } /* end handle mergable triable node */
4225                             else {
4226                                 /* handle unmergable node -
4227                                  * noper may either be a triable node which can
4228                                  * not be tried together with the current trie,
4229                                  * or a non triable node */
4230                                 if ( last ) {
4231                                     /* If last is set and trietype is not
4232                                      * NOTHING then we have found at least two
4233                                      * triable branch sequences in a row of a
4234                                      * similar trietype so we can turn them
4235                                      * into a trie. If/when we allow NOTHING to
4236                                      * start a trie sequence this condition
4237                                      * will be required, and it isn't expensive
4238                                      * so we leave it in for now. */
4239                                     if ( trietype && trietype != NOTHING )
4240                                         make_trie( pRExC_state,
4241                                                 startbranch, first, cur, tail,
4242                                                 count, trietype, depth+1 );
4243                                     last = NULL; /* note: we clear/update
4244                                                     first, trietype etc below,
4245                                                     so we dont do it here */
4246                                 }
4247                                 if ( noper_trietype
4248 #ifdef NOJUMPTRIE
4249                                      && noper_next == tail
4250 #endif
4251                                 ){
4252                                     /* noper is triable, so we can start a new
4253                                      * trie sequence */
4254                                     count = 1;
4255                                     first = cur;
4256                                     trietype = noper_trietype;
4257                                 } else if (first) {
4258                                     /* if we already saw a first but the
4259                                      * current node is not triable then we have
4260                                      * to reset the first information. */
4261                                     count = 0;
4262                                     first = NULL;
4263                                     trietype = 0;
4264                                 }
4265                             } /* end handle unmergable node */
4266                         } /* loop over branches */
4267                         DEBUG_TRIE_COMPILE_r({
4268                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4269                             PerlIO_printf( Perl_debug_log,
4270                               "%*s- %s (%d) <SCAN FINISHED>\n",
4271                               (int)depth * 2 + 2,
4272                               "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4273
4274                         });
4275                         if ( last && trietype ) {
4276                             if ( trietype != NOTHING ) {
4277                                 /* the last branch of the sequence was part of
4278                                  * a trie, so we have to construct it here
4279                                  * outside of the loop */
4280                                 made= make_trie( pRExC_state, startbranch,
4281                                                  first, scan, tail, count,
4282                                                  trietype, depth+1 );
4283 #ifdef TRIE_STUDY_OPT
4284                                 if ( ((made == MADE_EXACT_TRIE &&
4285                                      startbranch == first)
4286                                      || ( first_non_open == first )) &&
4287                                      depth==0 ) {
4288                                     flags |= SCF_TRIE_RESTUDY;
4289                                     if ( startbranch == first
4290                                          && scan == tail )
4291                                     {
4292                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4293                                     }
4294                                 }
4295 #endif
4296                             } else {
4297                                 /* at this point we know whatever we have is a
4298                                  * NOTHING sequence/branch AND if 'startbranch'
4299                                  * is 'first' then we can turn the whole thing
4300                                  * into a NOTHING
4301                                  */
4302                                 if ( startbranch == first ) {
4303                                     regnode *opt;
4304                                     /* the entire thing is a NOTHING sequence,
4305                                      * something like this: (?:|) So we can
4306                                      * turn it into a plain NOTHING op. */
4307                                     DEBUG_TRIE_COMPILE_r({
4308                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4309                                         PerlIO_printf( Perl_debug_log,
4310                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4311                                           "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4312
4313                                     });
4314                                     OP(startbranch)= NOTHING;
4315                                     NEXT_OFF(startbranch)= tail - startbranch;
4316                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4317                                         OP(opt)= OPTIMIZED;
4318                                 }
4319                             }
4320                         } /* end if ( last) */
4321                     } /* TRIE_MAXBUF is non zero */
4322
4323                 } /* do trie */
4324
4325             }
4326             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4327                 scan = NEXTOPER(NEXTOPER(scan));
4328             } else                      /* single branch is optimized. */
4329                 scan = NEXTOPER(scan);
4330             continue;
4331         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4332             I32 paren = 0;
4333             regnode *start = NULL;
4334             regnode *end = NULL;
4335             U32 my_recursed_depth= recursed_depth;
4336
4337
4338             if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4339                 /* Do setup, note this code has side effects beyond
4340                  * the rest of this block. Specifically setting
4341                  * RExC_recurse[] must happen at least once during
4342                  * study_chunk(). */
4343                 if (OP(scan) == GOSUB) {
4344                     paren = ARG(scan);
4345                     RExC_recurse[ARG2L(scan)] = scan;
4346                     start = RExC_open_parens[paren-1];
4347                     end   = RExC_close_parens[paren-1];
4348                 } else {
4349                     start = RExC_rxi->program + 1;
4350                     end   = RExC_opend;
4351                 }
4352                 /* NOTE we MUST always execute the above code, even
4353                  * if we do nothing with a GOSUB/GOSTART */
4354                 if (
4355                     ( flags & SCF_IN_DEFINE )
4356                     ||
4357                     (
4358                         (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4359                         &&
4360                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4361                     )
4362                 ) {
4363                     /* no need to do anything here if we are in a define. */
4364                     /* or we are after some kind of infinite construct
4365                      * so we can skip recursing into this item.
4366                      * Since it is infinite we will not change the maxlen
4367                      * or delta, and if we miss something that might raise
4368                      * the minlen it will merely pessimise a little.
4369                      *
4370                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4371                      * might result in a minlen of 1 and not of 4,
4372                      * but this doesn't make us mismatch, just try a bit
4373                      * harder than we should.
4374                      * */
4375                     scan= regnext(scan);
4376                     continue;
4377                 }
4378
4379                 if (
4380                     !recursed_depth
4381                     ||
4382                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4383                 ) {
4384                     /* it is quite possible that there are more efficient ways
4385                      * to do this. We maintain a bitmap per level of recursion
4386                      * of which patterns we have entered so we can detect if a
4387                      * pattern creates a possible infinite loop. When we
4388                      * recurse down a level we copy the previous levels bitmap
4389                      * down. When we are at recursion level 0 we zero the top
4390                      * level bitmap. It would be nice to implement a different
4391                      * more efficient way of doing this. In particular the top
4392                      * level bitmap may be unnecessary.
4393                      */
4394                     if (!recursed_depth) {
4395                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4396                     } else {
4397                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4398                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4399                              RExC_study_chunk_recursed_bytes, U8);
4400                     }
4401                     /* we havent recursed into this paren yet, so recurse into it */
4402                     DEBUG_STUDYDATA("set:", data,depth);
4403                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4404                     my_recursed_depth= recursed_depth + 1;
4405                 } else {
4406                     DEBUG_STUDYDATA("inf:", data,depth);
4407                     /* some form of infinite recursion, assume infinite length
4408                      * */
4409                     if (flags & SCF_DO_SUBSTR) {
4410                         scan_commit(pRExC_state, data, minlenp, is_inf);
4411                         data->longest = &(data->longest_float);
4412                     }
4413                     is_inf = is_inf_internal = 1;
4414                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4415                         ssc_anything(data->start_class);
4416                     flags &= ~SCF_DO_STCLASS;
4417
4418                     start= NULL; /* reset start so we dont recurse later on. */
4419                 }
4420             } else {
4421                 paren = stopparen;
4422                 start = scan + 2;
4423                 end = regnext(scan);
4424             }
4425             if (start) {
4426                 scan_frame *newframe;
4427                 assert(end);
4428                 if (!RExC_frame_last) {
4429                     Newxz(newframe, 1, scan_frame);
4430                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4431                     RExC_frame_head= newframe;
4432                     RExC_frame_count++;
4433                 } else if (!RExC_frame_last->next_frame) {
4434                     Newxz(newframe,1,scan_frame);
4435                     RExC_frame_last->next_frame= newframe;
4436                     newframe->prev_frame= RExC_frame_last;
4437                     RExC_frame_count++;
4438                 } else {
4439                     newframe= RExC_frame_last->next_frame;
4440                 }
4441                 RExC_frame_last= newframe;
4442
4443                 newframe->next_regnode = regnext(scan);
4444                 newframe->last_regnode = last;
4445                 newframe->stopparen = stopparen;
4446                 newframe->prev_recursed_depth = recursed_depth;
4447                 newframe->this_prev_frame= frame;
4448
4449                 DEBUG_STUDYDATA("frame-new:",data,depth);
4450                 DEBUG_PEEP("fnew", scan, depth);
4451
4452                 frame = newframe;
4453                 scan =  start;
4454                 stopparen = paren;
4455                 last = end;
4456                 depth = depth + 1;
4457                 recursed_depth= my_recursed_depth;
4458
4459                 continue;
4460             }
4461         }
4462         else if (OP(scan) == EXACT) {
4463             SSize_t l = STR_LEN(scan);
4464             UV uc;
4465             if (UTF) {
4466                 const U8 * const s = (U8*)STRING(scan);
4467                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4468                 l = utf8_length(s, s + l);
4469             } else {
4470                 uc = *((U8*)STRING(scan));
4471             }
4472             min += l;
4473             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4474                 /* The code below prefers earlier match for fixed
4475                    offset, later match for variable offset.  */
4476                 if (data->last_end == -1) { /* Update the start info. */
4477                     data->last_start_min = data->pos_min;
4478                     data->last_start_max = is_inf
4479                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4480                 }
4481                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4482                 if (UTF)
4483                     SvUTF8_on(data->last_found);
4484                 {
4485                     SV * const sv = data->last_found;
4486                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4487                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4488                     if (mg && mg->mg_len >= 0)
4489                         mg->mg_len += utf8_length((U8*)STRING(scan),
4490                                               (U8*)STRING(scan)+STR_LEN(scan));
4491                 }
4492                 data->last_end = data->pos_min + l;
4493                 data->pos_min += l; /* As in the first entry. */
4494                 data->flags &= ~SF_BEFORE_EOL;
4495             }
4496
4497             /* ANDing the code point leaves at most it, and not in locale, and
4498              * can't match null string */
4499             if (flags & SCF_DO_STCLASS_AND) {
4500                 ssc_cp_and(data->start_class, uc);
4501                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4502                 ssc_clear_locale(data->start_class);
4503             }
4504             else if (flags & SCF_DO_STCLASS_OR) {
4505                 ssc_add_cp(data->start_class, uc);
4506                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4507
4508                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4509                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4510             }
4511             flags &= ~SCF_DO_STCLASS;
4512         }
4513         else if (PL_regkind[OP(scan)] == EXACT) {
4514             /* But OP != EXACT!, so is EXACTFish */
4515             SSize_t l = STR_LEN(scan);
4516             UV uc = *((U8*)STRING(scan));
4517             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4518                                                      separate code points */
4519             const U8 * s = (U8*)STRING(scan);
4520
4521             /* Search for fixed substrings supports EXACT only. */
4522             if (flags & SCF_DO_SUBSTR) {
4523                 assert(data);
4524                 scan_commit(pRExC_state, data, minlenp, is_inf);
4525             }
4526             if (UTF) {
4527                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4528                 l = utf8_length(s, s + l);
4529             }
4530             if (unfolded_multi_char) {
4531                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4532             }
4533             min += l - min_subtract;
4534             assert (min >= 0);
4535             delta += min_subtract;
4536             if (flags & SCF_DO_SUBSTR) {
4537                 data->pos_min += l - min_subtract;
4538                 if (data->pos_min < 0) {
4539                     data->pos_min = 0;
4540                 }
4541                 data->pos_delta += min_subtract;
4542                 if (min_subtract) {
4543                     data->longest = &(data->longest_float);
4544                 }
4545             }
4546
4547             if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4548                 ssc_clear_locale(data->start_class);
4549             }
4550
4551             if (! UTF) {
4552
4553                 /* We punt and assume can match anything if the node begins
4554                  * with a multi-character fold.  Things are complicated.  For
4555                  * example, /ffi/i could match any of:
4556                  *  "\N{LATIN SMALL LIGATURE FFI}"
4557                  *  "\N{LATIN SMALL LIGATURE FF}I"
4558                  *  "F\N{LATIN SMALL LIGATURE FI}"
4559                  *  plus several other things; and making sure we have all the
4560                  *  possibilities is hard. */
4561                 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4562                     EXACTF_invlist =
4563                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4564                 }
4565                 else {
4566
4567                     /* Any Latin1 range character can potentially match any
4568                      * other depending on the locale */
4569                     if (OP(scan) == EXACTFL) {
4570                         _invlist_union(EXACTF_invlist, PL_Latin1,
4571                                                               &EXACTF_invlist);
4572                     }
4573                     else {
4574                         /* But otherwise, it matches at least itself.  We can
4575                          * quickly tell if it has a distinct fold, and if so,
4576                          * it matches that as well */
4577                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4578                         if (IS_IN_SOME_FOLD_L1(uc)) {
4579                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4580                                                            PL_fold_latin1[uc]);
4581                         }
4582                     }
4583
4584                     /* Some characters match above-Latin1 ones under /i.  This
4585                      * is true of EXACTFL ones when the locale is UTF-8 */
4586                     if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4587                         && (! isASCII(uc) || (OP(scan) != EXACTFA
4588                                             && OP(scan) != EXACTFA_NO_TRIE)))
4589                     {
4590                         add_above_Latin1_folds(pRExC_state,
4591                                                (U8) uc,
4592                                                &EXACTF_invlist);
4593                     }
4594                 }
4595             }
4596             else {  /* Pattern is UTF-8 */
4597                 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4598                 STRLEN foldlen = UTF8SKIP(s);
4599                 const U8* e = s + STR_LEN(scan);
4600                 SV** listp;
4601
4602                 /* The only code points that aren't folded in a UTF EXACTFish
4603                  * node are are the problematic ones in EXACTFL nodes */
4604                 if (OP(scan) == EXACTFL
4605                     && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4606                 {
4607                     /* We need to check for the possibility that this EXACTFL
4608                      * node begins with a multi-char fold.  Therefore we fold
4609                      * the first few characters of it so that we can make that
4610                      * check */
4611                     U8 *d = folded;
4612                     int i;
4613
4614                     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4615                         if (isASCII(*s)) {
4616                             *(d++) = (U8) toFOLD(*s);
4617                             s++;
4618                         }
4619                         else {
4620                             STRLEN len;
4621                             to_utf8_fold(s, d, &len);
4622                             d += len;
4623                             s += UTF8SKIP(s);
4624                         }
4625                     }
4626
4627                     /* And set up so the code below that looks in this folded
4628                      * buffer instead of the node's string */
4629                     e = d;
4630                     foldlen = UTF8SKIP(folded);
4631                     s = folded;
4632                 }
4633
4634                 /* When we reach here 's' points to the fold of the first
4635                  * character(s) of the node; and 'e' points to far enough along
4636                  * the folded string to be just past any possible multi-char
4637                  * fold. 'foldlen' is the length in bytes of the first
4638                  * character in 's'
4639                  *
4640                  * Unlike the non-UTF-8 case, the macro for determining if a
4641                  * string is a multi-char fold requires all the characters to
4642                  * already be folded.  This is because of all the complications
4643                  * if not.  Note that they are folded anyway, except in EXACTFL
4644                  * nodes.  Like the non-UTF case above, we punt if the node
4645                  * begins with a multi-char fold  */
4646
4647                 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4648                     EXACTF_invlist =
4649                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4650                 }
4651                 else {  /* Single char fold */
4652
4653                     /* It matches all the things that fold to it, which are
4654                      * found in PL_utf8_foldclosures (including itself) */
4655                     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4656                     if (! PL_utf8_foldclosures) {
4657                         _load_PL_utf8_foldclosures();
4658                     }
4659                     if ((listp = hv_fetch(PL_utf8_foldclosures,
4660                                         (char *) s, foldlen, FALSE)))
4661                     {
4662                         AV* list = (AV*) *listp;
4663                         IV k;
4664                         for (k = 0; k <= av_tindex(list); k++) {
4665                             SV** c_p = av_fetch(list, k, FALSE);
4666                             UV c;
4667                             assert(c_p);
4668
4669                             c = SvUV(*c_p);
4670
4671                             /* /aa doesn't allow folds between ASCII and non- */
4672                             if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4673                                 && isASCII(c) != isASCII(uc))
4674                             {
4675                                 continue;
4676                             }
4677
4678                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4679                         }
4680                     }
4681                 }
4682             }
4683             if (flags & SCF_DO_STCLASS_AND) {
4684                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4685                 ANYOF_POSIXL_ZERO(data->start_class);
4686                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4687             }
4688             else if (flags & SCF_DO_STCLASS_OR) {
4689                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4690                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4691
4692                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4693                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4694             }
4695             flags &= ~SCF_DO_STCLASS;
4696             SvREFCNT_dec(EXACTF_invlist);
4697         }
4698         else if (REGNODE_VARIES(OP(scan))) {
4699             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4700             I32 fl = 0, f = flags;
4701             regnode * const oscan = scan;
4702             regnode_ssc this_class;
4703             regnode_ssc *oclass = NULL;
4704             I32 next_is_eval = 0;
4705
4706             switch (PL_regkind[OP(scan)]) {
4707             case WHILEM:                /* End of (?:...)* . */
4708                 scan = NEXTOPER(scan);
4709                 goto finish;
4710             case PLUS:
4711                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4712                     next = NEXTOPER(scan);
4713                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4714                         mincount = 1;
4715                         maxcount = REG_INFTY;
4716                         next = regnext(scan);
4717                         scan = NEXTOPER(scan);
4718                         goto do_curly;
4719                     }
4720                 }
4721                 if (flags & SCF_DO_SUBSTR)
4722                     data->pos_min++;
4723                 min++;
4724                 /* FALLTHROUGH */
4725             case STAR:
4726                 if (flags & SCF_DO_STCLASS) {
4727                     mincount = 0;
4728                     maxcount = REG_INFTY;
4729                     next = regnext(scan);
4730                     scan = NEXTOPER(scan);
4731                     goto do_curly;
4732                 }
4733                 if (flags & SCF_DO_SUBSTR) {
4734                     scan_commit(pRExC_state, data, minlenp, is_inf);
4735                     /* Cannot extend fixed substrings */
4736                     data->longest = &(data->longest_float);
4737                 }
4738                 is_inf = is_inf_internal = 1;
4739                 scan = regnext(scan);
4740                 goto optimize_curly_tail;
4741             case CURLY:
4742                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4743                     && (scan->flags == stopparen))
4744                 {
4745                     mincount = 1;
4746                     maxcount = 1;
4747                 } else {
4748                     mincount = ARG1(scan);
4749                     maxcount = ARG2(scan);
4750                 }
4751                 next = regnext(scan);
4752                 if (OP(scan) == CURLYX) {
4753                     I32 lp = (data ? *(data->last_closep) : 0);
4754                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4755                 }
4756                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4757                 next_is_eval = (OP(scan) == EVAL);
4758               do_curly:
4759                 if (flags & SCF_DO_SUBSTR) {
4760                     if (mincount == 0)
4761                         scan_commit(pRExC_state, data, minlenp, is_inf);
4762                     /* Cannot extend fixed substrings */
4763                     pos_before = data->pos_min;
4764                 }
4765                 if (data) {
4766                     fl = data->flags;
4767                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4768                     if (is_inf)
4769                         data->flags |= SF_IS_INF;
4770                 }
4771                 if (flags & SCF_DO_STCLASS) {
4772                     ssc_init(pRExC_state, &this_class);
4773                     oclass = data->start_class;
4774                     data->start_class = &this_class;
4775                     f |= SCF_DO_STCLASS_AND;
4776                     f &= ~SCF_DO_STCLASS_OR;
4777                 }
4778                 /* Exclude from super-linear cache processing any {n,m}
4779                    regops for which the combination of input pos and regex
4780                    pos is not enough information to determine if a match
4781                    will be possible.
4782
4783                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4784                    regex pos at the \s*, the prospects for a match depend not
4785                    only on the input position but also on how many (bar\s*)
4786                    repeats into the {4,8} we are. */
4787                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4788                     f &= ~SCF_WHILEM_VISITED_POS;
4789
4790                 /* This will finish on WHILEM, setting scan, or on NULL: */
4791                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4792                                   last, data, stopparen, recursed_depth, NULL,
4793                                   (mincount == 0
4794                                    ? (f & ~SCF_DO_SUBSTR)
4795                                    : f)
4796                                   ,depth+1);
4797
4798                 if (flags & SCF_DO_STCLASS)
4799                     data->start_class = oclass;
4800                 if (mincount == 0 || minnext == 0) {
4801                     if (flags & SCF_DO_STCLASS_OR) {
4802                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4803                     }
4804                     else if (flags & SCF_DO_STCLASS_AND) {
4805                         /* Switch to OR mode: cache the old value of
4806                          * data->start_class */
4807                         INIT_AND_WITHP;
4808                         StructCopy(data->start_class, and_withp, regnode_ssc);
4809                         flags &= ~SCF_DO_STCLASS_AND;
4810                         StructCopy(&this_class, data->start_class, regnode_ssc);
4811                         flags |= SCF_DO_STCLASS_OR;
4812                         ANYOF_FLAGS(data->start_class)
4813                                                 |= SSC_MATCHES_EMPTY_STRING;
4814                     }
4815                 } else {                /* Non-zero len */
4816                     if (flags & SCF_DO_STCLASS_OR) {
4817                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4818                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4819                     }
4820                     else if (flags & SCF_DO_STCLASS_AND)
4821                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4822                     flags &= ~SCF_DO_STCLASS;
4823                 }
4824                 if (!scan)              /* It was not CURLYX, but CURLY. */
4825                     scan = next;
4826                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4827                     /* ? quantifier ok, except for (?{ ... }) */
4828                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4829                     && (minnext == 0) && (deltanext == 0)
4830                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4831                     && maxcount <= REG_INFTY/3) /* Complement check for big
4832                                                    count */
4833                 {
4834                     /* Fatal warnings may leak the regexp without this: */
4835                     SAVEFREESV(RExC_rx_sv);
4836                     ckWARNreg(RExC_parse,
4837                             "Quantifier unexpected on zero-length expression");
4838                     (void)ReREFCNT_inc(RExC_rx_sv);
4839                 }
4840
4841                 min += minnext * mincount;
4842                 is_inf_internal |= deltanext == SSize_t_MAX
4843                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4844                 is_inf |= is_inf_internal;
4845                 if (is_inf) {
4846                     delta = SSize_t_MAX;
4847                 } else {
4848                     delta += (minnext + deltanext) * maxcount
4849                              - minnext * mincount;
4850                 }
4851                 /* Try powerful optimization CURLYX => CURLYN. */
4852                 if (  OP(oscan) == CURLYX && data
4853                       && data->flags & SF_IN_PAR
4854                       && !(data->flags & SF_HAS_EVAL)
4855                       && !deltanext && minnext == 1 ) {
4856                     /* Try to optimize to CURLYN.  */
4857                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4858                     regnode * const nxt1 = nxt;
4859 #ifdef DEBUGGING
4860                     regnode *nxt2;
4861 #endif
4862
4863                     /* Skip open. */
4864                     nxt = regnext(nxt);
4865                     if (!REGNODE_SIMPLE(OP(nxt))
4866                         && !(PL_regkind[OP(nxt)] == EXACT
4867                              && STR_LEN(nxt) == 1))
4868                         goto nogo;
4869 #ifdef DEBUGGING
4870                     nxt2 = nxt;
4871 #endif
4872                     nxt = regnext(nxt);
4873                     if (OP(nxt) != CLOSE)
4874                         goto nogo;
4875                     if (RExC_open_parens) {
4876                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4877                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4878                     }
4879                     /* Now we know that nxt2 is the only contents: */
4880                     oscan->flags = (U8)ARG(nxt);
4881                     OP(oscan) = CURLYN;
4882                     OP(nxt1) = NOTHING; /* was OPEN. */
4883
4884 #ifdef DEBUGGING
4885                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4886                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4887                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4888                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4889                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4890                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4891 #endif
4892                 }
4893               nogo:
4894
4895                 /* Try optimization CURLYX => CURLYM. */
4896                 if (  OP(oscan) == CURLYX && data
4897                       && !(data->flags & SF_HAS_PAR)
4898                       && !(data->flags & SF_HAS_EVAL)
4899                       && !deltanext     /* atom is fixed width */
4900                       && minnext != 0   /* CURLYM can't handle zero width */
4901
4902                          /* Nor characters whose fold at run-time may be
4903                           * multi-character */
4904                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4905                 ) {
4906                     /* XXXX How to optimize if data == 0? */
4907                     /* Optimize to a simpler form.  */
4908                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4909                     regnode *nxt2;
4910
4911                     OP(oscan) = CURLYM;
4912                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4913                             && (OP(nxt2) != WHILEM))
4914                         nxt = nxt2;
4915                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4916                     /* Need to optimize away parenths. */
4917                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4918                         /* Set the parenth number.  */
4919                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4920
4921                         oscan->flags = (U8)ARG(nxt);
4922                         if (RExC_open_parens) {
4923                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4924                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4925                         }
4926                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4927                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4928
4929 #ifdef DEBUGGING
4930                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4931                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4932                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4933                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4934 #endif
4935 #if 0
4936                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4937                             regnode *nnxt = regnext(nxt1);
4938                             if (nnxt == nxt) {
4939                                 if (reg_off_by_arg[OP(nxt1)])
4940                                     ARG_SET(nxt1, nxt2 - nxt1);
4941                                 else if (nxt2 - nxt1 < U16_MAX)
4942                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4943                                 else
4944                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4945                             }
4946                             nxt1 = nnxt;
4947                         }
4948 #endif
4949                         /* Optimize again: */
4950                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4951                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4952                     }
4953                     else
4954                         oscan->flags = 0;
4955                 }
4956                 else if ((OP(oscan) == CURLYX)
4957                          && (flags & SCF_WHILEM_VISITED_POS)
4958                          /* See the comment on a similar expression above.
4959                             However, this time it's not a subexpression
4960                             we care about, but the expression itself. */
4961                          && (maxcount == REG_INFTY)
4962                          && data && ++data->whilem_c < 16) {
4963                     /* This stays as CURLYX, we can put the count/of pair. */
4964                     /* Find WHILEM (as in regexec.c) */
4965                     regnode *nxt = oscan + NEXT_OFF(oscan);
4966
4967                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4968                         nxt += ARG(nxt);
4969                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4970                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4971                 }
4972                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4973                     pars++;
4974                 if (flags & SCF_DO_SUBSTR) {
4975                     SV *last_str = NULL;
4976                     STRLEN last_chrs = 0;
4977                     int counted = mincount != 0;
4978
4979                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4980                                                                   string. */
4981                         SSize_t b = pos_before >= data->last_start_min
4982                             ? pos_before : data->last_start_min;
4983                         STRLEN l;
4984                         const char * const s = SvPV_const(data->last_found, l);
4985                         SSize_t old = b - data->last_start_min;
4986
4987                         if (UTF)
4988                             old = utf8_hop((U8*)s, old) - (U8*)s;
4989                         l -= old;
4990                         /* Get the added string: */
4991                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4992                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4993                                             (U8*)(s + old + l)) : l;
4994                         if (deltanext == 0 && pos_before == b) {
4995                             /* What was added is a constant string */
4996                             if (mincount > 1) {
4997
4998                                 SvGROW(last_str, (mincount * l) + 1);
4999                                 repeatcpy(SvPVX(last_str) + l,
5000                                           SvPVX_const(last_str), l,
5001                                           mincount - 1);
5002                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5003                                 /* Add additional parts. */
5004                                 SvCUR_set(data->last_found,
5005                                           SvCUR(data->last_found) - l);
5006                                 sv_catsv(data->last_found, last_str);
5007                                 {
5008                                     SV * sv = data->last_found;
5009                                     MAGIC *mg =
5010                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5011                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5012                                     if (mg && mg->mg_len >= 0)
5013                                         mg->mg_len += last_chrs * (mincount-1);
5014                                 }
5015                                 last_chrs *= mincount;
5016                                 data->last_end += l * (mincount - 1);
5017                             }
5018                         } else {
5019                             /* start offset must point into the last copy */
5020                             data->last_start_min += minnext * (mincount - 1);
5021                             data->last_start_max += is_inf ? SSize_t_MAX
5022                                 : (maxcount - 1) * (minnext + data->pos_delta);
5023                         }
5024                     }
5025                     /* It is counted once already... */
5026                     data->pos_min += minnext * (mincount - counted);
5027 #if 0
5028 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5029                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5030                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
5031     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5032     (UV)mincount);
5033 if (deltanext != SSize_t_MAX)
5034 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5035     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5036           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5037 #endif
5038                     if (deltanext == SSize_t_MAX
5039                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5040                         data->pos_delta = SSize_t_MAX;
5041                     else
5042                         data->pos_delta += - counted * deltanext +
5043                         (minnext + deltanext) * maxcount - minnext * mincount;
5044                     if (mincount != maxcount) {
5045                          /* Cannot extend fixed substrings found inside
5046                             the group.  */
5047                         scan_commit(pRExC_state, data, minlenp, is_inf);
5048                         if (mincount && last_str) {
5049                             SV * const sv = data->last_found;
5050                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5051                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5052
5053                             if (mg)
5054                                 mg->mg_len = -1;
5055                             sv_setsv(sv, last_str);
5056                             data->last_end = data->pos_min;
5057                             data->last_start_min = data->pos_min - last_chrs;
5058                             data->last_start_max = is_inf
5059                                 ? SSize_t_MAX
5060                                 : data->pos_min + data->pos_delta - last_chrs;
5061                         }
5062                         data->longest = &(data->longest_float);
5063                     }
5064                     SvREFCNT_dec(last_str);
5065                 }
5066                 if (data && (fl & SF_HAS_EVAL))
5067                     data->flags |= SF_HAS_EVAL;
5068               optimize_curly_tail:
5069                 if (OP(oscan) != CURLYX) {
5070                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5071                            && NEXT_OFF(next))
5072                         NEXT_OFF(oscan) += NEXT_OFF(next);
5073                 }
5074                 continue;
5075
5076             default:
5077 #ifdef DEBUGGING
5078                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5079                                                                     OP(scan));
5080 #endif
5081             case REF:
5082             case CLUMP:
5083                 if (flags & SCF_DO_SUBSTR) {
5084                     /* Cannot expect anything... */
5085                     scan_commit(pRExC_state, data, minlenp, is_inf);
5086                     data->longest = &(data->longest_float);
5087                 }
5088                 is_inf = is_inf_internal = 1;
5089                 if (flags & SCF_DO_STCLASS_OR) {
5090                     if (OP(scan) == CLUMP) {
5091                         /* Actually is any start char, but very few code points
5092                          * aren't start characters */
5093                         ssc_match_all_cp(data->start_class);
5094                     }
5095                     else {
5096                         ssc_anything(data->start_class);
5097                     }
5098                 }
5099                 flags &= ~SCF_DO_STCLASS;
5100                 break;
5101             }
5102         }
5103         else if (OP(scan) == LNBREAK) {
5104             if (flags & SCF_DO_STCLASS) {
5105                 if (flags & SCF_DO_STCLASS_AND) {
5106                     ssc_intersection(data->start_class,
5107                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5108                     ssc_clear_locale(data->start_class);
5109                     ANYOF_FLAGS(data->start_class)
5110                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5111                 }
5112                 else if (flags & SCF_DO_STCLASS_OR) {
5113                     ssc_union(data->start_class,
5114                               PL_XPosix_ptrs[_CC_VERTSPACE],
5115                               FALSE);
5116                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5117
5118                     /* See commit msg for
5119                      * 749e076fceedeb708a624933726e7989f2302f6a */
5120                     ANYOF_FLAGS(data->start_class)
5121                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5122                 }
5123                 flags &= ~SCF_DO_STCLASS;
5124             }
5125             min++;
5126             delta++;    /* Because of the 2 char string cr-lf */
5127             if (flags & SCF_DO_SUBSTR) {
5128                 /* Cannot expect anything... */
5129                 scan_commit(pRExC_state, data, minlenp, is_inf);
5130                 data->pos_min += 1;
5131                 data->pos_delta += 1;
5132                 data->longest = &(data->longest_float);
5133             }
5134         }
5135         else if (REGNODE_SIMPLE(OP(scan))) {
5136
5137             if (flags & SCF_DO_SUBSTR) {
5138                 scan_commit(pRExC_state, data, minlenp, is_inf);
5139                 data->pos_min++;
5140             }
5141             min++;
5142             if (flags & SCF_DO_STCLASS) {
5143                 bool invert = 0;
5144                 SV* my_invlist = sv_2mortal(_new_invlist(0));
5145                 U8 namedclass;
5146
5147                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5148                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5149
5150                 /* Some of the logic below assumes that switching
5151                    locale on will only add false positives. */
5152                 switch (OP(scan)) {
5153
5154                 default:
5155 #ifdef DEBUGGING
5156                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5157                                                                      OP(scan));
5158 #endif
5159                 case CANY:
5160                 case SANY:
5161                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5162                         ssc_match_all_cp(data->start_class);
5163                     break;
5164
5165                 case REG_ANY:
5166                     {
5167                         SV* REG_ANY_invlist = _new_invlist(2);
5168                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5169                                                             '\n');
5170                         if (flags & SCF_DO_STCLASS_OR) {
5171                             ssc_union(data->start_class,
5172                                       REG_ANY_invlist,
5173                                       TRUE /* TRUE => invert, hence all but \n
5174                                             */
5175                                       );
5176                         }
5177                         else if (flags & SCF_DO_STCLASS_AND) {
5178                             ssc_intersection(data->start_class,
5179                                              REG_ANY_invlist,
5180                                              TRUE  /* TRUE => invert */
5181                                              );
5182                             ssc_clear_locale(data->start_class);
5183                         }
5184                         SvREFCNT_dec_NN(REG_ANY_invlist);
5185                     }
5186                     break;
5187
5188                 case ANYOF:
5189                     if (flags & SCF_DO_STCLASS_AND)
5190                         ssc_and(pRExC_state, data->start_class,
5191                                 (regnode_charclass *) scan);
5192                     else
5193                         ssc_or(pRExC_state, data->start_class,
5194                                                           (regnode_charclass *) scan);
5195                     break;
5196
5197                 case NPOSIXL:
5198                     invert = 1;
5199                     /* FALLTHROUGH */
5200
5201                 case POSIXL:
5202                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5203                     if (flags & SCF_DO_STCLASS_AND) {
5204                         bool was_there = cBOOL(
5205                                           ANYOF_POSIXL_TEST(data->start_class,
5206                                                                  namedclass));
5207                         ANYOF_POSIXL_ZERO(data->start_class);
5208                         if (was_there) {    /* Do an AND */
5209                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5210                         }
5211                         /* No individual code points can now match */
5212                         data->start_class->invlist
5213                                                 = sv_2mortal(_new_invlist(0));
5214                     }
5215                     else {
5216                         int complement = namedclass + ((invert) ? -1 : 1);
5217
5218                         assert(flags & SCF_DO_STCLASS_OR);
5219
5220                         /* If the complement of this class was already there,
5221                          * the result is that they match all code points,
5222                          * (\d + \D == everything).  Remove the classes from
5223                          * future consideration.  Locale is not relevant in
5224                          * this case */
5225                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5226                             ssc_match_all_cp(data->start_class);
5227                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5228                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5229                         }
5230                         else {  /* The usual case; just add this class to the
5231                                    existing set */
5232                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5233                         }
5234                     }
5235                     break;
5236
5237                 case NPOSIXA:   /* For these, we always know the exact set of
5238                                    what's matched */
5239                     invert = 1;
5240                     /* FALLTHROUGH */
5241                 case POSIXA:
5242                     if (FLAGS(scan) == _CC_ASCII) {
5243                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
5244                     }
5245                     else {
5246                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5247                                               PL_XPosix_ptrs[_CC_ASCII],
5248                                               &my_invlist);
5249                     }
5250                     goto join_posix;
5251
5252                 case NPOSIXD:
5253                 case NPOSIXU:
5254                     invert = 1;
5255                     /* FALLTHROUGH */
5256                 case POSIXD:
5257                 case POSIXU:
5258                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5259
5260                     /* NPOSIXD matches all upper Latin1 code points unless the
5261                      * target string being matched is UTF-8, which is
5262                      * unknowable until match time.  Since we are going to
5263                      * invert, we want to get rid of all of them so that the
5264                      * inversion will match all */
5265                     if (OP(scan) == NPOSIXD) {
5266                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5267                                           &my_invlist);
5268                     }
5269
5270                   join_posix:
5271
5272                     if (flags & SCF_DO_STCLASS_AND) {
5273                         ssc_intersection(data->start_class, my_invlist, invert);
5274                         ssc_clear_locale(data->start_class);
5275                     }
5276                     else {
5277                         assert(flags & SCF_DO_STCLASS_OR);
5278                         ssc_union(data->start_class, my_invlist, invert);
5279                     }
5280                 }
5281                 if (flags & SCF_DO_STCLASS_OR)
5282                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5283                 flags &= ~SCF_DO_STCLASS;
5284             }
5285         }
5286         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5287             data->flags |= (OP(scan) == MEOL
5288                             ? SF_BEFORE_MEOL
5289                             : SF_BEFORE_SEOL);
5290             scan_commit(pRExC_state, data, minlenp, is_inf);
5291
5292         }
5293         else if (  PL_regkind[OP(scan)] == BRANCHJ
5294                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5295                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5296                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5297         {
5298             if ( OP(scan) == UNLESSM &&
5299                  scan->flags == 0 &&
5300                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5301                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5302             ) {
5303                 regnode *opt;
5304                 regnode *upto= regnext(scan);
5305                 DEBUG_PARSE_r({
5306                     DEBUG_STUDYDATA("OPFAIL",data,depth);
5307
5308                     /*DEBUG_PARSE_MSG("opfail");*/
5309                     regprop(RExC_rx, RExC_mysv, upto, NULL, pRExC_state);
5310                     PerlIO_printf(Perl_debug_log,
5311                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5312                         SvPV_nolen_const(RExC_mysv),
5313                         (IV)REG_NODE_NUM(upto),
5314                         (IV)(upto - scan)
5315                     );
5316                 });
5317                 OP(scan) = OPFAIL;
5318                 NEXT_OFF(scan) = upto - scan;
5319                 for (opt= scan + 1; opt < upto ; opt++)
5320                     OP(opt) = OPTIMIZED;
5321                 scan= upto;
5322                 continue;
5323             }
5324             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5325                 || OP(scan) == UNLESSM )
5326             {
5327                 /* Negative Lookahead/lookbehind
5328                    In this case we can't do fixed string optimisation.
5329                 */
5330
5331                 SSize_t deltanext, minnext, fake = 0;
5332                 regnode *nscan;
5333                 regnode_ssc intrnl;
5334                 int f = 0;
5335
5336                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5337                 if (data) {
5338                     data_fake.whilem_c = data->whilem_c;
5339                     data_fake.last_closep = data->last_closep;
5340                 }
5341                 else
5342                     data_fake.last_closep = &fake;
5343                 data_fake.pos_delta = delta;
5344                 if ( flags & SCF_DO_STCLASS && !scan->flags
5345                      && OP(scan) == IFMATCH ) { /* Lookahead */
5346                     ssc_init(pRExC_state, &intrnl);
5347                     data_fake.start_class = &intrnl;
5348                     f |= SCF_DO_STCLASS_AND;
5349                 }
5350                 if (flags & SCF_WHILEM_VISITED_POS)
5351                     f |= SCF_WHILEM_VISITED_POS;
5352                 next = regnext(scan);
5353                 nscan = NEXTOPER(NEXTOPER(scan));
5354                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5355                                       last, &data_fake, stopparen,
5356                                       recursed_depth, NULL, f, depth+1);
5357                 if (scan->flags) {
5358                     if (deltanext) {
5359                         FAIL("Variable length lookbehind not implemented");
5360                     }
5361                     else if (minnext > (I32)U8_MAX) {
5362                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5363                               (UV)U8_MAX);
5364                     }
5365                     scan->flags = (U8)minnext;
5366                 }
5367                 if (data) {
5368                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5369                         pars++;
5370                     if (data_fake.flags & SF_HAS_EVAL)
5371                         data->flags |= SF_HAS_EVAL;
5372                     data->whilem_c = data_fake.whilem_c;
5373                 }
5374                 if (f & SCF_DO_STCLASS_AND) {
5375                     if (flags & SCF_DO_STCLASS_OR) {
5376                         /* OR before, AND after: ideally we would recurse with
5377                          * data_fake to get the AND applied by study of the
5378                          * remainder of the pattern, and then derecurse;
5379                          * *** HACK *** for now just treat as "no information".
5380                          * See [perl #56690].
5381                          */
5382                         ssc_init(pRExC_state, data->start_class);
5383                     }  else {
5384                         /* AND before and after: combine and continue.  These
5385                          * assertions are zero-length, so can match an EMPTY
5386                          * string */
5387                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5388                         ANYOF_FLAGS(data->start_class)
5389                                                    |= SSC_MATCHES_EMPTY_STRING;
5390                     }
5391                 }
5392             }
5393 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5394             else {
5395                 /* Positive Lookahead/lookbehind
5396                    In this case we can do fixed string optimisation,
5397                    but we must be careful about it. Note in the case of
5398                    lookbehind the positions will be offset by the minimum
5399                    length of the pattern, something we won't know about
5400                    until after the recurse.
5401                 */
5402                 SSize_t deltanext, fake = 0;
5403                 regnode *nscan;
5404                 regnode_ssc intrnl;
5405                 int f = 0;
5406                 /* We use SAVEFREEPV so that when the full compile
5407                     is finished perl will clean up the allocated
5408                     minlens when it's all done. This way we don't
5409                     have to worry about freeing them when we know
5410                     they wont be used, which would be a pain.
5411                  */
5412                 SSize_t *minnextp;
5413                 Newx( minnextp, 1, SSize_t );
5414                 SAVEFREEPV(minnextp);
5415
5416                 if (data) {
5417                     StructCopy(data, &data_fake, scan_data_t);
5418                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5419                         f |= SCF_DO_SUBSTR;
5420                         if (scan->flags)
5421                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5422                         data_fake.last_found=newSVsv(data->last_found);
5423                     }
5424                 }
5425                 else
5426                     data_fake.last_closep = &fake;
5427                 data_fake.flags = 0;
5428                 data_fake.pos_delta = delta;
5429                 if (is_inf)
5430                     data_fake.flags |= SF_IS_INF;
5431                 if ( flags & SCF_DO_STCLASS && !scan->flags
5432                      && OP(scan) == IFMATCH ) { /* Lookahead */
5433                     ssc_init(pRExC_state, &intrnl);
5434                     data_fake.start_class = &intrnl;
5435                     f |= SCF_DO_STCLASS_AND;
5436                 }
5437                 if (flags & SCF_WHILEM_VISITED_POS)
5438                     f |= SCF_WHILEM_VISITED_POS;
5439                 next = regnext(scan);
5440                 nscan = NEXTOPER(NEXTOPER(scan));
5441
5442                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5443                                         &deltanext, last, &data_fake,
5444                                         stopparen, recursed_depth, NULL,
5445                                         f,depth+1);
5446                 if (scan->flags) {
5447                     if (deltanext) {
5448                         FAIL("Variable length lookbehind not implemented");
5449                     }
5450                     else if (*minnextp > (I32)U8_MAX) {
5451                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5452                               (UV)U8_MAX);
5453                     }
5454                     scan->flags = (U8)*minnextp;
5455                 }
5456
5457                 *minnextp += min;
5458
5459                 if (f & SCF_DO_STCLASS_AND) {
5460                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5461                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5462                 }
5463                 if (data) {
5464                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5465                         pars++;
5466                     if (data_fake.flags & SF_HAS_EVAL)
5467                         data->flags |= SF_HAS_EVAL;
5468                     data->whilem_c = data_fake.whilem_c;
5469                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5470                         if (RExC_rx->minlen<*minnextp)
5471                             RExC_rx->minlen=*minnextp;
5472                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5473                         SvREFCNT_dec_NN(data_fake.last_found);
5474
5475                         if ( data_fake.minlen_fixed != minlenp )
5476                         {
5477                             data->offset_fixed= data_fake.offset_fixed;
5478                             data->minlen_fixed= data_fake.minlen_fixed;
5479                             data->lookbehind_fixed+= scan->flags;
5480                         }
5481                         if ( data_fake.minlen_float != minlenp )
5482                         {
5483                             data->minlen_float= data_fake.minlen_float;
5484                             data->offset_float_min=data_fake.offset_float_min;
5485                             data->offset_float_max=data_fake.offset_float_max;
5486                             data->lookbehind_float+= scan->flags;
5487                         }
5488                     }
5489                 }
5490             }
5491 #endif
5492         }
5493         else if (OP(scan) == OPEN) {
5494             if (stopparen != (I32)ARG(scan))
5495                 pars++;
5496         }
5497         else if (OP(scan) == CLOSE) {
5498             if (stopparen == (I32)ARG(scan)) {
5499                 break;
5500             }
5501             if ((I32)ARG(scan) == is_par) {
5502                 next = regnext(scan);
5503
5504                 if ( next && (OP(next) != WHILEM) && next < last)
5505                     is_par = 0;         /* Disable optimization */
5506             }
5507             if (data)
5508                 *(data->last_closep) = ARG(scan);
5509         }
5510         else if (OP(scan) == EVAL) {
5511                 if (data)
5512                     data->flags |= SF_HAS_EVAL;
5513         }
5514         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5515             if (flags & SCF_DO_SUBSTR) {
5516                 scan_commit(pRExC_state, data, minlenp, is_inf);
5517                 flags &= ~SCF_DO_SUBSTR;
5518             }
5519             if (data && OP(scan)==ACCEPT) {
5520                 data->flags |= SCF_SEEN_ACCEPT;
5521                 if (stopmin > min)
5522                     stopmin = min;
5523             }
5524         }
5525         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5526         {
5527                 if (flags & SCF_DO_SUBSTR) {
5528                     scan_commit(pRExC_state, data, minlenp, is_inf);
5529                     data->longest = &(data->longest_float);
5530                 }
5531                 is_inf = is_inf_internal = 1;
5532                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5533                     ssc_anything(data->start_class);
5534                 flags &= ~SCF_DO_STCLASS;
5535         }
5536         else if (OP(scan) == GPOS) {
5537             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5538                 !(delta || is_inf || (data && data->pos_delta)))
5539             {
5540                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5541                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5542                 if (RExC_rx->gofs < (STRLEN)min)
5543                     RExC_rx->gofs = min;
5544             } else {
5545                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5546                 RExC_rx->gofs = 0;
5547             }
5548         }
5549 #ifdef TRIE_STUDY_OPT
5550 #ifdef FULL_TRIE_STUDY
5551         else if (PL_regkind[OP(scan)] == TRIE) {
5552             /* NOTE - There is similar code to this block above for handling
5553                BRANCH nodes on the initial study.  If you change stuff here
5554                check there too. */
5555             regnode *trie_node= scan;
5556             regnode *tail= regnext(scan);
5557             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5558             SSize_t max1 = 0, min1 = SSize_t_MAX;
5559             regnode_ssc accum;
5560
5561             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5562                 /* Cannot merge strings after this. */
5563                 scan_commit(pRExC_state, data, minlenp, is_inf);
5564             }
5565             if (flags & SCF_DO_STCLASS)
5566                 ssc_init_zero(pRExC_state, &accum);
5567
5568             if (!trie->jump) {
5569                 min1= trie->minlen;
5570                 max1= trie->maxlen;
5571             } else {
5572                 const regnode *nextbranch= NULL;
5573                 U32 word;
5574
5575                 for ( word=1 ; word <= trie->wordcount ; word++)
5576                 {
5577                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5578                     regnode_ssc this_class;
5579
5580                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5581                     if (data) {
5582                         data_fake.whilem_c = data->whilem_c;
5583                         data_fake.last_closep = data->last_closep;
5584                     }
5585                     else
5586                         data_fake.last_closep = &fake;
5587                     data_fake.pos_delta = delta;
5588                     if (flags & SCF_DO_STCLASS) {
5589                         ssc_init(pRExC_state, &this_class);
5590                         data_fake.start_class = &this_class;
5591                         f = SCF_DO_STCLASS_AND;
5592                     }
5593                     if (flags & SCF_WHILEM_VISITED_POS)
5594                         f |= SCF_WHILEM_VISITED_POS;
5595
5596                     if (trie->jump[word]) {
5597                         if (!nextbranch)
5598                             nextbranch = trie_node + trie->jump[0];
5599                         scan= trie_node + trie->jump[word];
5600                         /* We go from the jump point to the branch that follows
5601                            it. Note this means we need the vestigal unused
5602                            branches even though they arent otherwise used. */
5603                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5604                             &deltanext, (regnode *)nextbranch, &data_fake,
5605                             stopparen, recursed_depth, NULL, f,depth+1);
5606                     }
5607                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5608                         nextbranch= regnext((regnode*)nextbranch);
5609
5610                     if (min1 > (SSize_t)(minnext + trie->minlen))
5611                         min1 = minnext + trie->minlen;
5612                     if (deltanext == SSize_t_MAX) {
5613                         is_inf = is_inf_internal = 1;
5614                         max1 = SSize_t_MAX;
5615                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5616                         max1 = minnext + deltanext + trie->maxlen;
5617
5618                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5619                         pars++;
5620                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5621                         if ( stopmin > min + min1)
5622                             stopmin = min + min1;
5623                         flags &= ~SCF_DO_SUBSTR;
5624                         if (data)
5625                             data->flags |= SCF_SEEN_ACCEPT;
5626                     }
5627                     if (data) {
5628                         if (data_fake.flags & SF_HAS_EVAL)
5629                             data->flags |= SF_HAS_EVAL;
5630                         data->whilem_c = data_fake.whilem_c;
5631                     }
5632                     if (flags & SCF_DO_STCLASS)
5633                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5634                 }
5635             }
5636             if (flags & SCF_DO_SUBSTR) {
5637                 data->pos_min += min1;
5638                 data->pos_delta += max1 - min1;
5639                 if (max1 != min1 || is_inf)
5640                     data->longest = &(data->longest_float);
5641             }
5642             min += min1;
5643             delta += max1 - min1;
5644             if (flags & SCF_DO_STCLASS_OR) {
5645                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5646                 if (min1) {
5647                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5648                     flags &= ~SCF_DO_STCLASS;
5649                 }
5650             }
5651             else if (flags & SCF_DO_STCLASS_AND) {
5652                 if (min1) {
5653                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5654                     flags &= ~SCF_DO_STCLASS;
5655                 }
5656                 else {
5657                     /* Switch to OR mode: cache the old value of
5658                      * data->start_class */
5659                     INIT_AND_WITHP;
5660                     StructCopy(data->start_class, and_withp, regnode_ssc);
5661                     flags &= ~SCF_DO_STCLASS_AND;
5662                     StructCopy(&accum, data->start_class, regnode_ssc);
5663                     flags |= SCF_DO_STCLASS_OR;
5664                 }
5665             }
5666             scan= tail;
5667             continue;
5668         }
5669 #else
5670         else if (PL_regkind[OP(scan)] == TRIE) {
5671             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5672             U8*bang=NULL;
5673
5674             min += trie->minlen;
5675             delta += (trie->maxlen - trie->minlen);
5676             flags &= ~SCF_DO_STCLASS; /* xxx */
5677             if (flags & SCF_DO_SUBSTR) {
5678                 /* Cannot expect anything... */
5679                 scan_commit(pRExC_state, data, minlenp, is_inf);
5680                 data->pos_min += trie->minlen;
5681                 data->pos_delta += (trie->maxlen - trie->minlen);
5682                 if (trie->maxlen != trie->minlen)
5683                     data->longest = &(data->longest_float);
5684             }
5685             if (trie->jump) /* no more substrings -- for now /grr*/
5686                flags &= ~SCF_DO_SUBSTR;
5687         }
5688 #endif /* old or new */
5689 #endif /* TRIE_STUDY_OPT */
5690
5691         /* Else: zero-length, ignore. */
5692         scan = regnext(scan);
5693     }
5694     /* If we are exiting a recursion we can unset its recursed bit
5695      * and allow ourselves to enter it again - no danger of an
5696      * infinite loop there.
5697     if (stopparen > -1 && recursed) {
5698         DEBUG_STUDYDATA("unset:", data,depth);
5699         PAREN_UNSET( recursed, stopparen);
5700     }
5701     */
5702     if (frame) {
5703         depth = depth - 1;
5704
5705         DEBUG_STUDYDATA("frame-end:",data,depth);
5706         DEBUG_PEEP("fend", scan, depth);
5707
5708         /* restore previous context */
5709         last = frame->last_regnode;
5710         scan = frame->next_regnode;
5711         stopparen = frame->stopparen;
5712         recursed_depth = frame->prev_recursed_depth;
5713
5714         RExC_frame_last = frame->prev_frame;
5715         frame = frame->this_prev_frame;
5716         goto fake_study_recurse;
5717     }
5718
5719   finish:
5720     assert(!frame);
5721     DEBUG_STUDYDATA("pre-fin:",data,depth);
5722
5723     *scanp = scan;
5724     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5725
5726     if (flags & SCF_DO_SUBSTR && is_inf)
5727         data->pos_delta = SSize_t_MAX - data->pos_min;
5728     if (is_par > (I32)U8_MAX)
5729         is_par = 0;
5730     if (is_par && pars==1 && data) {
5731         data->flags |= SF_IN_PAR;
5732         data->flags &= ~SF_HAS_PAR;
5733     }
5734     else if (pars && data) {
5735         data->flags |= SF_HAS_PAR;
5736         data->flags &= ~SF_IN_PAR;
5737     }
5738     if (flags & SCF_DO_STCLASS_OR)
5739         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5740     if (flags & SCF_TRIE_RESTUDY)
5741         data->flags |=  SCF_TRIE_RESTUDY;
5742
5743     DEBUG_STUDYDATA("post-fin:",data,depth);
5744
5745     {
5746         SSize_t final_minlen= min < stopmin ? min : stopmin;
5747
5748         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5749             RExC_maxlen = final_minlen + delta;
5750         }
5751         return final_minlen;
5752     }
5753     /* not-reached */
5754 }
5755
5756 STATIC U32
5757 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5758 {
5759     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5760
5761     PERL_ARGS_ASSERT_ADD_DATA;
5762
5763     Renewc(RExC_rxi->data,
5764            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5765            char, struct reg_data);
5766     if(count)
5767         Renew(RExC_rxi->data->what, count + n, U8);
5768     else
5769         Newx(RExC_rxi->data->what, n, U8);
5770     RExC_rxi->data->count = count + n;
5771     Copy(s, RExC_rxi->data->what + count, n, U8);
5772     return count;
5773 }
5774
5775 /*XXX: todo make this not included in a non debugging perl, but appears to be
5776  * used anyway there, in 'use re' */
5777 #ifndef PERL_IN_XSUB_RE
5778 void
5779 Perl_reginitcolors(pTHX)
5780 {
5781     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5782     if (s) {
5783         char *t = savepv(s);
5784         int i = 0;
5785         PL_colors[0] = t;
5786         while (++i < 6) {
5787             t = strchr(t, '\t');
5788             if (t) {
5789                 *t = '\0';
5790                 PL_colors[i] = ++t;
5791             }
5792             else
5793                 PL_colors[i] = t = (char *)"";
5794         }
5795     } else {
5796         int i = 0;
5797         while (i < 6)
5798             PL_colors[i++] = (char *)"";
5799     }
5800     PL_colorset = 1;
5801 }
5802 #endif
5803
5804
5805 #ifdef TRIE_STUDY_OPT
5806 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5807     STMT_START {                                            \
5808         if (                                                \
5809               (data.flags & SCF_TRIE_RESTUDY)               \
5810               && ! restudied++                              \
5811         ) {                                                 \
5812             dOsomething;                                    \
5813             goto reStudy;                                   \
5814         }                                                   \
5815     } STMT_END
5816 #else
5817 #define CHECK_RESTUDY_GOTO_butfirst
5818 #endif
5819
5820 /*
5821  * pregcomp - compile a regular expression into internal code
5822  *
5823  * Decides which engine's compiler to call based on the hint currently in
5824  * scope
5825  */
5826
5827 #ifndef PERL_IN_XSUB_RE
5828
5829 /* return the currently in-scope regex engine (or the default if none)  */
5830
5831 regexp_engine const *
5832 Perl_current_re_engine(pTHX)
5833 {
5834     if (IN_PERL_COMPILETIME) {
5835         HV * const table = GvHV(PL_hintgv);
5836         SV **ptr;
5837
5838         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5839             return &PL_core_reg_engine;
5840         ptr = hv_fetchs(table, "regcomp", FALSE);
5841         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5842             return &PL_core_reg_engine;
5843         return INT2PTR(regexp_engine*,SvIV(*ptr));
5844     }
5845     else {
5846         SV *ptr;
5847         if (!PL_curcop->cop_hints_hash)
5848             return &PL_core_reg_engine;
5849         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5850         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5851             return &PL_core_reg_engine;
5852         return INT2PTR(regexp_engine*,SvIV(ptr));
5853     }
5854 }
5855
5856
5857 REGEXP *
5858 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5859 {
5860     regexp_engine const *eng = current_re_engine();
5861     GET_RE_DEBUG_FLAGS_DECL;
5862
5863     PERL_ARGS_ASSERT_PREGCOMP;
5864
5865     /* Dispatch a request to compile a regexp to correct regexp engine. */
5866     DEBUG_COMPILE_r({
5867         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5868                         PTR2UV(eng));
5869     });
5870     return CALLREGCOMP_ENG(eng, pattern, flags);
5871 }
5872 #endif
5873
5874 /* public(ish) entry point for the perl core's own regex compiling code.
5875  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5876  * pattern rather than a list of OPs, and uses the internal engine rather
5877  * than the current one */
5878
5879 REGEXP *
5880 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5881 {
5882     SV *pat = pattern; /* defeat constness! */
5883     PERL_ARGS_ASSERT_RE_COMPILE;
5884     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5885 #ifdef PERL_IN_XSUB_RE
5886                                 &my_reg_engine,
5887 #else
5888                                 &PL_core_reg_engine,
5889 #endif
5890                                 NULL, NULL, rx_flags, 0);
5891 }
5892
5893
5894 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5895  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5896  * point to the realloced string and length.
5897  *
5898  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5899  * stuff added */
5900
5901 static void
5902 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5903                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5904 {
5905     U8 *const src = (U8*)*pat_p;
5906     U8 *dst, *d;
5907     int n=0;
5908     STRLEN s = 0;
5909     bool do_end = 0;
5910     GET_RE_DEBUG_FLAGS_DECL;
5911
5912     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5913         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5914
5915     Newx(dst, *plen_p * 2 + 1, U8);
5916     d = dst;
5917
5918     while (s < *plen_p) {
5919         append_utf8_from_native_byte(src[s], &d);
5920         if (n < num_code_blocks) {
5921             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5922                 pRExC_state->code_blocks[n].start = d - dst - 1;
5923                 assert(*(d - 1) == '(');
5924                 do_end = 1;
5925             }
5926             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5927                 pRExC_state->code_blocks[n].end = d - dst - 1;
5928                 assert(*(d - 1) == ')');
5929                 do_end = 0;
5930                 n++;
5931             }
5932         }
5933         s++;
5934     }
5935     *d = '\0';
5936     *plen_p = d - dst;
5937     *pat_p = (char*) dst;
5938     SAVEFREEPV(*pat_p);
5939     RExC_orig_utf8 = RExC_utf8 = 1;
5940 }
5941
5942
5943
5944 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5945  * while recording any code block indices, and handling overloading,
5946  * nested qr// objects etc.  If pat is null, it will allocate a new
5947  * string, or just return the first arg, if there's only one.
5948  *
5949  * Returns the malloced/updated pat.
5950  * patternp and pat_count is the array of SVs to be concatted;
5951  * oplist is the optional list of ops that generated the SVs;
5952  * recompile_p is a pointer to a boolean that will be set if
5953  *   the regex will need to be recompiled.
5954  * delim, if non-null is an SV that will be inserted between each element
5955  */
5956
5957 static SV*
5958 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5959                 SV *pat, SV ** const patternp, int pat_count,
5960                 OP *oplist, bool *recompile_p, SV *delim)
5961 {
5962     SV **svp;
5963     int n = 0;
5964     bool use_delim = FALSE;
5965     bool alloced = FALSE;
5966
5967     /* if we know we have at least two args, create an empty string,
5968      * then concatenate args to that. For no args, return an empty string */
5969     if (!pat && pat_count != 1) {
5970         pat = newSVpvs("");
5971         SAVEFREESV(pat);
5972         alloced = TRUE;
5973     }
5974
5975     for (svp = patternp; svp < patternp + pat_count; svp++) {
5976         SV *sv;
5977         SV *rx  = NULL;
5978         STRLEN orig_patlen = 0;
5979         bool code = 0;
5980         SV *msv = use_delim ? delim : *svp;
5981         if (!msv) msv = &PL_sv_undef;
5982
5983         /* if we've got a delimiter, we go round the loop twice for each
5984          * svp slot (except the last), using the delimiter the second
5985          * time round */
5986         if (use_delim) {
5987             svp--;
5988             use_delim = FALSE;
5989         }
5990         else if (delim)
5991             use_delim = TRUE;
5992
5993         if (SvTYPE(msv) == SVt_PVAV) {
5994             /* we've encountered an interpolated array within
5995              * the pattern, e.g. /...@a..../. Expand the list of elements,
5996              * then recursively append elements.
5997              * The code in this block is based on S_pushav() */
5998
5999             AV *const av = (AV*)msv;
6000             const SSize_t maxarg = AvFILL(av) + 1;
6001             SV **array;
6002
6003             if (oplist) {
6004                 assert(oplist->op_type == OP_PADAV
6005                     || oplist->op_type == OP_RV2AV);
6006                 oplist = OP_SIBLING(oplist);
6007             }
6008
6009             if (SvRMAGICAL(av)) {
6010                 SSize_t i;
6011
6012                 Newx(array, maxarg, SV*);
6013                 SAVEFREEPV(array);
6014                 for (i=0; i < maxarg; i++) {
6015                     SV ** const svp = av_fetch(av, i, FALSE);
6016                     array[i] = svp ? *svp : &PL_sv_undef;
6017                 }
6018             }
6019             else
6020                 array = AvARRAY(av);
6021
6022             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6023                                 array, maxarg, NULL, recompile_p,
6024                                 /* $" */
6025                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6026
6027             continue;
6028         }
6029
6030
6031         /* we make the assumption here that each op in the list of
6032          * op_siblings maps to one SV pushed onto the stack,
6033          * except for code blocks, with have both an OP_NULL and
6034          * and OP_CONST.
6035          * This allows us to match up the list of SVs against the
6036          * list of OPs to find the next code block.
6037          *
6038          * Note that       PUSHMARK PADSV PADSV ..
6039          * is optimised to
6040          *                 PADRANGE PADSV  PADSV  ..
6041          * so the alignment still works. */
6042
6043         if (oplist) {
6044             if (oplist->op_type == OP_NULL
6045                 && (oplist->op_flags & OPf_SPECIAL))
6046             {
6047                 assert(n < pRExC_state->num_code_blocks);
6048                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6049                 pRExC_state->code_blocks[n].block = oplist;
6050                 pRExC_state->code_blocks[n].src_regex = NULL;
6051                 n++;
6052                 code = 1;
6053                 oplist = OP_SIBLING(oplist); /* skip CONST */
6054                 assert(oplist);
6055             }
6056             oplist = OP_SIBLING(oplist);;
6057         }
6058
6059         /* apply magic and QR overloading to arg */
6060
6061         SvGETMAGIC(msv);
6062         if (SvROK(msv) && SvAMAGIC(msv)) {
6063             SV *sv = AMG_CALLunary(msv, regexp_amg);
6064             if (sv) {
6065                 if (SvROK(sv))
6066                     sv = SvRV(sv);
6067                 if (SvTYPE(sv) != SVt_REGEXP)
6068                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6069                 msv = sv;
6070             }
6071         }
6072
6073         /* try concatenation overload ... */
6074         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6075                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6076         {
6077             sv_setsv(pat, sv);
6078             /* overloading involved: all bets are off over literal
6079              * code. Pretend we haven't seen it */
6080             pRExC_state->num_code_blocks -= n;
6081             n = 0;
6082         }
6083         else  {
6084             /* ... or failing that, try "" overload */
6085             while (SvAMAGIC(msv)
6086                     && (sv = AMG_CALLunary(msv, string_amg))
6087                     && sv != msv
6088                     &&  !(   SvROK(msv)
6089                           && SvROK(sv)
6090                           && SvRV(msv) == SvRV(sv))
6091             ) {
6092                 msv = sv;
6093                 SvGETMAGIC(msv);
6094             }
6095             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6096                 msv = SvRV(msv);
6097
6098             if (pat) {
6099                 /* this is a partially unrolled
6100                  *     sv_catsv_nomg(pat, msv);
6101                  * that allows us to adjust code block indices if
6102                  * needed */
6103                 STRLEN dlen;
6104                 char *dst = SvPV_force_nomg(pat, dlen);
6105                 orig_patlen = dlen;
6106                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6107                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6108                     sv_setpvn(pat, dst, dlen);
6109                     SvUTF8_on(pat);
6110                 }
6111                 sv_catsv_nomg(pat, msv);
6112                 rx = msv;
6113             }
6114             else
6115                 pat = msv;
6116
6117             if (code)
6118                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6119         }
6120
6121         /* extract any code blocks within any embedded qr//'s */
6122         if (rx && SvTYPE(rx) == SVt_REGEXP
6123             && RX_ENGINE((REGEXP*)rx)->op_comp)
6124         {
6125
6126             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6127             if (ri->num_code_blocks) {
6128                 int i;
6129                 /* the presence of an embedded qr// with code means
6130                  * we should always recompile: the text of the
6131                  * qr// may not have changed, but it may be a
6132                  * different closure than last time */
6133                 *recompile_p = 1;
6134                 Renew(pRExC_state->code_blocks,
6135                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6136                     struct reg_code_block);
6137                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6138
6139                 for (i=0; i < ri->num_code_blocks; i++) {
6140                     struct reg_code_block *src, *dst;
6141                     STRLEN offset =  orig_patlen
6142                         + ReANY((REGEXP *)rx)->pre_prefix;
6143                     assert(n < pRExC_state->num_code_blocks);
6144                     src = &ri->code_blocks[i];
6145                     dst = &pRExC_state->code_blocks[n];
6146                     dst->start      = src->start + offset;
6147                     dst->end        = src->end   + offset;
6148                     dst->block      = src->block;
6149                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6150                                             src->src_regex
6151                                                 ? src->src_regex
6152                                                 : (REGEXP*)rx);
6153                     n++;
6154                 }
6155             }
6156         }
6157     }
6158     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6159     if (alloced)
6160         SvSETMAGIC(pat);
6161
6162     return pat;
6163 }
6164
6165
6166
6167 /* see if there are any run-time code blocks in the pattern.
6168  * False positives are allowed */
6169
6170 static bool
6171 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6172                     char *pat, STRLEN plen)
6173 {
6174     int n = 0;
6175     STRLEN s;
6176
6177     PERL_UNUSED_CONTEXT;
6178
6179     for (s = 0; s < plen; s++) {
6180         if (n < pRExC_state->num_code_blocks
6181             && s == pRExC_state->code_blocks[n].start)
6182         {
6183             s = pRExC_state->code_blocks[n].end;
6184             n++;
6185             continue;
6186         }
6187         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6188          * positives here */
6189         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6190             (pat[s+2] == '{'
6191                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6192         )
6193             return 1;
6194     }
6195     return 0;
6196 }
6197
6198 /* Handle run-time code blocks. We will already have compiled any direct
6199  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6200  * copy of it, but with any literal code blocks blanked out and
6201  * appropriate chars escaped; then feed it into
6202  *
6203  *    eval "qr'modified_pattern'"
6204  *
6205  * For example,
6206  *
6207  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6208  *
6209  * becomes
6210  *
6211  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6212  *
6213  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6214  * and merge them with any code blocks of the original regexp.
6215  *
6216  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6217  * instead, just save the qr and return FALSE; this tells our caller that
6218  * the original pattern needs upgrading to utf8.
6219  */
6220
6221 static bool
6222 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6223     char *pat, STRLEN plen)
6224 {
6225     SV *qr;
6226
6227     GET_RE_DEBUG_FLAGS_DECL;
6228
6229     if (pRExC_state->runtime_code_qr) {
6230         /* this is the second time we've been called; this should
6231          * only happen if the main pattern got upgraded to utf8
6232          * during compilation; re-use the qr we compiled first time
6233          * round (which should be utf8 too)
6234          */
6235         qr = pRExC_state->runtime_code_qr;
6236         pRExC_state->runtime_code_qr = NULL;
6237         assert(RExC_utf8 && SvUTF8(qr));
6238     }
6239     else {
6240         int n = 0;
6241         STRLEN s;
6242         char *p, *newpat;
6243         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6244         SV *sv, *qr_ref;
6245         dSP;
6246
6247         /* determine how many extra chars we need for ' and \ escaping */
6248         for (s = 0; s < plen; s++) {
6249             if (pat[s] == '\'' || pat[s] == '\\')
6250                 newlen++;
6251         }
6252
6253         Newx(newpat, newlen, char);
6254         p = newpat;
6255         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6256
6257         for (s = 0; s < plen; s++) {
6258             if (n < pRExC_state->num_code_blocks
6259                 && s == pRExC_state->code_blocks[n].start)
6260             {
6261                 /* blank out literal code block */
6262                 assert(pat[s] == '(');
6263                 while (s <= pRExC_state->code_blocks[n].end) {
6264                     *p++ = '_';
6265                     s++;
6266                 }
6267                 s--;
6268                 n++;
6269                 continue;
6270             }
6271             if (pat[s] == '\'' || pat[s] == '\\')
6272                 *p++ = '\\';
6273             *p++ = pat[s];
6274         }
6275         *p++ = '\'';
6276         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6277             *p++ = 'x';
6278         *p++ = '\0';
6279         DEBUG_COMPILE_r({
6280             PerlIO_printf(Perl_debug_log,
6281                 "%sre-parsing pattern for runtime code:%s %s\n",
6282                 PL_colors[4],PL_colors[5],newpat);
6283         });
6284
6285         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6286         Safefree(newpat);
6287
6288         ENTER;
6289         SAVETMPS;
6290         PUSHSTACKi(PERLSI_REQUIRE);
6291         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6292          * parsing qr''; normally only q'' does this. It also alters
6293          * hints handling */
6294         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6295         SvREFCNT_dec_NN(sv);
6296         SPAGAIN;
6297         qr_ref = POPs;
6298         PUTBACK;
6299         {
6300             SV * const errsv = ERRSV;
6301             if (SvTRUE_NN(errsv))
6302             {
6303                 Safefree(pRExC_state->code_blocks);
6304                 /* use croak_sv ? */
6305                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6306             }
6307         }
6308         assert(SvROK(qr_ref));
6309         qr = SvRV(qr_ref);
6310         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6311         /* the leaving below frees the tmp qr_ref.
6312          * Give qr a life of its own */
6313         SvREFCNT_inc(qr);
6314         POPSTACK;
6315         FREETMPS;
6316         LEAVE;
6317
6318     }
6319
6320     if (!RExC_utf8 && SvUTF8(qr)) {
6321         /* first time through; the pattern got upgraded; save the
6322          * qr for the next time through */
6323         assert(!pRExC_state->runtime_code_qr);
6324         pRExC_state->runtime_code_qr = qr;
6325         return 0;
6326     }
6327
6328
6329     /* extract any code blocks within the returned qr//  */
6330
6331
6332     /* merge the main (r1) and run-time (r2) code blocks into one */
6333     {
6334         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6335         struct reg_code_block *new_block, *dst;
6336         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6337         int i1 = 0, i2 = 0;
6338
6339         if (!r2->num_code_blocks) /* we guessed wrong */
6340         {
6341             SvREFCNT_dec_NN(qr);
6342             return 1;
6343         }
6344
6345         Newx(new_block,
6346             r1->num_code_blocks + r2->num_code_blocks,
6347             struct reg_code_block);
6348         dst = new_block;
6349
6350         while (    i1 < r1->num_code_blocks
6351                 || i2 < r2->num_code_blocks)
6352         {
6353             struct reg_code_block *src;
6354             bool is_qr = 0;
6355
6356             if (i1 == r1->num_code_blocks) {
6357                 src = &r2->code_blocks[i2++];
6358                 is_qr = 1;
6359             }
6360             else if (i2 == r2->num_code_blocks)
6361                 src = &r1->code_blocks[i1++];
6362             else if (  r1->code_blocks[i1].start
6363                      < r2->code_blocks[i2].start)
6364             {
6365                 src = &r1->code_blocks[i1++];
6366                 assert(src->end < r2->code_blocks[i2].start);
6367             }
6368             else {
6369                 assert(  r1->code_blocks[i1].start
6370                        > r2->code_blocks[i2].start);
6371                 src = &r2->code_blocks[i2++];
6372                 is_qr = 1;
6373                 assert(src->end < r1->code_blocks[i1].start);
6374             }
6375
6376             assert(pat[src->start] == '(');
6377             assert(pat[src->end]   == ')');
6378             dst->start      = src->start;
6379             dst->end        = src->end;
6380             dst->block      = src->block;
6381             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6382                                     : src->src_regex;
6383             dst++;
6384         }
6385         r1->num_code_blocks += r2->num_code_blocks;
6386         Safefree(r1->code_blocks);
6387         r1->code_blocks = new_block;
6388     }
6389
6390     SvREFCNT_dec_NN(qr);
6391     return 1;
6392 }
6393
6394
6395 STATIC bool
6396 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6397                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6398                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6399                       STRLEN longest_length, bool eol, bool meol)
6400 {
6401     /* This is the common code for setting up the floating and fixed length
6402      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6403      * as to whether succeeded or not */
6404
6405     I32 t;
6406     SSize_t ml;
6407
6408     if (! (longest_length
6409            || (eol /* Can't have SEOL and MULTI */
6410                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6411           )
6412             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6413         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6414     {
6415         return FALSE;
6416     }
6417
6418     /* copy the information about the longest from the reg_scan_data
6419         over to the program. */
6420     if (SvUTF8(sv_longest)) {
6421         *rx_utf8 = sv_longest;
6422         *rx_substr = NULL;
6423     } else {
6424         *rx_substr = sv_longest;
6425         *rx_utf8 = NULL;
6426     }
6427     /* end_shift is how many chars that must be matched that
6428         follow this item. We calculate it ahead of time as once the
6429         lookbehind offset is added in we lose the ability to correctly
6430         calculate it.*/
6431     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6432     *rx_end_shift = ml - offset
6433         - longest_length + (SvTAIL(sv_longest) != 0)
6434         + lookbehind;
6435
6436     t = (eol/* Can't have SEOL and MULTI */
6437          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6438     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6439
6440     return TRUE;
6441 }
6442
6443 /*
6444  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6445  * regular expression into internal code.
6446  * The pattern may be passed either as:
6447  *    a list of SVs (patternp plus pat_count)
6448  *    a list of OPs (expr)
6449  * If both are passed, the SV list is used, but the OP list indicates
6450  * which SVs are actually pre-compiled code blocks
6451  *
6452  * The SVs in the list have magic and qr overloading applied to them (and
6453  * the list may be modified in-place with replacement SVs in the latter
6454  * case).
6455  *
6456  * If the pattern hasn't changed from old_re, then old_re will be
6457  * returned.
6458  *
6459  * eng is the current engine. If that engine has an op_comp method, then
6460  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6461  * do the initial concatenation of arguments and pass on to the external
6462  * engine.
6463  *
6464  * If is_bare_re is not null, set it to a boolean indicating whether the
6465  * arg list reduced (after overloading) to a single bare regex which has
6466  * been returned (i.e. /$qr/).
6467  *
6468  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6469  *
6470  * pm_flags contains the PMf_* flags, typically based on those from the
6471  * pm_flags field of the related PMOP. Currently we're only interested in
6472  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6473  *
6474  * We can't allocate space until we know how big the compiled form will be,
6475  * but we can't compile it (and thus know how big it is) until we've got a
6476  * place to put the code.  So we cheat:  we compile it twice, once with code
6477  * generation turned off and size counting turned on, and once "for real".
6478  * This also means that we don't allocate space until we are sure that the
6479  * thing really will compile successfully, and we never have to move the
6480  * code and thus invalidate pointers into it.  (Note that it has to be in
6481  * one piece because free() must be able to free it all.) [NB: not true in perl]
6482  *
6483  * Beware that the optimization-preparation code in here knows about some
6484  * of the structure of the compiled regexp.  [I'll say.]
6485  */
6486
6487 REGEXP *
6488 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6489                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6490                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6491 {
6492     REGEXP *rx;
6493     struct regexp *r;
6494     regexp_internal *ri;
6495     STRLEN plen;
6496     char *exp;
6497     regnode *scan;
6498     I32 flags;
6499     SSize_t minlen = 0;
6500     U32 rx_flags;
6501     SV *pat;
6502     SV *code_blocksv = NULL;
6503     SV** new_patternp = patternp;
6504
6505     /* these are all flags - maybe they should be turned
6506      * into a single int with different bit masks */
6507     I32 sawlookahead = 0;
6508     I32 sawplus = 0;
6509     I32 sawopen = 0;
6510     I32 sawminmod = 0;
6511
6512     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6513     bool recompile = 0;
6514     bool runtime_code = 0;
6515     scan_data_t data;
6516     RExC_state_t RExC_state;
6517     RExC_state_t * const pRExC_state = &RExC_state;
6518 #ifdef TRIE_STUDY_OPT
6519     int restudied = 0;
6520     RExC_state_t copyRExC_state;
6521 #endif
6522     GET_RE_DEBUG_FLAGS_DECL;
6523
6524     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6525
6526     DEBUG_r(if (!PL_colorset) reginitcolors());
6527
6528 #ifndef PERL_IN_XSUB_RE
6529     /* Initialize these here instead of as-needed, as is quick and avoids
6530      * having to test them each time otherwise */
6531     if (! PL_AboveLatin1) {
6532         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6533         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6534         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6535         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6536         PL_HasMultiCharFold =
6537                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6538
6539         /* This is calculated here, because the Perl program that generates the
6540          * static global ones doesn't currently have access to
6541          * NUM_ANYOF_CODE_POINTS */
6542         PL_InBitmap = _new_invlist(2);
6543         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6544                                                     NUM_ANYOF_CODE_POINTS - 1);
6545     }
6546 #endif
6547
6548     pRExC_state->code_blocks = NULL;
6549     pRExC_state->num_code_blocks = 0;
6550
6551     if (is_bare_re)
6552         *is_bare_re = FALSE;
6553
6554     if (expr && (expr->op_type == OP_LIST ||
6555                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6556         /* allocate code_blocks if needed */
6557         OP *o;
6558         int ncode = 0;
6559
6560         for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6561             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6562                 ncode++; /* count of DO blocks */
6563         if (ncode) {
6564             pRExC_state->num_code_blocks = ncode;
6565             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6566         }
6567     }
6568
6569     if (!pat_count) {
6570         /* compile-time pattern with just OP_CONSTs and DO blocks */
6571
6572         int n;
6573         OP *o;
6574
6575         /* find how many CONSTs there are */
6576         assert(expr);
6577         n = 0;
6578         if (expr->op_type == OP_CONST)
6579             n = 1;
6580         else
6581             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6582                 if (o->op_type == OP_CONST)
6583                     n++;
6584             }
6585
6586         /* fake up an SV array */
6587
6588         assert(!new_patternp);
6589         Newx(new_patternp, n, SV*);
6590         SAVEFREEPV(new_patternp);
6591         pat_count = n;
6592
6593         n = 0;
6594         if (expr->op_type == OP_CONST)
6595             new_patternp[n] = cSVOPx_sv(expr);
6596         else
6597             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6598                 if (o->op_type == OP_CONST)
6599                     new_patternp[n++] = cSVOPo_sv;
6600             }
6601
6602     }
6603
6604     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6605         "Assembling pattern from %d elements%s\n", pat_count,
6606             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6607
6608     /* set expr to the first arg op */
6609
6610     if (pRExC_state->num_code_blocks
6611          && expr->op_type != OP_CONST)
6612     {
6613             expr = cLISTOPx(expr)->op_first;
6614             assert(   expr->op_type == OP_PUSHMARK
6615                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6616                    || expr->op_type == OP_PADRANGE);
6617             expr = OP_SIBLING(expr);
6618     }
6619
6620     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6621                         expr, &recompile, NULL);
6622
6623     /* handle bare (possibly after overloading) regex: foo =~ $re */
6624     {
6625         SV *re = pat;
6626         if (SvROK(re))
6627             re = SvRV(re);
6628         if (SvTYPE(re) == SVt_REGEXP) {
6629             if (is_bare_re)
6630                 *is_bare_re = TRUE;
6631             SvREFCNT_inc(re);
6632             Safefree(pRExC_state->code_blocks);
6633             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6634                 "Precompiled pattern%s\n",
6635                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6636
6637             return (REGEXP*)re;
6638         }
6639     }
6640
6641     exp = SvPV_nomg(pat, plen);
6642
6643     if (!eng->op_comp) {
6644         if ((SvUTF8(pat) && IN_BYTES)
6645                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6646         {
6647             /* make a temporary copy; either to convert to bytes,
6648              * or to avoid repeating get-magic / overloaded stringify */
6649             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6650                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6651         }
6652         Safefree(pRExC_state->code_blocks);
6653         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6654     }
6655
6656     /* ignore the utf8ness if the pattern is 0 length */
6657     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6658     RExC_uni_semantics = 0;
6659     RExC_contains_locale = 0;
6660     RExC_contains_i = 0;
6661     pRExC_state->runtime_code_qr = NULL;
6662     RExC_frame_head= NULL;
6663     RExC_frame_last= NULL;
6664     RExC_frame_count= 0;
6665
6666     DEBUG_r({
6667         RExC_mysv1= sv_newmortal();
6668         RExC_mysv2= sv_newmortal();
6669     });
6670     DEBUG_COMPILE_r({
6671             SV *dsv= sv_newmortal();
6672             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6673             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6674                           PL_colors[4],PL_colors[5],s);
6675         });
6676
6677   redo_first_pass:
6678     /* we jump here if we upgrade the pattern to utf8 and have to
6679      * recompile */
6680
6681     if ((pm_flags & PMf_USE_RE_EVAL)
6682                 /* this second condition covers the non-regex literal case,
6683                  * i.e.  $foo =~ '(?{})'. */
6684                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6685     )
6686         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6687
6688     /* return old regex if pattern hasn't changed */
6689     /* XXX: note in the below we have to check the flags as well as the
6690      * pattern.
6691      *
6692      * Things get a touch tricky as we have to compare the utf8 flag
6693      * independently from the compile flags.  */
6694
6695     if (   old_re
6696         && !recompile
6697         && !!RX_UTF8(old_re) == !!RExC_utf8
6698         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6699         && RX_PRECOMP(old_re)
6700         && RX_PRELEN(old_re) == plen
6701         && memEQ(RX_PRECOMP(old_re), exp, plen)
6702         && !runtime_code /* with runtime code, always recompile */ )
6703     {
6704         Safefree(pRExC_state->code_blocks);
6705         return old_re;
6706     }
6707
6708     rx_flags = orig_rx_flags;
6709
6710     if (rx_flags & PMf_FOLD) {
6711         RExC_contains_i = 1;
6712     }
6713     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6714
6715         /* Set to use unicode semantics if the pattern is in utf8 and has the
6716          * 'depends' charset specified, as it means unicode when utf8  */
6717         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6718     }
6719
6720     RExC_precomp = exp;
6721     RExC_flags = rx_flags;
6722     RExC_pm_flags = pm_flags;
6723
6724     if (runtime_code) {
6725         if (TAINTING_get && TAINT_get)
6726             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6727
6728         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6729             /* whoops, we have a non-utf8 pattern, whilst run-time code
6730              * got compiled as utf8. Try again with a utf8 pattern */
6731             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6732                                     pRExC_state->num_code_blocks);
6733             goto redo_first_pass;
6734         }
6735     }
6736     assert(!pRExC_state->runtime_code_qr);
6737
6738     RExC_sawback = 0;
6739
6740     RExC_seen = 0;
6741     RExC_maxlen = 0;
6742     RExC_in_lookbehind = 0;
6743     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6744     RExC_extralen = 0;
6745     RExC_override_recoding = 0;
6746     RExC_in_multi_char_class = 0;
6747
6748     /* First pass: determine size, legality. */
6749     RExC_parse = exp;
6750     RExC_start = exp;
6751     RExC_end = exp + plen;
6752     RExC_naughty = 0;
6753     RExC_npar = 1;
6754     RExC_nestroot = 0;
6755     RExC_size = 0L;
6756     RExC_emit = (regnode *) &RExC_emit_dummy;
6757     RExC_whilem_seen = 0;
6758     RExC_open_parens = NULL;
6759     RExC_close_parens = NULL;
6760     RExC_opend = NULL;
6761     RExC_paren_names = NULL;
6762 #ifdef DEBUGGING
6763     RExC_paren_name_list = NULL;
6764 #endif
6765     RExC_recurse = NULL;
6766     RExC_study_chunk_recursed = NULL;
6767     RExC_study_chunk_recursed_bytes= 0;
6768     RExC_recurse_count = 0;
6769     pRExC_state->code_index = 0;
6770
6771 #if 0 /* REGC() is (currently) a NOP at the first pass.
6772        * Clever compilers notice this and complain. --jhi */
6773     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6774 #endif
6775     DEBUG_PARSE_r(
6776         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6777         RExC_lastnum=0;
6778         RExC_lastparse=NULL;
6779     );
6780     /* reg may croak on us, not giving us a chance to free
6781        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6782        need it to survive as long as the regexp (qr/(?{})/).
6783        We must check that code_blocksv is not already set, because we may
6784        have jumped back to restart the sizing pass. */
6785     if (pRExC_state->code_blocks && !code_blocksv) {
6786         code_blocksv = newSV_type(SVt_PV);
6787         SAVEFREESV(code_blocksv);
6788         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6789         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6790     }
6791     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6792         /* It's possible to write a regexp in ascii that represents Unicode
6793         codepoints outside of the byte range, such as via \x{100}. If we
6794         detect such a sequence we have to convert the entire pattern to utf8
6795         and then recompile, as our sizing calculation will have been based
6796         on 1 byte == 1 character, but we will need to use utf8 to encode
6797         at least some part of the pattern, and therefore must convert the whole
6798         thing.
6799         -- dmq */
6800         if (flags & RESTART_UTF8) {
6801             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6802                                     pRExC_state->num_code_blocks);
6803             goto redo_first_pass;
6804         }
6805         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6806     }
6807     if (code_blocksv)
6808         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6809
6810     DEBUG_PARSE_r({
6811         PerlIO_printf(Perl_debug_log,
6812             "Required size %"IVdf" nodes\n"
6813             "Starting second pass (creation)\n",
6814             (IV)RExC_size);
6815         RExC_lastnum=0;
6816         RExC_lastparse=NULL;
6817     });
6818
6819     /* The first pass could have found things that force Unicode semantics */
6820     if ((RExC_utf8 || RExC_uni_semantics)
6821          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6822     {
6823         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6824     }
6825
6826     /* Small enough for pointer-storage convention?
6827        If extralen==0, this means that we will not need long jumps. */
6828     if (RExC_size >= 0x10000L && RExC_extralen)
6829         RExC_size += RExC_extralen;
6830     else
6831         RExC_extralen = 0;
6832     if (RExC_whilem_seen > 15)
6833         RExC_whilem_seen = 15;
6834
6835     /* Allocate space and zero-initialize. Note, the two step process
6836        of zeroing when in debug mode, thus anything assigned has to
6837        happen after that */
6838     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6839     r = ReANY(rx);
6840     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6841          char, regexp_internal);
6842     if ( r == NULL || ri == NULL )
6843         FAIL("Regexp out of space");
6844 #ifdef DEBUGGING
6845     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6846     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6847          char);
6848 #else
6849     /* bulk initialize base fields with 0. */
6850     Zero(ri, sizeof(regexp_internal), char);
6851 #endif
6852
6853     /* non-zero initialization begins here */
6854     RXi_SET( r, ri );
6855     r->engine= eng;
6856     r->extflags = rx_flags;
6857     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6858
6859     if (pm_flags & PMf_IS_QR) {
6860         ri->code_blocks = pRExC_state->code_blocks;
6861         ri->num_code_blocks = pRExC_state->num_code_blocks;
6862     }
6863     else
6864     {
6865         int n;
6866         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6867             if (pRExC_state->code_blocks[n].src_regex)
6868                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6869         SAVEFREEPV(pRExC_state->code_blocks);
6870     }
6871
6872     {
6873         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6874         bool has_charset = (get_regex_charset(r->extflags)
6875                                                     != REGEX_DEPENDS_CHARSET);
6876
6877         /* The caret is output if there are any defaults: if not all the STD
6878          * flags are set, or if no character set specifier is needed */
6879         bool has_default =
6880                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6881                     || ! has_charset);
6882         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6883                                                    == REG_RUN_ON_COMMENT_SEEN);
6884         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6885                             >> RXf_PMf_STD_PMMOD_SHIFT);
6886         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6887         char *p;
6888         /* Allocate for the worst case, which is all the std flags are turned
6889          * on.  If more precision is desired, we could do a population count of
6890          * the flags set.  This could be done with a small lookup table, or by
6891          * shifting, masking and adding, or even, when available, assembly
6892          * language for a machine-language population count.
6893          * We never output a minus, as all those are defaults, so are
6894          * covered by the caret */
6895         const STRLEN wraplen = plen + has_p + has_runon
6896             + has_default       /* If needs a caret */
6897
6898                 /* If needs a character set specifier */
6899             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6900             + (sizeof(STD_PAT_MODS) - 1)
6901             + (sizeof("(?:)") - 1);
6902
6903         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6904         r->xpv_len_u.xpvlenu_pv = p;
6905         if (RExC_utf8)
6906             SvFLAGS(rx) |= SVf_UTF8;
6907         *p++='('; *p++='?';
6908
6909         /* If a default, cover it using the caret */
6910         if (has_default) {
6911             *p++= DEFAULT_PAT_MOD;
6912         }
6913         if (has_charset) {
6914             STRLEN len;
6915             const char* const name = get_regex_charset_name(r->extflags, &len);
6916             Copy(name, p, len, char);
6917             p += len;
6918         }
6919         if (has_p)
6920             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6921         {
6922             char ch;
6923             while((ch = *fptr++)) {
6924                 if(reganch & 1)
6925                     *p++ = ch;
6926                 reganch >>= 1;
6927             }
6928         }
6929
6930         *p++ = ':';
6931         Copy(RExC_precomp, p, plen, char);
6932         assert ((RX_WRAPPED(rx) - p) < 16);
6933         r->pre_prefix = p - RX_WRAPPED(rx);
6934         p += plen;
6935         if (has_runon)
6936             *p++ = '\n';
6937         *p++ = ')';
6938         *p = 0;
6939         SvCUR_set(rx, p - RX_WRAPPED(rx));
6940     }
6941
6942     r->intflags = 0;
6943     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6944
6945     /* setup various meta data about recursion, this all requires
6946      * RExC_npar to be correctly set, and a bit later on we clear it */
6947     if (RExC_seen & REG_RECURSE_SEEN) {
6948         Newxz(RExC_open_parens, RExC_npar,regnode *);
6949         SAVEFREEPV(RExC_open_parens);
6950         Newxz(RExC_close_parens,RExC_npar,regnode *);
6951         SAVEFREEPV(RExC_close_parens);
6952     }
6953     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6954         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6955          * So its 1 if there are no parens. */
6956         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6957                                          ((RExC_npar & 0x07) != 0);
6958         Newx(RExC_study_chunk_recursed,
6959              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6960         SAVEFREEPV(RExC_study_chunk_recursed);
6961     }
6962
6963     /* Useful during FAIL. */
6964 #ifdef RE_TRACK_PATTERN_OFFSETS
6965     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6966     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6967                           "%s %"UVuf" bytes for offset annotations.\n",
6968                           ri->u.offsets ? "Got" : "Couldn't get",
6969                           (UV)((2*RExC_size+1) * sizeof(U32))));
6970 #endif
6971     SetProgLen(ri,RExC_size);
6972     RExC_rx_sv = rx;
6973     RExC_rx = r;
6974     RExC_rxi = ri;
6975
6976     /* Second pass: emit code. */
6977     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6978     RExC_pm_flags = pm_flags;
6979     RExC_parse = exp;
6980     RExC_end = exp + plen;
6981     RExC_naughty = 0;
6982     RExC_npar = 1;
6983     RExC_emit_start = ri->program;
6984     RExC_emit = ri->program;
6985     RExC_emit_bound = ri->program + RExC_size + 1;
6986     pRExC_state->code_index = 0;
6987
6988     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6989     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6990         ReREFCNT_dec(rx);
6991         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6992     }
6993     /* XXXX To minimize changes to RE engine we always allocate
6994        3-units-long substrs field. */
6995     Newx(r->substrs, 1, struct reg_substr_data);
6996     if (RExC_recurse_count) {
6997         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6998         SAVEFREEPV(RExC_recurse);
6999     }
7000
7001 reStudy:
7002     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7003     DEBUG_r(
7004         RExC_study_chunk_recursed_count= 0;
7005     );
7006     Zero(r->substrs, 1, struct reg_substr_data);
7007     if (RExC_study_chunk_recursed) {
7008         Zero(RExC_study_chunk_recursed,
7009              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7010     }
7011
7012
7013 #ifdef TRIE_STUDY_OPT
7014     if (!restudied) {
7015         StructCopy(&zero_scan_data, &data, scan_data_t);
7016         copyRExC_state = RExC_state;
7017     } else {
7018         U32 seen=RExC_seen;
7019         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7020
7021         RExC_state = copyRExC_state;
7022         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7023             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7024         else
7025             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7026         StructCopy(&zero_scan_data, &data, scan_data_t);
7027     }
7028 #else
7029     StructCopy(&zero_scan_data, &data, scan_data_t);
7030 #endif
7031
7032     /* Dig out information for optimizations. */
7033     r->extflags = RExC_flags; /* was pm_op */
7034     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7035
7036     if (UTF)
7037         SvUTF8_on(rx);  /* Unicode in it? */
7038     ri->regstclass = NULL;
7039     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
7040         r->intflags |= PREGf_NAUGHTY;
7041     scan = ri->program + 1;             /* First BRANCH. */
7042
7043     /* testing for BRANCH here tells us whether there is "must appear"
7044        data in the pattern. If there is then we can use it for optimisations */
7045     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7046                                                   */
7047         SSize_t fake;
7048         STRLEN longest_float_length, longest_fixed_length;
7049         regnode_ssc ch_class; /* pointed to by data */
7050         int stclass_flag;
7051         SSize_t last_close = 0; /* pointed to by data */
7052         regnode *first= scan;
7053         regnode *first_next= regnext(first);
7054         /*
7055          * Skip introductions and multiplicators >= 1
7056          * so that we can extract the 'meat' of the pattern that must
7057          * match in the large if() sequence following.
7058          * NOTE that EXACT is NOT covered here, as it is normally
7059          * picked up by the optimiser separately.
7060          *
7061          * This is unfortunate as the optimiser isnt handling lookahead
7062          * properly currently.
7063          *
7064          */
7065         while ((OP(first) == OPEN && (sawopen = 1)) ||
7066                /* An OR of *one* alternative - should not happen now. */
7067             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7068             /* for now we can't handle lookbehind IFMATCH*/
7069             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7070             (OP(first) == PLUS) ||
7071             (OP(first) == MINMOD) ||
7072                /* An {n,m} with n>0 */
7073             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7074             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7075         {
7076                 /*
7077                  * the only op that could be a regnode is PLUS, all the rest
7078                  * will be regnode_1 or regnode_2.
7079                  *
7080                  * (yves doesn't think this is true)
7081                  */
7082                 if (OP(first) == PLUS)
7083                     sawplus = 1;
7084                 else {
7085                     if (OP(first) == MINMOD)
7086                         sawminmod = 1;
7087                     first += regarglen[OP(first)];
7088                 }
7089                 first = NEXTOPER(first);
7090                 first_next= regnext(first);
7091         }
7092
7093         /* Starting-point info. */
7094       again:
7095         DEBUG_PEEP("first:",first,0);
7096         /* Ignore EXACT as we deal with it later. */
7097         if (PL_regkind[OP(first)] == EXACT) {
7098             if (OP(first) == EXACT)
7099                 NOOP;   /* Empty, get anchored substr later. */
7100             else
7101                 ri->regstclass = first;
7102         }
7103 #ifdef TRIE_STCLASS
7104         else if (PL_regkind[OP(first)] == TRIE &&
7105                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7106         {
7107             /* this can happen only on restudy */
7108             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7109         }
7110 #endif
7111         else if (REGNODE_SIMPLE(OP(first)))
7112             ri->regstclass = first;
7113         else if (PL_regkind[OP(first)] == BOUND ||
7114                  PL_regkind[OP(first)] == NBOUND)
7115             ri->regstclass = first;
7116         else if (PL_regkind[OP(first)] == BOL) {
7117             r->intflags |= (OP(first) == MBOL
7118                            ? PREGf_ANCH_MBOL
7119                            : PREGf_ANCH_SBOL);
7120             first = NEXTOPER(first);
7121             goto again;
7122         }
7123         else if (OP(first) == GPOS) {
7124             r->intflags |= PREGf_ANCH_GPOS;
7125             first = NEXTOPER(first);
7126             goto again;
7127         }
7128         else if ((!sawopen || !RExC_sawback) &&
7129             !sawlookahead &&
7130             (OP(first) == STAR &&
7131             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7132             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7133         {
7134             /* turn .* into ^.* with an implied $*=1 */
7135             const int type =
7136                 (OP(NEXTOPER(first)) == REG_ANY)
7137                     ? PREGf_ANCH_MBOL
7138                     : PREGf_ANCH_SBOL;
7139             r->intflags |= (type | PREGf_IMPLICIT);
7140             first = NEXTOPER(first);
7141             goto again;
7142         }
7143         if (sawplus && !sawminmod && !sawlookahead
7144             && (!sawopen || !RExC_sawback)
7145             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7146             /* x+ must match at the 1st pos of run of x's */
7147             r->intflags |= PREGf_SKIP;
7148
7149         /* Scan is after the zeroth branch, first is atomic matcher. */
7150 #ifdef TRIE_STUDY_OPT
7151         DEBUG_PARSE_r(
7152             if (!restudied)
7153                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7154                               (IV)(first - scan + 1))
7155         );
7156 #else
7157         DEBUG_PARSE_r(
7158             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7159                 (IV)(first - scan + 1))
7160         );
7161 #endif
7162
7163
7164         /*
7165         * If there's something expensive in the r.e., find the
7166         * longest literal string that must appear and make it the
7167         * regmust.  Resolve ties in favor of later strings, since
7168         * the regstart check works with the beginning of the r.e.
7169         * and avoiding duplication strengthens checking.  Not a
7170         * strong reason, but sufficient in the absence of others.
7171         * [Now we resolve ties in favor of the earlier string if
7172         * it happens that c_offset_min has been invalidated, since the
7173         * earlier string may buy us something the later one won't.]
7174         */
7175
7176         data.longest_fixed = newSVpvs("");
7177         data.longest_float = newSVpvs("");
7178         data.last_found = newSVpvs("");
7179         data.longest = &(data.longest_fixed);
7180         ENTER_with_name("study_chunk");
7181         SAVEFREESV(data.longest_fixed);
7182         SAVEFREESV(data.longest_float);
7183         SAVEFREESV(data.last_found);
7184         first = scan;
7185         if (!ri->regstclass) {
7186             ssc_init(pRExC_state, &ch_class);
7187             data.start_class = &ch_class;
7188             stclass_flag = SCF_DO_STCLASS_AND;
7189         } else                          /* XXXX Check for BOUND? */
7190             stclass_flag = 0;
7191         data.last_closep = &last_close;
7192
7193         DEBUG_RExC_seen();
7194         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7195                              scan + RExC_size, /* Up to end */
7196             &data, -1, 0, NULL,
7197             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7198                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7199             0);
7200
7201
7202         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7203
7204
7205         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7206              && data.last_start_min == 0 && data.last_end > 0
7207              && !RExC_seen_zerolen
7208              && !(RExC_seen & REG_VERBARG_SEEN)
7209              && !(RExC_seen & REG_GPOS_SEEN)
7210         ){
7211             r->extflags |= RXf_CHECK_ALL;
7212         }
7213         scan_commit(pRExC_state, &data,&minlen,0);
7214
7215         longest_float_length = CHR_SVLEN(data.longest_float);
7216
7217         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7218                    && data.offset_fixed == data.offset_float_min
7219                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7220             && S_setup_longest (aTHX_ pRExC_state,
7221                                     data.longest_float,
7222                                     &(r->float_utf8),
7223                                     &(r->float_substr),
7224                                     &(r->float_end_shift),
7225                                     data.lookbehind_float,
7226                                     data.offset_float_min,
7227                                     data.minlen_float,
7228                                     longest_float_length,
7229                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7230                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7231         {
7232             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7233             r->float_max_offset = data.offset_float_max;
7234             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7235                 r->float_max_offset -= data.lookbehind_float;
7236             SvREFCNT_inc_simple_void_NN(data.longest_float);
7237         }
7238         else {
7239             r->float_substr = r->float_utf8 = NULL;
7240             longest_float_length = 0;
7241         }
7242
7243         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7244
7245         if (S_setup_longest (aTHX_ pRExC_state,
7246                                 data.longest_fixed,
7247                                 &(r->anchored_utf8),
7248                                 &(r->anchored_substr),
7249                                 &(r->anchored_end_shift),
7250                                 data.lookbehind_fixed,
7251                                 data.offset_fixed,
7252                                 data.minlen_fixed,
7253                                 longest_fixed_length,
7254                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7255                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7256         {
7257             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7258             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7259         }
7260         else {
7261             r->anchored_substr = r->anchored_utf8 = NULL;
7262             longest_fixed_length = 0;
7263         }
7264         LEAVE_with_name("study_chunk");
7265
7266         if (ri->regstclass
7267             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7268             ri->regstclass = NULL;
7269
7270         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7271             && stclass_flag
7272             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7273             && is_ssc_worth_it(pRExC_state, data.start_class))
7274         {
7275             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7276
7277             ssc_finalize(pRExC_state, data.start_class);
7278
7279             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7280             StructCopy(data.start_class,
7281                        (regnode_ssc*)RExC_rxi->data->data[n],
7282                        regnode_ssc);
7283             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7284             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7285             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7286                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7287                       PerlIO_printf(Perl_debug_log,
7288                                     "synthetic stclass \"%s\".\n",
7289                                     SvPVX_const(sv));});
7290             data.start_class = NULL;
7291         }
7292
7293         /* A temporary algorithm prefers floated substr to fixed one to dig
7294          * more info. */
7295         if (longest_fixed_length > longest_float_length) {
7296             r->substrs->check_ix = 0;
7297             r->check_end_shift = r->anchored_end_shift;
7298             r->check_substr = r->anchored_substr;
7299             r->check_utf8 = r->anchored_utf8;
7300             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7301             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7302                 r->intflags |= PREGf_NOSCAN;
7303         }
7304         else {
7305             r->substrs->check_ix = 1;
7306             r->check_end_shift = r->float_end_shift;
7307             r->check_substr = r->float_substr;
7308             r->check_utf8 = r->float_utf8;
7309             r->check_offset_min = r->float_min_offset;
7310             r->check_offset_max = r->float_max_offset;
7311         }
7312         if ((r->check_substr || r->check_utf8) ) {
7313             r->extflags |= RXf_USE_INTUIT;
7314             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7315                 r->extflags |= RXf_INTUIT_TAIL;
7316         }
7317         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7318
7319         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7320         if ( (STRLEN)minlen < longest_float_length )
7321             minlen= longest_float_length;
7322         if ( (STRLEN)minlen < longest_fixed_length )
7323             minlen= longest_fixed_length;
7324         */
7325     }
7326     else {
7327         /* Several toplevels. Best we can is to set minlen. */
7328         SSize_t fake;
7329         regnode_ssc ch_class;
7330         SSize_t last_close = 0;
7331
7332         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7333
7334         scan = ri->program + 1;
7335         ssc_init(pRExC_state, &ch_class);
7336         data.start_class = &ch_class;
7337         data.last_closep = &last_close;
7338
7339         DEBUG_RExC_seen();
7340         minlen = study_chunk(pRExC_state,
7341             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7342             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7343                                                       ? SCF_TRIE_DOING_RESTUDY
7344                                                       : 0),
7345             0);
7346
7347         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7348
7349         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7350                 = r->float_substr = r->float_utf8 = NULL;
7351
7352         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7353             && is_ssc_worth_it(pRExC_state, data.start_class))
7354         {
7355             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7356
7357             ssc_finalize(pRExC_state, data.start_class);
7358
7359             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7360             StructCopy(data.start_class,
7361                        (regnode_ssc*)RExC_rxi->data->data[n],
7362                        regnode_ssc);
7363             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7364             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7365             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7366                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7367                       PerlIO_printf(Perl_debug_log,
7368                                     "synthetic stclass \"%s\".\n",
7369                                     SvPVX_const(sv));});
7370             data.start_class = NULL;
7371         }
7372     }
7373
7374     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7375         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7376         r->maxlen = REG_INFTY;
7377     }
7378     else {
7379         r->maxlen = RExC_maxlen;
7380     }
7381
7382     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7383        the "real" pattern. */
7384     DEBUG_OPTIMISE_r({
7385         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7386                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7387     });
7388     r->minlenret = minlen;
7389     if (r->minlen < minlen)
7390         r->minlen = minlen;
7391
7392     if (RExC_seen & REG_GPOS_SEEN)
7393         r->intflags |= PREGf_GPOS_SEEN;
7394     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7395         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7396                                                 lookbehind */
7397     if (pRExC_state->num_code_blocks)
7398         r->extflags |= RXf_EVAL_SEEN;
7399     if (RExC_seen & REG_CANY_SEEN)
7400         r->intflags |= PREGf_CANY_SEEN;
7401     if (RExC_seen & REG_VERBARG_SEEN)
7402     {
7403         r->intflags |= PREGf_VERBARG_SEEN;
7404         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7405     }
7406     if (RExC_seen & REG_CUTGROUP_SEEN)
7407         r->intflags |= PREGf_CUTGROUP_SEEN;
7408     if (pm_flags & PMf_USE_RE_EVAL)
7409         r->intflags |= PREGf_USE_RE_EVAL;
7410     if (RExC_paren_names)
7411         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7412     else
7413         RXp_PAREN_NAMES(r) = NULL;
7414
7415     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7416      * so it can be used in pp.c */
7417     if (r->intflags & PREGf_ANCH)
7418         r->extflags |= RXf_IS_ANCHORED;
7419
7420
7421     {
7422         /* this is used to identify "special" patterns that might result
7423          * in Perl NOT calling the regex engine and instead doing the match "itself",
7424          * particularly special cases in split//. By having the regex compiler
7425          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7426          * we avoid weird issues with equivalent patterns resulting in different behavior,
7427          * AND we allow non Perl engines to get the same optimizations by the setting the
7428          * flags appropriately - Yves */
7429         regnode *first = ri->program + 1;
7430         U8 fop = OP(first);
7431         regnode *next = NEXTOPER(first);
7432         U8 nop = OP(next);
7433
7434         if (PL_regkind[fop] == NOTHING && nop == END)
7435             r->extflags |= RXf_NULL;
7436         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7437             /* when fop is SBOL first->flags will be true only when it was
7438              * produced by parsing /\A/, and not when parsing /^/. This is
7439              * very important for the split code as there we want to
7440              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7441              * See rt #122761 for more details. -- Yves */
7442             r->extflags |= RXf_START_ONLY;
7443         else if (fop == PLUS
7444                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7445                  && OP(regnext(first)) == END)
7446             r->extflags |= RXf_WHITE;
7447         else if ( r->extflags & RXf_SPLIT
7448                   && fop == EXACT
7449                   && STR_LEN(first) == 1
7450                   && *(STRING(first)) == ' '
7451                   && OP(regnext(first)) == END )
7452             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7453
7454     }
7455
7456     if (RExC_contains_locale) {
7457         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7458     }
7459
7460 #ifdef DEBUGGING
7461     if (RExC_paren_names) {
7462         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7463         ri->data->data[ri->name_list_idx]
7464                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7465     } else
7466 #endif
7467         ri->name_list_idx = 0;
7468
7469     if (RExC_recurse_count) {
7470         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7471             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7472             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7473         }
7474     }
7475     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7476     /* assume we don't need to swap parens around before we match */
7477     DEBUG_TEST_r({
7478         PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7479             (unsigned long)RExC_study_chunk_recursed_count);
7480     });
7481     DEBUG_DUMP_r({
7482         DEBUG_RExC_seen();
7483         PerlIO_printf(Perl_debug_log,"Final program:\n");
7484         regdump(r);
7485     });
7486 #ifdef RE_TRACK_PATTERN_OFFSETS
7487     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7488         const STRLEN len = ri->u.offsets[0];
7489         STRLEN i;
7490         GET_RE_DEBUG_FLAGS_DECL;
7491         PerlIO_printf(Perl_debug_log,
7492                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7493         for (i = 1; i <= len; i++) {
7494             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7495                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7496                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7497             }
7498         PerlIO_printf(Perl_debug_log, "\n");
7499     });
7500 #endif
7501
7502 #ifdef USE_ITHREADS
7503     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7504      * by setting the regexp SV to readonly-only instead. If the
7505      * pattern's been recompiled, the USEDness should remain. */
7506     if (old_re && SvREADONLY(old_re))
7507         SvREADONLY_on(rx);
7508 #endif
7509     return rx;
7510 }
7511
7512
7513 SV*
7514 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7515                     const U32 flags)
7516 {
7517     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7518
7519     PERL_UNUSED_ARG(value);
7520
7521     if (flags & RXapif_FETCH) {
7522         return reg_named_buff_fetch(rx, key, flags);
7523     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7524         Perl_croak_no_modify();
7525         return NULL;
7526     } else if (flags & RXapif_EXISTS) {
7527         return reg_named_buff_exists(rx, key, flags)
7528             ? &PL_sv_yes
7529             : &PL_sv_no;
7530     } else if (flags & RXapif_REGNAMES) {
7531         return reg_named_buff_all(rx, flags);
7532     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7533         return reg_named_buff_scalar(rx, flags);
7534     } else {
7535         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7536         return NULL;
7537     }
7538 }
7539
7540 SV*
7541 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7542                          const U32 flags)
7543 {
7544     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7545     PERL_UNUSED_ARG(lastkey);
7546
7547     if (flags & RXapif_FIRSTKEY)
7548         return reg_named_buff_firstkey(rx, flags);
7549     else if (flags & RXapif_NEXTKEY)
7550         return reg_named_buff_nextkey(rx, flags);
7551     else {
7552         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7553                                             (int)flags);
7554         return NULL;
7555     }
7556 }
7557
7558 SV*
7559 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7560                           const U32 flags)
7561 {
7562     AV *retarray = NULL;
7563     SV *ret;
7564     struct regexp *const rx = ReANY(r);
7565
7566     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7567
7568     if (flags & RXapif_ALL)
7569         retarray=newAV();
7570
7571     if (rx && RXp_PAREN_NAMES(rx)) {
7572         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7573         if (he_str) {
7574             IV i;
7575             SV* sv_dat=HeVAL(he_str);
7576             I32 *nums=(I32*)SvPVX(sv_dat);
7577             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7578                 if ((I32)(rx->nparens) >= nums[i]
7579                     && rx->offs[nums[i]].start != -1
7580                     && rx->offs[nums[i]].end != -1)
7581                 {
7582                     ret = newSVpvs("");
7583                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7584                     if (!retarray)
7585                         return ret;
7586                 } else {
7587                     if (retarray)
7588                         ret = newSVsv(&PL_sv_undef);
7589                 }
7590                 if (retarray)
7591                     av_push(retarray, ret);
7592             }
7593             if (retarray)
7594                 return newRV_noinc(MUTABLE_SV(retarray));
7595         }
7596     }
7597     return NULL;
7598 }
7599
7600 bool
7601 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7602                            const U32 flags)
7603 {
7604     struct regexp *const rx = ReANY(r);
7605
7606     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7607
7608     if (rx && RXp_PAREN_NAMES(rx)) {
7609         if (flags & RXapif_ALL) {
7610             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7611         } else {
7612             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7613             if (sv) {
7614                 SvREFCNT_dec_NN(sv);
7615                 return TRUE;
7616             } else {
7617                 return FALSE;
7618             }
7619         }
7620     } else {
7621         return FALSE;
7622     }
7623 }
7624
7625 SV*
7626 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7627 {
7628     struct regexp *const rx = ReANY(r);
7629
7630     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7631
7632     if ( rx && RXp_PAREN_NAMES(rx) ) {
7633         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7634
7635         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7636     } else {
7637         return FALSE;
7638     }
7639 }
7640
7641 SV*
7642 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7643 {
7644     struct regexp *const rx = ReANY(r);
7645     GET_RE_DEBUG_FLAGS_DECL;
7646
7647     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7648
7649     if (rx && RXp_PAREN_NAMES(rx)) {
7650         HV *hv = RXp_PAREN_NAMES(rx);
7651         HE *temphe;
7652         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7653             IV i;
7654             IV parno = 0;
7655             SV* sv_dat = HeVAL(temphe);
7656             I32 *nums = (I32*)SvPVX(sv_dat);
7657             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7658                 if ((I32)(rx->lastparen) >= nums[i] &&
7659                     rx->offs[nums[i]].start != -1 &&
7660                     rx->offs[nums[i]].end != -1)
7661                 {
7662                     parno = nums[i];
7663                     break;
7664                 }
7665             }
7666             if (parno || flags & RXapif_ALL) {
7667                 return newSVhek(HeKEY_hek(temphe));
7668             }
7669         }
7670     }
7671     return NULL;
7672 }
7673
7674 SV*
7675 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7676 {
7677     SV *ret;
7678     AV *av;
7679     SSize_t length;
7680     struct regexp *const rx = ReANY(r);
7681
7682     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7683
7684     if (rx && RXp_PAREN_NAMES(rx)) {
7685         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7686             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7687         } else if (flags & RXapif_ONE) {
7688             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7689             av = MUTABLE_AV(SvRV(ret));
7690             length = av_tindex(av);
7691             SvREFCNT_dec_NN(ret);
7692             return newSViv(length + 1);
7693         } else {
7694             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7695                                                 (int)flags);
7696             return NULL;
7697         }
7698     }
7699     return &PL_sv_undef;
7700 }
7701
7702 SV*
7703 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7704 {
7705     struct regexp *const rx = ReANY(r);
7706     AV *av = newAV();
7707
7708     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7709
7710     if (rx && RXp_PAREN_NAMES(rx)) {
7711         HV *hv= RXp_PAREN_NAMES(rx);
7712         HE *temphe;
7713         (void)hv_iterinit(hv);
7714         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7715             IV i;
7716             IV parno = 0;
7717             SV* sv_dat = HeVAL(temphe);
7718             I32 *nums = (I32*)SvPVX(sv_dat);
7719             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7720                 if ((I32)(rx->lastparen) >= nums[i] &&
7721                     rx->offs[nums[i]].start != -1 &&
7722                     rx->offs[nums[i]].end != -1)
7723                 {
7724                     parno = nums[i];
7725                     break;
7726                 }
7727             }
7728             if (parno || flags & RXapif_ALL) {
7729                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7730             }
7731         }
7732     }
7733
7734     return newRV_noinc(MUTABLE_SV(av));
7735 }
7736
7737 void
7738 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7739                              SV * const sv)
7740 {
7741     struct regexp *const rx = ReANY(r);
7742     char *s = NULL;
7743     SSize_t i = 0;
7744     SSize_t s1, t1;
7745     I32 n = paren;
7746
7747     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7748
7749     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7750            || n == RX_BUFF_IDX_CARET_FULLMATCH
7751            || n == RX_BUFF_IDX_CARET_POSTMATCH
7752        )
7753     {
7754         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7755         if (!keepcopy) {
7756             /* on something like
7757              *    $r = qr/.../;
7758              *    /$qr/p;
7759              * the KEEPCOPY is set on the PMOP rather than the regex */
7760             if (PL_curpm && r == PM_GETRE(PL_curpm))
7761                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7762         }
7763         if (!keepcopy)
7764             goto ret_undef;
7765     }
7766
7767     if (!rx->subbeg)
7768         goto ret_undef;
7769
7770     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7771         /* no need to distinguish between them any more */
7772         n = RX_BUFF_IDX_FULLMATCH;
7773
7774     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7775         && rx->offs[0].start != -1)
7776     {
7777         /* $`, ${^PREMATCH} */
7778         i = rx->offs[0].start;
7779         s = rx->subbeg;
7780     }
7781     else
7782     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7783         && rx->offs[0].end != -1)
7784     {
7785         /* $', ${^POSTMATCH} */
7786         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7787         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7788     }
7789     else
7790     if ( 0 <= n && n <= (I32)rx->nparens &&
7791         (s1 = rx->offs[n].start) != -1 &&
7792         (t1 = rx->offs[n].end) != -1)
7793     {
7794         /* $&, ${^MATCH},  $1 ... */
7795         i = t1 - s1;
7796         s = rx->subbeg + s1 - rx->suboffset;
7797     } else {
7798         goto ret_undef;
7799     }
7800
7801     assert(s >= rx->subbeg);
7802     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7803     if (i >= 0) {
7804 #ifdef NO_TAINT_SUPPORT
7805         sv_setpvn(sv, s, i);
7806 #else
7807         const int oldtainted = TAINT_get;
7808         TAINT_NOT;
7809         sv_setpvn(sv, s, i);
7810         TAINT_set(oldtainted);
7811 #endif
7812         if ( (rx->intflags & PREGf_CANY_SEEN)
7813             ? (RXp_MATCH_UTF8(rx)
7814                         && (!i || is_utf8_string((U8*)s, i)))
7815             : (RXp_MATCH_UTF8(rx)) )
7816         {
7817             SvUTF8_on(sv);
7818         }
7819         else
7820             SvUTF8_off(sv);
7821         if (TAINTING_get) {
7822             if (RXp_MATCH_TAINTED(rx)) {
7823                 if (SvTYPE(sv) >= SVt_PVMG) {
7824                     MAGIC* const mg = SvMAGIC(sv);
7825                     MAGIC* mgt;
7826                     TAINT;
7827                     SvMAGIC_set(sv, mg->mg_moremagic);
7828                     SvTAINT(sv);
7829                     if ((mgt = SvMAGIC(sv))) {
7830                         mg->mg_moremagic = mgt;
7831                         SvMAGIC_set(sv, mg);
7832                     }
7833                 } else {
7834                     TAINT;
7835                     SvTAINT(sv);
7836                 }
7837             } else
7838                 SvTAINTED_off(sv);
7839         }
7840     } else {
7841       ret_undef:
7842         sv_setsv(sv,&PL_sv_undef);
7843         return;
7844     }
7845 }
7846
7847 void
7848 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7849                                                          SV const * const value)
7850 {
7851     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7852
7853     PERL_UNUSED_ARG(rx);
7854     PERL_UNUSED_ARG(paren);
7855     PERL_UNUSED_ARG(value);
7856
7857     if (!PL_localizing)
7858         Perl_croak_no_modify();
7859 }
7860
7861 I32
7862 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7863                               const I32 paren)
7864 {
7865     struct regexp *const rx = ReANY(r);
7866     I32 i;
7867     I32 s1, t1;
7868
7869     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7870
7871     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7872         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7873         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7874     )
7875     {
7876         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7877         if (!keepcopy) {
7878             /* on something like
7879              *    $r = qr/.../;
7880              *    /$qr/p;
7881              * the KEEPCOPY is set on the PMOP rather than the regex */
7882             if (PL_curpm && r == PM_GETRE(PL_curpm))
7883                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7884         }
7885         if (!keepcopy)
7886             goto warn_undef;
7887     }
7888
7889     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7890     switch (paren) {
7891       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7892       case RX_BUFF_IDX_PREMATCH:       /* $` */
7893         if (rx->offs[0].start != -1) {
7894                         i = rx->offs[0].start;
7895                         if (i > 0) {
7896                                 s1 = 0;
7897                                 t1 = i;
7898                                 goto getlen;
7899                         }
7900             }
7901         return 0;
7902
7903       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7904       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7905             if (rx->offs[0].end != -1) {
7906                         i = rx->sublen - rx->offs[0].end;
7907                         if (i > 0) {
7908                                 s1 = rx->offs[0].end;
7909                                 t1 = rx->sublen;
7910                                 goto getlen;
7911                         }
7912             }
7913         return 0;
7914
7915       default: /* $& / ${^MATCH}, $1, $2, ... */
7916             if (paren <= (I32)rx->nparens &&
7917             (s1 = rx->offs[paren].start) != -1 &&
7918             (t1 = rx->offs[paren].end) != -1)
7919             {
7920             i = t1 - s1;
7921             goto getlen;
7922         } else {
7923           warn_undef:
7924             if (ckWARN(WARN_UNINITIALIZED))
7925                 report_uninit((const SV *)sv);
7926             return 0;
7927         }
7928     }
7929   getlen:
7930     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7931         const char * const s = rx->subbeg - rx->suboffset + s1;
7932         const U8 *ep;
7933         STRLEN el;
7934
7935         i = t1 - s1;
7936         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7937                         i = el;
7938     }
7939     return i;
7940 }
7941
7942 SV*
7943 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7944 {
7945     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7946         PERL_UNUSED_ARG(rx);
7947         if (0)
7948             return NULL;
7949         else
7950             return newSVpvs("Regexp");
7951 }
7952
7953 /* Scans the name of a named buffer from the pattern.
7954  * If flags is REG_RSN_RETURN_NULL returns null.
7955  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7956  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7957  * to the parsed name as looked up in the RExC_paren_names hash.
7958  * If there is an error throws a vFAIL().. type exception.
7959  */
7960
7961 #define REG_RSN_RETURN_NULL    0
7962 #define REG_RSN_RETURN_NAME    1
7963 #define REG_RSN_RETURN_DATA    2
7964
7965 STATIC SV*
7966 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7967 {
7968     char *name_start = RExC_parse;
7969
7970     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7971
7972     assert (RExC_parse <= RExC_end);
7973     if (RExC_parse == RExC_end) NOOP;
7974     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7975          /* skip IDFIRST by using do...while */
7976         if (UTF)
7977             do {
7978                 RExC_parse += UTF8SKIP(RExC_parse);
7979             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7980         else
7981             do {
7982                 RExC_parse++;
7983             } while (isWORDCHAR(*RExC_parse));
7984     } else {
7985         RExC_parse++; /* so the <- from the vFAIL is after the offending
7986                          character */
7987         vFAIL("Group name must start with a non-digit word character");
7988     }
7989     if ( flags ) {
7990         SV* sv_name
7991             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7992                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7993         if ( flags == REG_RSN_RETURN_NAME)
7994             return sv_name;
7995         else if (flags==REG_RSN_RETURN_DATA) {
7996             HE *he_str = NULL;
7997             SV *sv_dat = NULL;
7998             if ( ! sv_name )      /* should not happen*/
7999                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8000             if (RExC_paren_names)
8001                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8002             if ( he_str )
8003                 sv_dat = HeVAL(he_str);
8004             if ( ! sv_dat )
8005                 vFAIL("Reference to nonexistent named group");
8006             return sv_dat;
8007         }
8008         else {
8009             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8010                        (unsigned long) flags);
8011         }
8012         assert(0); /* NOT REACHED */
8013     }
8014     return NULL;
8015 }
8016
8017 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8018     int num;                                                    \
8019     if (RExC_lastparse!=RExC_parse) {                           \
8020         PerlIO_printf(Perl_debug_log, "%s",                     \
8021             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8022                 RExC_end - RExC_parse, 16,                      \
8023                 "", "",                                         \
8024                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8025                 PERL_PV_PRETTY_ELLIPSES   |                     \
8026                 PERL_PV_PRETTY_LTGT       |                     \
8027                 PERL_PV_ESCAPE_RE         |                     \
8028                 PERL_PV_PRETTY_EXACTSIZE                        \
8029             )                                                   \
8030         );                                                      \
8031     } else                                                      \
8032         PerlIO_printf(Perl_debug_log,"%16s","");                \
8033                                                                 \
8034     if (SIZE_ONLY)                                              \
8035        num = RExC_size + 1;                                     \
8036     else                                                        \
8037        num=REG_NODE_NUM(RExC_emit);                             \
8038     if (RExC_lastnum!=num)                                      \
8039        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
8040     else                                                        \
8041        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
8042     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
8043         (int)((depth*2)), "",                                   \
8044         (funcname)                                              \
8045     );                                                          \
8046     RExC_lastnum=num;                                           \
8047     RExC_lastparse=RExC_parse;                                  \
8048 })
8049
8050
8051
8052 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8053     DEBUG_PARSE_MSG((funcname));                            \
8054     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
8055 })
8056 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
8057     DEBUG_PARSE_MSG((funcname));                            \
8058     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
8059 })
8060
8061 /* This section of code defines the inversion list object and its methods.  The
8062  * interfaces are highly subject to change, so as much as possible is static to
8063  * this file.  An inversion list is here implemented as a malloc'd C UV array
8064  * as an SVt_INVLIST scalar.
8065  *
8066  * An inversion list for Unicode is an array of code points, sorted by ordinal
8067  * number.  The zeroth element is the first code point in the list.  The 1th
8068  * element is the first element beyond that not in the list.  In other words,
8069  * the first range is
8070  *  invlist[0]..(invlist[1]-1)
8071  * The other ranges follow.  Thus every element whose index is divisible by two
8072  * marks the beginning of a range that is in the list, and every element not
8073  * divisible by two marks the beginning of a range not in the list.  A single
8074  * element inversion list that contains the single code point N generally
8075  * consists of two elements
8076  *  invlist[0] == N
8077  *  invlist[1] == N+1
8078  * (The exception is when N is the highest representable value on the
8079  * machine, in which case the list containing just it would be a single
8080  * element, itself.  By extension, if the last range in the list extends to
8081  * infinity, then the first element of that range will be in the inversion list
8082  * at a position that is divisible by two, and is the final element in the
8083  * list.)
8084  * Taking the complement (inverting) an inversion list is quite simple, if the
8085  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8086  * This implementation reserves an element at the beginning of each inversion
8087  * list to always contain 0; there is an additional flag in the header which
8088  * indicates if the list begins at the 0, or is offset to begin at the next
8089  * element.
8090  *
8091  * More about inversion lists can be found in "Unicode Demystified"
8092  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8093  * More will be coming when functionality is added later.
8094  *
8095  * The inversion list data structure is currently implemented as an SV pointing
8096  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8097  * array of UV whose memory management is automatically handled by the existing
8098  * facilities for SV's.
8099  *
8100  * Some of the methods should always be private to the implementation, and some
8101  * should eventually be made public */
8102
8103 /* The header definitions are in F<inline_invlist.c> */
8104
8105 PERL_STATIC_INLINE UV*
8106 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8107 {
8108     /* Returns a pointer to the first element in the inversion list's array.
8109      * This is called upon initialization of an inversion list.  Where the
8110      * array begins depends on whether the list has the code point U+0000 in it
8111      * or not.  The other parameter tells it whether the code that follows this
8112      * call is about to put a 0 in the inversion list or not.  The first
8113      * element is either the element reserved for 0, if TRUE, or the element
8114      * after it, if FALSE */
8115
8116     bool* offset = get_invlist_offset_addr(invlist);
8117     UV* zero_addr = (UV *) SvPVX(invlist);
8118
8119     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8120
8121     /* Must be empty */
8122     assert(! _invlist_len(invlist));
8123
8124     *zero_addr = 0;
8125
8126     /* 1^1 = 0; 1^0 = 1 */
8127     *offset = 1 ^ will_have_0;
8128     return zero_addr + *offset;
8129 }
8130
8131 PERL_STATIC_INLINE UV*
8132 S_invlist_array(SV* const invlist)
8133 {
8134     /* Returns the pointer to the inversion list's array.  Every time the
8135      * length changes, this needs to be called in case malloc or realloc moved
8136      * it */
8137
8138     PERL_ARGS_ASSERT_INVLIST_ARRAY;
8139
8140     /* Must not be empty.  If these fail, you probably didn't check for <len>
8141      * being non-zero before trying to get the array */
8142     assert(_invlist_len(invlist));
8143
8144     /* The very first element always contains zero, The array begins either
8145      * there, or if the inversion list is offset, at the element after it.
8146      * The offset header field determines which; it contains 0 or 1 to indicate
8147      * how much additionally to add */
8148     assert(0 == *(SvPVX(invlist)));
8149     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8150 }
8151
8152 PERL_STATIC_INLINE void
8153 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8154 {
8155     /* Sets the current number of elements stored in the inversion list.
8156      * Updates SvCUR correspondingly */
8157     PERL_UNUSED_CONTEXT;
8158     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8159
8160     assert(SvTYPE(invlist) == SVt_INVLIST);
8161
8162     SvCUR_set(invlist,
8163               (len == 0)
8164                ? 0
8165                : TO_INTERNAL_SIZE(len + offset));
8166     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8167 }
8168
8169 PERL_STATIC_INLINE IV*
8170 S_get_invlist_previous_index_addr(SV* invlist)
8171 {
8172     /* Return the address of the IV that is reserved to hold the cached index
8173      * */
8174     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8175
8176     assert(SvTYPE(invlist) == SVt_INVLIST);
8177
8178     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8179 }
8180
8181 PERL_STATIC_INLINE IV
8182 S_invlist_previous_index(SV* const invlist)
8183 {
8184     /* Returns cached index of previous search */
8185
8186     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8187
8188     return *get_invlist_previous_index_addr(invlist);
8189 }
8190
8191 PERL_STATIC_INLINE void
8192 S_invlist_set_previous_index(SV* const invlist, const IV index)
8193 {
8194     /* Caches <index> for later retrieval */
8195
8196     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8197
8198     assert(index == 0 || index < (int) _invlist_len(invlist));
8199
8200     *get_invlist_previous_index_addr(invlist) = index;
8201 }
8202
8203 PERL_STATIC_INLINE UV
8204 S_invlist_max(SV* const invlist)
8205 {
8206     /* Returns the maximum number of elements storable in the inversion list's
8207      * array, without having to realloc() */
8208
8209     PERL_ARGS_ASSERT_INVLIST_MAX;
8210
8211     assert(SvTYPE(invlist) == SVt_INVLIST);
8212
8213     /* Assumes worst case, in which the 0 element is not counted in the
8214      * inversion list, so subtracts 1 for that */
8215     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8216            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8217            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8218 }
8219
8220 #ifndef PERL_IN_XSUB_RE
8221 SV*
8222 Perl__new_invlist(pTHX_ IV initial_size)
8223 {
8224
8225     /* Return a pointer to a newly constructed inversion list, with enough
8226      * space to store 'initial_size' elements.  If that number is negative, a
8227      * system default is used instead */
8228
8229     SV* new_list;
8230
8231     if (initial_size < 0) {
8232         initial_size = 10;
8233     }
8234
8235     /* Allocate the initial space */
8236     new_list = newSV_type(SVt_INVLIST);
8237
8238     /* First 1 is in case the zero element isn't in the list; second 1 is for
8239      * trailing NUL */
8240     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8241     invlist_set_len(new_list, 0, 0);
8242
8243     /* Force iterinit() to be used to get iteration to work */
8244     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8245
8246     *get_invlist_previous_index_addr(new_list) = 0;
8247
8248     return new_list;
8249 }
8250
8251 SV*
8252 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8253 {
8254     /* Return a pointer to a newly constructed inversion list, initialized to
8255      * point to <list>, which has to be in the exact correct inversion list
8256      * form, including internal fields.  Thus this is a dangerous routine that
8257      * should not be used in the wrong hands.  The passed in 'list' contains
8258      * several header fields at the beginning that are not part of the
8259      * inversion list body proper */
8260
8261     const STRLEN length = (STRLEN) list[0];
8262     const UV version_id =          list[1];
8263     const bool offset   =    cBOOL(list[2]);
8264 #define HEADER_LENGTH 3
8265     /* If any of the above changes in any way, you must change HEADER_LENGTH
8266      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8267      *      perl -E 'say int(rand 2**31-1)'
8268      */
8269 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8270                                         data structure type, so that one being
8271                                         passed in can be validated to be an
8272                                         inversion list of the correct vintage.
8273                                        */
8274
8275     SV* invlist = newSV_type(SVt_INVLIST);
8276
8277     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8278
8279     if (version_id != INVLIST_VERSION_ID) {
8280         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8281     }
8282
8283     /* The generated array passed in includes header elements that aren't part
8284      * of the list proper, so start it just after them */
8285     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8286
8287     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8288                                shouldn't touch it */
8289
8290     *(get_invlist_offset_addr(invlist)) = offset;
8291
8292     /* The 'length' passed to us is the physical number of elements in the
8293      * inversion list.  But if there is an offset the logical number is one
8294      * less than that */
8295     invlist_set_len(invlist, length  - offset, offset);
8296
8297     invlist_set_previous_index(invlist, 0);
8298
8299     /* Initialize the iteration pointer. */
8300     invlist_iterfinish(invlist);
8301
8302     SvREADONLY_on(invlist);
8303
8304     return invlist;
8305 }
8306 #endif /* ifndef PERL_IN_XSUB_RE */
8307
8308 STATIC void
8309 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8310 {
8311     /* Grow the maximum size of an inversion list */
8312
8313     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8314
8315     assert(SvTYPE(invlist) == SVt_INVLIST);
8316
8317     /* Add one to account for the zero element at the beginning which may not
8318      * be counted by the calling parameters */
8319     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8320 }
8321
8322 PERL_STATIC_INLINE void
8323 S_invlist_trim(SV* const invlist)
8324 {
8325     PERL_ARGS_ASSERT_INVLIST_TRIM;
8326
8327     assert(SvTYPE(invlist) == SVt_INVLIST);
8328
8329     /* Change the length of the inversion list to how many entries it currently
8330      * has */
8331     SvPV_shrink_to_cur((SV *) invlist);
8332 }
8333
8334 STATIC void
8335 S__append_range_to_invlist(pTHX_ SV* const invlist,
8336                                  const UV start, const UV end)
8337 {
8338    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8339     * the end of the inversion list.  The range must be above any existing
8340     * ones. */
8341
8342     UV* array;
8343     UV max = invlist_max(invlist);
8344     UV len = _invlist_len(invlist);
8345     bool offset;
8346
8347     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8348
8349     if (len == 0) { /* Empty lists must be initialized */
8350         offset = start != 0;
8351         array = _invlist_array_init(invlist, ! offset);
8352     }
8353     else {
8354         /* Here, the existing list is non-empty. The current max entry in the
8355          * list is generally the first value not in the set, except when the
8356          * set extends to the end of permissible values, in which case it is
8357          * the first entry in that final set, and so this call is an attempt to
8358          * append out-of-order */
8359
8360         UV final_element = len - 1;
8361         array = invlist_array(invlist);
8362         if (array[final_element] > start
8363             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8364         {
8365             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8366                      array[final_element], start,
8367                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8368         }
8369
8370         /* Here, it is a legal append.  If the new range begins with the first
8371          * value not in the set, it is extending the set, so the new first
8372          * value not in the set is one greater than the newly extended range.
8373          * */
8374         offset = *get_invlist_offset_addr(invlist);
8375         if (array[final_element] == start) {
8376             if (end != UV_MAX) {
8377                 array[final_element] = end + 1;
8378             }
8379             else {
8380                 /* But if the end is the maximum representable on the machine,
8381                  * just let the range that this would extend to have no end */
8382                 invlist_set_len(invlist, len - 1, offset);
8383             }
8384             return;
8385         }
8386     }
8387
8388     /* Here the new range doesn't extend any existing set.  Add it */
8389
8390     len += 2;   /* Includes an element each for the start and end of range */
8391
8392     /* If wll overflow the existing space, extend, which may cause the array to
8393      * be moved */
8394     if (max < len) {
8395         invlist_extend(invlist, len);
8396
8397         /* Have to set len here to avoid assert failure in invlist_array() */
8398         invlist_set_len(invlist, len, offset);
8399
8400         array = invlist_array(invlist);
8401     }
8402     else {
8403         invlist_set_len(invlist, len, offset);
8404     }
8405
8406     /* The next item on the list starts the range, the one after that is
8407      * one past the new range.  */
8408     array[len - 2] = start;
8409     if (end != UV_MAX) {
8410         array[len - 1] = end + 1;
8411     }
8412     else {
8413         /* But if the end is the maximum representable on the machine, just let
8414          * the range have no end */
8415         invlist_set_len(invlist, len - 1, offset);
8416     }
8417 }
8418
8419 #ifndef PERL_IN_XSUB_RE
8420
8421 IV
8422 Perl__invlist_search(SV* const invlist, const UV cp)
8423 {
8424     /* Searches the inversion list for the entry that contains the input code
8425      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8426      * return value is the index into the list's array of the range that
8427      * contains <cp> */
8428
8429     IV low = 0;
8430     IV mid;
8431     IV high = _invlist_len(invlist);
8432     const IV highest_element = high - 1;
8433     const UV* array;
8434
8435     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8436
8437     /* If list is empty, return failure. */
8438     if (high == 0) {
8439         return -1;
8440     }
8441
8442     /* (We can't get the array unless we know the list is non-empty) */
8443     array = invlist_array(invlist);
8444
8445     mid = invlist_previous_index(invlist);
8446     assert(mid >=0 && mid <= highest_element);
8447
8448     /* <mid> contains the cache of the result of the previous call to this
8449      * function (0 the first time).  See if this call is for the same result,
8450      * or if it is for mid-1.  This is under the theory that calls to this
8451      * function will often be for related code points that are near each other.
8452      * And benchmarks show that caching gives better results.  We also test
8453      * here if the code point is within the bounds of the list.  These tests
8454      * replace others that would have had to be made anyway to make sure that
8455      * the array bounds were not exceeded, and these give us extra information
8456      * at the same time */
8457     if (cp >= array[mid]) {
8458         if (cp >= array[highest_element]) {
8459             return highest_element;
8460         }
8461
8462         /* Here, array[mid] <= cp < array[highest_element].  This means that
8463          * the final element is not the answer, so can exclude it; it also
8464          * means that <mid> is not the final element, so can refer to 'mid + 1'
8465          * safely */
8466         if (cp < array[mid + 1]) {
8467             return mid;
8468         }
8469         high--;
8470         low = mid + 1;
8471     }
8472     else { /* cp < aray[mid] */
8473         if (cp < array[0]) { /* Fail if outside the array */
8474             return -1;
8475         }
8476         high = mid;
8477         if (cp >= array[mid - 1]) {
8478             goto found_entry;
8479         }
8480     }
8481
8482     /* Binary search.  What we are looking for is <i> such that
8483      *  array[i] <= cp < array[i+1]
8484      * The loop below converges on the i+1.  Note that there may not be an
8485      * (i+1)th element in the array, and things work nonetheless */
8486     while (low < high) {
8487         mid = (low + high) / 2;
8488         assert(mid <= highest_element);
8489         if (array[mid] <= cp) { /* cp >= array[mid] */
8490             low = mid + 1;
8491
8492             /* We could do this extra test to exit the loop early.
8493             if (cp < array[low]) {
8494                 return mid;
8495             }
8496             */
8497         }
8498         else { /* cp < array[mid] */
8499             high = mid;
8500         }
8501     }
8502
8503   found_entry:
8504     high--;
8505     invlist_set_previous_index(invlist, high);
8506     return high;
8507 }
8508
8509 void
8510 Perl__invlist_populate_swatch(SV* const invlist,
8511                               const UV start, const UV end, U8* swatch)
8512 {
8513     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8514      * but is used when the swash has an inversion list.  This makes this much
8515      * faster, as it uses a binary search instead of a linear one.  This is
8516      * intimately tied to that function, and perhaps should be in utf8.c,
8517      * except it is intimately tied to inversion lists as well.  It assumes
8518      * that <swatch> is all 0's on input */
8519
8520     UV current = start;
8521     const IV len = _invlist_len(invlist);
8522     IV i;
8523     const UV * array;
8524
8525     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8526
8527     if (len == 0) { /* Empty inversion list */
8528         return;
8529     }
8530
8531     array = invlist_array(invlist);
8532
8533     /* Find which element it is */
8534     i = _invlist_search(invlist, start);
8535
8536     /* We populate from <start> to <end> */
8537     while (current < end) {
8538         UV upper;
8539
8540         /* The inversion list gives the results for every possible code point
8541          * after the first one in the list.  Only those ranges whose index is
8542          * even are ones that the inversion list matches.  For the odd ones,
8543          * and if the initial code point is not in the list, we have to skip
8544          * forward to the next element */
8545         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8546             i++;
8547             if (i >= len) { /* Finished if beyond the end of the array */
8548                 return;
8549             }
8550             current = array[i];
8551             if (current >= end) {   /* Finished if beyond the end of what we
8552                                        are populating */
8553                 if (LIKELY(end < UV_MAX)) {
8554                     return;
8555                 }
8556
8557                 /* We get here when the upper bound is the maximum
8558                  * representable on the machine, and we are looking for just
8559                  * that code point.  Have to special case it */
8560                 i = len;
8561                 goto join_end_of_list;
8562             }
8563         }
8564         assert(current >= start);
8565
8566         /* The current range ends one below the next one, except don't go past
8567          * <end> */
8568         i++;
8569         upper = (i < len && array[i] < end) ? array[i] : end;
8570
8571         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8572          * for each code point in it */
8573         for (; current < upper; current++) {
8574             const STRLEN offset = (STRLEN)(current - start);
8575             swatch[offset >> 3] |= 1 << (offset & 7);
8576         }
8577
8578     join_end_of_list:
8579
8580         /* Quit if at the end of the list */
8581         if (i >= len) {
8582
8583             /* But first, have to deal with the highest possible code point on
8584              * the platform.  The previous code assumes that <end> is one
8585              * beyond where we want to populate, but that is impossible at the
8586              * platform's infinity, so have to handle it specially */
8587             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8588             {
8589                 const STRLEN offset = (STRLEN)(end - start);
8590                 swatch[offset >> 3] |= 1 << (offset & 7);
8591             }
8592             return;
8593         }
8594
8595         /* Advance to the next range, which will be for code points not in the
8596          * inversion list */
8597         current = array[i];
8598     }
8599
8600     return;
8601 }
8602
8603 void
8604 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8605                                          const bool complement_b, SV** output)
8606 {
8607     /* Take the union of two inversion lists and point <output> to it.  *output
8608      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8609      * the reference count to that list will be decremented if not already a
8610      * temporary (mortal); otherwise *output will be made correspondingly
8611      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8612      * second list is returned.  If <complement_b> is TRUE, the union is taken
8613      * of the complement (inversion) of <b> instead of b itself.
8614      *
8615      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8616      * Richard Gillam, published by Addison-Wesley, and explained at some
8617      * length there.  The preface says to incorporate its examples into your
8618      * code at your own risk.
8619      *
8620      * The algorithm is like a merge sort.
8621      *
8622      * XXX A potential performance improvement is to keep track as we go along
8623      * if only one of the inputs contributes to the result, meaning the other
8624      * is a subset of that one.  In that case, we can skip the final copy and
8625      * return the larger of the input lists, but then outside code might need
8626      * to keep track of whether to free the input list or not */
8627
8628     const UV* array_a;    /* a's array */
8629     const UV* array_b;
8630     UV len_a;       /* length of a's array */
8631     UV len_b;
8632
8633     SV* u;                      /* the resulting union */
8634     UV* array_u;
8635     UV len_u;
8636
8637     UV i_a = 0;             /* current index into a's array */
8638     UV i_b = 0;
8639     UV i_u = 0;
8640
8641     /* running count, as explained in the algorithm source book; items are
8642      * stopped accumulating and are output when the count changes to/from 0.
8643      * The count is incremented when we start a range that's in the set, and
8644      * decremented when we start a range that's not in the set.  So its range
8645      * is 0 to 2.  Only when the count is zero is something not in the set.
8646      */
8647     UV count = 0;
8648
8649     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8650     assert(a != b);
8651
8652     /* If either one is empty, the union is the other one */
8653     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8654         bool make_temp = FALSE; /* Should we mortalize the result? */
8655
8656         if (*output == a) {
8657             if (a != NULL) {
8658                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8659                     SvREFCNT_dec_NN(a);
8660                 }
8661             }
8662         }
8663         if (*output != b) {
8664             *output = invlist_clone(b);
8665             if (complement_b) {
8666                 _invlist_invert(*output);
8667             }
8668         } /* else *output already = b; */
8669
8670         if (make_temp) {
8671             sv_2mortal(*output);
8672         }
8673         return;
8674     }
8675     else if ((len_b = _invlist_len(b)) == 0) {
8676         bool make_temp = FALSE;
8677         if (*output == b) {
8678             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8679                 SvREFCNT_dec_NN(b);
8680             }
8681         }
8682
8683         /* The complement of an empty list is a list that has everything in it,
8684          * so the union with <a> includes everything too */
8685         if (complement_b) {
8686             if (a == *output) {
8687                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8688                     SvREFCNT_dec_NN(a);
8689                 }
8690             }
8691             *output = _new_invlist(1);
8692             _append_range_to_invlist(*output, 0, UV_MAX);
8693         }
8694         else if (*output != a) {
8695             *output = invlist_clone(a);
8696         }
8697         /* else *output already = a; */
8698
8699         if (make_temp) {
8700             sv_2mortal(*output);
8701         }
8702         return;
8703     }
8704
8705     /* Here both lists exist and are non-empty */
8706     array_a = invlist_array(a);
8707     array_b = invlist_array(b);
8708
8709     /* If are to take the union of 'a' with the complement of b, set it
8710      * up so are looking at b's complement. */
8711     if (complement_b) {
8712
8713         /* To complement, we invert: if the first element is 0, remove it.  To
8714          * do this, we just pretend the array starts one later */
8715         if (array_b[0] == 0) {
8716             array_b++;
8717             len_b--;
8718         }
8719         else {
8720
8721             /* But if the first element is not zero, we pretend the list starts
8722              * at the 0 that is always stored immediately before the array. */
8723             array_b--;
8724             len_b++;
8725         }
8726     }
8727
8728     /* Size the union for the worst case: that the sets are completely
8729      * disjoint */
8730     u = _new_invlist(len_a + len_b);
8731
8732     /* Will contain U+0000 if either component does */
8733     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8734                                       || (len_b > 0 && array_b[0] == 0));
8735
8736     /* Go through each list item by item, stopping when exhausted one of
8737      * them */
8738     while (i_a < len_a && i_b < len_b) {
8739         UV cp;      /* The element to potentially add to the union's array */
8740         bool cp_in_set;   /* is it in the the input list's set or not */
8741
8742         /* We need to take one or the other of the two inputs for the union.
8743          * Since we are merging two sorted lists, we take the smaller of the
8744          * next items.  In case of a tie, we take the one that is in its set
8745          * first.  If we took one not in the set first, it would decrement the
8746          * count, possibly to 0 which would cause it to be output as ending the
8747          * range, and the next time through we would take the same number, and
8748          * output it again as beginning the next range.  By doing it the
8749          * opposite way, there is no possibility that the count will be
8750          * momentarily decremented to 0, and thus the two adjoining ranges will
8751          * be seamlessly merged.  (In a tie and both are in the set or both not
8752          * in the set, it doesn't matter which we take first.) */
8753         if (array_a[i_a] < array_b[i_b]
8754             || (array_a[i_a] == array_b[i_b]
8755                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8756         {
8757             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8758             cp= array_a[i_a++];
8759         }
8760         else {
8761             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8762             cp = array_b[i_b++];
8763         }
8764
8765         /* Here, have chosen which of the two inputs to look at.  Only output
8766          * if the running count changes to/from 0, which marks the
8767          * beginning/end of a range in that's in the set */
8768         if (cp_in_set) {
8769             if (count == 0) {
8770                 array_u[i_u++] = cp;
8771             }
8772             count++;
8773         }
8774         else {
8775             count--;
8776             if (count == 0) {
8777                 array_u[i_u++] = cp;
8778             }
8779         }
8780     }
8781
8782     /* Here, we are finished going through at least one of the lists, which
8783      * means there is something remaining in at most one.  We check if the list
8784      * that hasn't been exhausted is positioned such that we are in the middle
8785      * of a range in its set or not.  (i_a and i_b point to the element beyond
8786      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8787      * is potentially more to output.
8788      * There are four cases:
8789      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8790      *     in the union is entirely from the non-exhausted set.
8791      *  2) Both were in their sets, count is 2.  Nothing further should
8792      *     be output, as everything that remains will be in the exhausted
8793      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8794      *     that
8795      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8796      *     Nothing further should be output because the union includes
8797      *     everything from the exhausted set.  Not decrementing ensures that.
8798      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8799      *     decrementing to 0 insures that we look at the remainder of the
8800      *     non-exhausted set */
8801     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8802         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8803     {
8804         count--;
8805     }
8806
8807     /* The final length is what we've output so far, plus what else is about to
8808      * be output.  (If 'count' is non-zero, then the input list we exhausted
8809      * has everything remaining up to the machine's limit in its set, and hence
8810      * in the union, so there will be no further output. */
8811     len_u = i_u;
8812     if (count == 0) {
8813         /* At most one of the subexpressions will be non-zero */
8814         len_u += (len_a - i_a) + (len_b - i_b);
8815     }
8816
8817     /* Set result to final length, which can change the pointer to array_u, so
8818      * re-find it */
8819     if (len_u != _invlist_len(u)) {
8820         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8821         invlist_trim(u);
8822         array_u = invlist_array(u);
8823     }
8824
8825     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8826      * the other) ended with everything above it not in its set.  That means
8827      * that the remaining part of the union is precisely the same as the
8828      * non-exhausted list, so can just copy it unchanged.  (If both list were
8829      * exhausted at the same time, then the operations below will be both 0.)
8830      */
8831     if (count == 0) {
8832         IV copy_count; /* At most one will have a non-zero copy count */
8833         if ((copy_count = len_a - i_a) > 0) {
8834             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8835         }
8836         else if ((copy_count = len_b - i_b) > 0) {
8837             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8838         }
8839     }
8840
8841     /*  We may be removing a reference to one of the inputs.  If so, the output
8842      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8843      *  count decremented) */
8844     if (a == *output || b == *output) {
8845         assert(! invlist_is_iterating(*output));
8846         if ((SvTEMP(*output))) {
8847             sv_2mortal(u);
8848         }
8849         else {
8850             SvREFCNT_dec_NN(*output);
8851         }
8852     }
8853
8854     *output = u;
8855
8856     return;
8857 }
8858
8859 void
8860 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8861                                                const bool complement_b, SV** i)
8862 {
8863     /* Take the intersection of two inversion lists and point <i> to it.  *i
8864      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8865      * the reference count to that list will be decremented if not already a
8866      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8867      * The first list, <a>, may be NULL, in which case an empty list is
8868      * returned.  If <complement_b> is TRUE, the result will be the
8869      * intersection of <a> and the complement (or inversion) of <b> instead of
8870      * <b> directly.
8871      *
8872      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8873      * Richard Gillam, published by Addison-Wesley, and explained at some
8874      * length there.  The preface says to incorporate its examples into your
8875      * code at your own risk.  In fact, it had bugs
8876      *
8877      * The algorithm is like a merge sort, and is essentially the same as the
8878      * union above
8879      */
8880
8881     const UV* array_a;          /* a's array */
8882     const UV* array_b;
8883     UV len_a;   /* length of a's array */
8884     UV len_b;
8885
8886     SV* r;                   /* the resulting intersection */
8887     UV* array_r;
8888     UV len_r;
8889
8890     UV i_a = 0;             /* current index into a's array */
8891     UV i_b = 0;
8892     UV i_r = 0;
8893
8894     /* running count, as explained in the algorithm source book; items are
8895      * stopped accumulating and are output when the count changes to/from 2.
8896      * The count is incremented when we start a range that's in the set, and
8897      * decremented when we start a range that's not in the set.  So its range
8898      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8899      */
8900     UV count = 0;
8901
8902     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8903     assert(a != b);
8904
8905     /* Special case if either one is empty */
8906     len_a = (a == NULL) ? 0 : _invlist_len(a);
8907     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8908         bool make_temp = FALSE;
8909
8910         if (len_a != 0 && complement_b) {
8911
8912             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8913              * be empty.  Here, also we are using 'b's complement, which hence
8914              * must be every possible code point.  Thus the intersection is
8915              * simply 'a'. */
8916             if (*i != a) {
8917                 if (*i == b) {
8918                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8919                         SvREFCNT_dec_NN(b);
8920                     }
8921                 }
8922
8923                 *i = invlist_clone(a);
8924             }
8925             /* else *i is already 'a' */
8926
8927             if (make_temp) {
8928                 sv_2mortal(*i);
8929             }
8930             return;
8931         }
8932
8933         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8934          * intersection must be empty */
8935         if (*i == a) {
8936             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8937                 SvREFCNT_dec_NN(a);
8938             }
8939         }
8940         else if (*i == b) {
8941             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8942                 SvREFCNT_dec_NN(b);
8943             }
8944         }
8945         *i = _new_invlist(0);
8946         if (make_temp) {
8947             sv_2mortal(*i);
8948         }
8949
8950         return;
8951     }
8952
8953     /* Here both lists exist and are non-empty */
8954     array_a = invlist_array(a);
8955     array_b = invlist_array(b);
8956
8957     /* If are to take the intersection of 'a' with the complement of b, set it
8958      * up so are looking at b's complement. */
8959     if (complement_b) {
8960
8961         /* To complement, we invert: if the first element is 0, remove it.  To
8962          * do this, we just pretend the array starts one later */
8963         if (array_b[0] == 0) {
8964             array_b++;
8965             len_b--;
8966         }
8967         else {
8968
8969             /* But if the first element is not zero, we pretend the list starts
8970              * at the 0 that is always stored immediately before the array. */
8971             array_b--;
8972             len_b++;
8973         }
8974     }
8975
8976     /* Size the intersection for the worst case: that the intersection ends up
8977      * fragmenting everything to be completely disjoint */
8978     r= _new_invlist(len_a + len_b);
8979
8980     /* Will contain U+0000 iff both components do */
8981     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8982                                      && len_b > 0 && array_b[0] == 0);
8983
8984     /* Go through each list item by item, stopping when exhausted one of
8985      * them */
8986     while (i_a < len_a && i_b < len_b) {
8987         UV cp;      /* The element to potentially add to the intersection's
8988                        array */
8989         bool cp_in_set; /* Is it in the input list's set or not */
8990
8991         /* We need to take one or the other of the two inputs for the
8992          * intersection.  Since we are merging two sorted lists, we take the
8993          * smaller of the next items.  In case of a tie, we take the one that
8994          * is not in its set first (a difference from the union algorithm).  If
8995          * we took one in the set first, it would increment the count, possibly
8996          * to 2 which would cause it to be output as starting a range in the
8997          * intersection, and the next time through we would take that same
8998          * number, and output it again as ending the set.  By doing it the
8999          * opposite of this, there is no possibility that the count will be
9000          * momentarily incremented to 2.  (In a tie and both are in the set or
9001          * both not in the set, it doesn't matter which we take first.) */
9002         if (array_a[i_a] < array_b[i_b]
9003             || (array_a[i_a] == array_b[i_b]
9004                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9005         {
9006             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9007             cp= array_a[i_a++];
9008         }
9009         else {
9010             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9011             cp= array_b[i_b++];
9012         }
9013
9014         /* Here, have chosen which of the two inputs to look at.  Only output
9015          * if the running count changes to/from 2, which marks the
9016          * beginning/end of a range that's in the intersection */
9017         if (cp_in_set) {
9018             count++;
9019             if (count == 2) {
9020                 array_r[i_r++] = cp;
9021             }
9022         }
9023         else {
9024             if (count == 2) {
9025                 array_r[i_r++] = cp;
9026             }
9027             count--;
9028         }
9029     }
9030
9031     /* Here, we are finished going through at least one of the lists, which
9032      * means there is something remaining in at most one.  We check if the list
9033      * that has been exhausted is positioned such that we are in the middle
9034      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
9035      * the ones we care about.)  There are four cases:
9036      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
9037      *     nothing left in the intersection.
9038      *  2) Both were in their sets, count is 2 and perhaps is incremented to
9039      *     above 2.  What should be output is exactly that which is in the
9040      *     non-exhausted set, as everything it has is also in the intersection
9041      *     set, and everything it doesn't have can't be in the intersection
9042      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9043      *     gets incremented to 2.  Like the previous case, the intersection is
9044      *     everything that remains in the non-exhausted set.
9045      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9046      *     remains 1.  And the intersection has nothing more. */
9047     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9048         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9049     {
9050         count++;
9051     }
9052
9053     /* The final length is what we've output so far plus what else is in the
9054      * intersection.  At most one of the subexpressions below will be non-zero
9055      * */
9056     len_r = i_r;
9057     if (count >= 2) {
9058         len_r += (len_a - i_a) + (len_b - i_b);
9059     }
9060
9061     /* Set result to final length, which can change the pointer to array_r, so
9062      * re-find it */
9063     if (len_r != _invlist_len(r)) {
9064         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9065         invlist_trim(r);
9066         array_r = invlist_array(r);
9067     }
9068
9069     /* Finish outputting any remaining */
9070     if (count >= 2) { /* At most one will have a non-zero copy count */
9071         IV copy_count;
9072         if ((copy_count = len_a - i_a) > 0) {
9073             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9074         }
9075         else if ((copy_count = len_b - i_b) > 0) {
9076             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9077         }
9078     }
9079
9080     /*  We may be removing a reference to one of the inputs.  If so, the output
9081      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
9082      *  count decremented) */
9083     if (a == *i || b == *i) {
9084         assert(! invlist_is_iterating(*i));
9085         if (SvTEMP(*i)) {
9086             sv_2mortal(r);
9087         }
9088         else {
9089             SvREFCNT_dec_NN(*i);
9090         }
9091     }
9092
9093     *i = r;
9094
9095     return;
9096 }
9097
9098 SV*
9099 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9100 {
9101     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9102      * set.  A pointer to the inversion list is returned.  This may actually be
9103      * a new list, in which case the passed in one has been destroyed.  The
9104      * passed in inversion list can be NULL, in which case a new one is created
9105      * with just the one range in it */
9106
9107     SV* range_invlist;
9108     UV len;
9109
9110     if (invlist == NULL) {
9111         invlist = _new_invlist(2);
9112         len = 0;
9113     }
9114     else {
9115         len = _invlist_len(invlist);
9116     }
9117
9118     /* If comes after the final entry actually in the list, can just append it
9119      * to the end, */
9120     if (len == 0
9121         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9122             && start >= invlist_array(invlist)[len - 1]))
9123     {
9124         _append_range_to_invlist(invlist, start, end);
9125         return invlist;
9126     }
9127
9128     /* Here, can't just append things, create and return a new inversion list
9129      * which is the union of this range and the existing inversion list */
9130     range_invlist = _new_invlist(2);
9131     _append_range_to_invlist(range_invlist, start, end);
9132
9133     _invlist_union(invlist, range_invlist, &invlist);
9134
9135     /* The temporary can be freed */
9136     SvREFCNT_dec_NN(range_invlist);
9137
9138     return invlist;
9139 }
9140
9141 SV*
9142 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9143                                  UV** other_elements_ptr)
9144 {
9145     /* Create and return an inversion list whose contents are to be populated
9146      * by the caller.  The caller gives the number of elements (in 'size') and
9147      * the very first element ('element0').  This function will set
9148      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9149      * are to be placed.
9150      *
9151      * Obviously there is some trust involved that the caller will properly
9152      * fill in the other elements of the array.
9153      *
9154      * (The first element needs to be passed in, as the underlying code does
9155      * things differently depending on whether it is zero or non-zero) */
9156
9157     SV* invlist = _new_invlist(size);
9158     bool offset;
9159
9160     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9161
9162     _append_range_to_invlist(invlist, element0, element0);
9163     offset = *get_invlist_offset_addr(invlist);
9164
9165     invlist_set_len(invlist, size, offset);
9166     *other_elements_ptr = invlist_array(invlist) + 1;
9167     return invlist;
9168 }
9169
9170 #endif
9171
9172 PERL_STATIC_INLINE SV*
9173 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9174     return _add_range_to_invlist(invlist, cp, cp);
9175 }
9176
9177 #ifndef PERL_IN_XSUB_RE
9178 void
9179 Perl__invlist_invert(pTHX_ SV* const invlist)
9180 {
9181     /* Complement the input inversion list.  This adds a 0 if the list didn't
9182      * have a zero; removes it otherwise.  As described above, the data
9183      * structure is set up so that this is very efficient */
9184
9185     PERL_ARGS_ASSERT__INVLIST_INVERT;
9186
9187     assert(! invlist_is_iterating(invlist));
9188
9189     /* The inverse of matching nothing is matching everything */
9190     if (_invlist_len(invlist) == 0) {
9191         _append_range_to_invlist(invlist, 0, UV_MAX);
9192         return;
9193     }
9194
9195     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9196 }
9197
9198 #endif
9199
9200 PERL_STATIC_INLINE SV*
9201 S_invlist_clone(pTHX_ SV* const invlist)
9202 {
9203
9204     /* Return a new inversion list that is a copy of the input one, which is
9205      * unchanged.  The new list will not be mortal even if the old one was. */
9206
9207     /* Need to allocate extra space to accommodate Perl's addition of a
9208      * trailing NUL to SvPV's, since it thinks they are always strings */
9209     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9210     STRLEN physical_length = SvCUR(invlist);
9211     bool offset = *(get_invlist_offset_addr(invlist));
9212
9213     PERL_ARGS_ASSERT_INVLIST_CLONE;
9214
9215     *(get_invlist_offset_addr(new_invlist)) = offset;
9216     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9217     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9218
9219     return new_invlist;
9220 }
9221
9222 PERL_STATIC_INLINE STRLEN*
9223 S_get_invlist_iter_addr(SV* invlist)
9224 {
9225     /* Return the address of the UV that contains the current iteration
9226      * position */
9227
9228     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9229
9230     assert(SvTYPE(invlist) == SVt_INVLIST);
9231
9232     return &(((XINVLIST*) SvANY(invlist))->iterator);
9233 }
9234
9235 PERL_STATIC_INLINE void
9236 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9237 {
9238     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9239
9240     *get_invlist_iter_addr(invlist) = 0;
9241 }
9242
9243 PERL_STATIC_INLINE void
9244 S_invlist_iterfinish(SV* invlist)
9245 {
9246     /* Terminate iterator for invlist.  This is to catch development errors.
9247      * Any iteration that is interrupted before completed should call this
9248      * function.  Functions that add code points anywhere else but to the end
9249      * of an inversion list assert that they are not in the middle of an
9250      * iteration.  If they were, the addition would make the iteration
9251      * problematical: if the iteration hadn't reached the place where things
9252      * were being added, it would be ok */
9253
9254     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9255
9256     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9257 }
9258
9259 STATIC bool
9260 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9261 {
9262     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9263      * This call sets in <*start> and <*end>, the next range in <invlist>.
9264      * Returns <TRUE> if successful and the next call will return the next
9265      * range; <FALSE> if was already at the end of the list.  If the latter,
9266      * <*start> and <*end> are unchanged, and the next call to this function
9267      * will start over at the beginning of the list */
9268
9269     STRLEN* pos = get_invlist_iter_addr(invlist);
9270     UV len = _invlist_len(invlist);
9271     UV *array;
9272
9273     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9274
9275     if (*pos >= len) {
9276         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9277         return FALSE;
9278     }
9279
9280     array = invlist_array(invlist);
9281
9282     *start = array[(*pos)++];
9283
9284     if (*pos >= len) {
9285         *end = UV_MAX;
9286     }
9287     else {
9288         *end = array[(*pos)++] - 1;
9289     }
9290
9291     return TRUE;
9292 }
9293
9294 PERL_STATIC_INLINE bool
9295 S_invlist_is_iterating(SV* const invlist)
9296 {
9297     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9298
9299     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9300 }
9301
9302 PERL_STATIC_INLINE UV
9303 S_invlist_highest(SV* const invlist)
9304 {
9305     /* Returns the highest code point that matches an inversion list.  This API
9306      * has an ambiguity, as it returns 0 under either the highest is actually
9307      * 0, or if the list is empty.  If this distinction matters to you, check
9308      * for emptiness before calling this function */
9309
9310     UV len = _invlist_len(invlist);
9311     UV *array;
9312
9313     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9314
9315     if (len == 0) {
9316         return 0;
9317     }
9318
9319     array = invlist_array(invlist);
9320
9321     /* The last element in the array in the inversion list always starts a
9322      * range that goes to infinity.  That range may be for code points that are
9323      * matched in the inversion list, or it may be for ones that aren't
9324      * matched.  In the latter case, the highest code point in the set is one
9325      * less than the beginning of this range; otherwise it is the final element
9326      * of this range: infinity */
9327     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9328            ? UV_MAX
9329            : array[len - 1] - 1;
9330 }
9331
9332 #ifndef PERL_IN_XSUB_RE
9333 SV *
9334 Perl__invlist_contents(pTHX_ SV* const invlist)
9335 {
9336     /* Get the contents of an inversion list into a string SV so that they can
9337      * be printed out.  It uses the format traditionally done for debug tracing
9338      */
9339
9340     UV start, end;
9341     SV* output = newSVpvs("\n");
9342
9343     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9344
9345     assert(! invlist_is_iterating(invlist));
9346
9347     invlist_iterinit(invlist);
9348     while (invlist_iternext(invlist, &start, &end)) {
9349         if (end == UV_MAX) {
9350             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9351         }
9352         else if (end != start) {
9353             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9354                     start,       end);
9355         }
9356         else {
9357             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9358         }
9359     }
9360
9361     return output;
9362 }
9363 #endif
9364
9365 #ifndef PERL_IN_XSUB_RE
9366 void
9367 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9368                          const char * const indent, SV* const invlist)
9369 {
9370     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9371      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9372      * the string 'indent'.  The output looks like this:
9373          [0] 0x000A .. 0x000D
9374          [2] 0x0085
9375          [4] 0x2028 .. 0x2029
9376          [6] 0x3104 .. INFINITY
9377      * This means that the first range of code points matched by the list are
9378      * 0xA through 0xD; the second range contains only the single code point
9379      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9380      * are used to define each range (except if the final range extends to
9381      * infinity, only a single element is needed).  The array index of the
9382      * first element for the corresponding range is given in brackets. */
9383
9384     UV start, end;
9385     STRLEN count = 0;
9386
9387     PERL_ARGS_ASSERT__INVLIST_DUMP;
9388
9389     if (invlist_is_iterating(invlist)) {
9390         Perl_dump_indent(aTHX_ level, file,
9391              "%sCan't dump inversion list because is in middle of iterating\n",
9392              indent);
9393         return;
9394     }
9395
9396     invlist_iterinit(invlist);
9397     while (invlist_iternext(invlist, &start, &end)) {
9398         if (end == UV_MAX) {
9399             Perl_dump_indent(aTHX_ level, file,
9400                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9401                                    indent, (UV)count, start);
9402         }
9403         else if (end != start) {
9404             Perl_dump_indent(aTHX_ level, file,
9405                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9406                                 indent, (UV)count, start,         end);
9407         }
9408         else {
9409             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9410                                             indent, (UV)count, start);
9411         }
9412         count += 2;
9413     }
9414 }
9415
9416 void
9417 Perl__load_PL_utf8_foldclosures (pTHX)
9418 {
9419     assert(! PL_utf8_foldclosures);
9420
9421     /* If the folds haven't been read in, call a fold function
9422      * to force that */
9423     if (! PL_utf8_tofold) {
9424         U8 dummy[UTF8_MAXBYTES_CASE+1];
9425
9426         /* This string is just a short named one above \xff */
9427         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9428         assert(PL_utf8_tofold); /* Verify that worked */
9429     }
9430     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9431 }
9432 #endif
9433
9434 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9435 bool
9436 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9437 {
9438     /* Return a boolean as to if the two passed in inversion lists are
9439      * identical.  The final argument, if TRUE, says to take the complement of
9440      * the second inversion list before doing the comparison */
9441
9442     const UV* array_a = invlist_array(a);
9443     const UV* array_b = invlist_array(b);
9444     UV len_a = _invlist_len(a);
9445     UV len_b = _invlist_len(b);
9446
9447     UV i = 0;               /* current index into the arrays */
9448     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9449
9450     PERL_ARGS_ASSERT__INVLISTEQ;
9451
9452     /* If are to compare 'a' with the complement of b, set it
9453      * up so are looking at b's complement. */
9454     if (complement_b) {
9455
9456         /* The complement of nothing is everything, so <a> would have to have
9457          * just one element, starting at zero (ending at infinity) */
9458         if (len_b == 0) {
9459             return (len_a == 1 && array_a[0] == 0);
9460         }
9461         else if (array_b[0] == 0) {
9462
9463             /* Otherwise, to complement, we invert.  Here, the first element is
9464              * 0, just remove it.  To do this, we just pretend the array starts
9465              * one later */
9466
9467             array_b++;
9468             len_b--;
9469         }
9470         else {
9471
9472             /* But if the first element is not zero, we pretend the list starts
9473              * at the 0 that is always stored immediately before the array. */
9474             array_b--;
9475             len_b++;
9476         }
9477     }
9478
9479     /* Make sure that the lengths are the same, as well as the final element
9480      * before looping through the remainder.  (Thus we test the length, final,
9481      * and first elements right off the bat) */
9482     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9483         retval = FALSE;
9484     }
9485     else for (i = 0; i < len_a - 1; i++) {
9486         if (array_a[i] != array_b[i]) {
9487             retval = FALSE;
9488             break;
9489         }
9490     }
9491
9492     return retval;
9493 }
9494 #endif
9495
9496 #undef HEADER_LENGTH
9497 #undef TO_INTERNAL_SIZE
9498 #undef FROM_INTERNAL_SIZE
9499 #undef INVLIST_VERSION_ID
9500
9501 /* End of inversion list object */
9502
9503 STATIC void
9504 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9505 {
9506     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9507      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9508      * should point to the first flag; it is updated on output to point to the
9509      * final ')' or ':'.  There needs to be at least one flag, or this will
9510      * abort */
9511
9512     /* for (?g), (?gc), and (?o) warnings; warning
9513        about (?c) will warn about (?g) -- japhy    */
9514
9515 #define WASTED_O  0x01
9516 #define WASTED_G  0x02
9517 #define WASTED_C  0x04
9518 #define WASTED_GC (WASTED_G|WASTED_C)
9519     I32 wastedflags = 0x00;
9520     U32 posflags = 0, negflags = 0;
9521     U32 *flagsp = &posflags;
9522     char has_charset_modifier = '\0';
9523     regex_charset cs;
9524     bool has_use_defaults = FALSE;
9525     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9526     int x_mod_count = 0;
9527
9528     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9529
9530     /* '^' as an initial flag sets certain defaults */
9531     if (UCHARAT(RExC_parse) == '^') {
9532         RExC_parse++;
9533         has_use_defaults = TRUE;
9534         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9535         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9536                                         ? REGEX_UNICODE_CHARSET
9537                                         : REGEX_DEPENDS_CHARSET);
9538     }
9539
9540     cs = get_regex_charset(RExC_flags);
9541     if (cs == REGEX_DEPENDS_CHARSET
9542         && (RExC_utf8 || RExC_uni_semantics))
9543     {
9544         cs = REGEX_UNICODE_CHARSET;
9545     }
9546
9547     while (*RExC_parse) {
9548         /* && strchr("iogcmsx", *RExC_parse) */
9549         /* (?g), (?gc) and (?o) are useless here
9550            and must be globally applied -- japhy */
9551         switch (*RExC_parse) {
9552
9553             /* Code for the imsx flags */
9554             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9555
9556             case LOCALE_PAT_MOD:
9557                 if (has_charset_modifier) {
9558                     goto excess_modifier;
9559                 }
9560                 else if (flagsp == &negflags) {
9561                     goto neg_modifier;
9562                 }
9563                 cs = REGEX_LOCALE_CHARSET;
9564                 has_charset_modifier = LOCALE_PAT_MOD;
9565                 break;
9566             case UNICODE_PAT_MOD:
9567                 if (has_charset_modifier) {
9568                     goto excess_modifier;
9569                 }
9570                 else if (flagsp == &negflags) {
9571                     goto neg_modifier;
9572                 }
9573                 cs = REGEX_UNICODE_CHARSET;
9574                 has_charset_modifier = UNICODE_PAT_MOD;
9575                 break;
9576             case ASCII_RESTRICT_PAT_MOD:
9577                 if (flagsp == &negflags) {
9578                     goto neg_modifier;
9579                 }
9580                 if (has_charset_modifier) {
9581                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9582                         goto excess_modifier;
9583                     }
9584                     /* Doubled modifier implies more restricted */
9585                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9586                 }
9587                 else {
9588                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9589                 }
9590                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9591                 break;
9592             case DEPENDS_PAT_MOD:
9593                 if (has_use_defaults) {
9594                     goto fail_modifiers;
9595                 }
9596                 else if (flagsp == &negflags) {
9597                     goto neg_modifier;
9598                 }
9599                 else if (has_charset_modifier) {
9600                     goto excess_modifier;
9601                 }
9602
9603                 /* The dual charset means unicode semantics if the
9604                  * pattern (or target, not known until runtime) are
9605                  * utf8, or something in the pattern indicates unicode
9606                  * semantics */
9607                 cs = (RExC_utf8 || RExC_uni_semantics)
9608                      ? REGEX_UNICODE_CHARSET
9609                      : REGEX_DEPENDS_CHARSET;
9610                 has_charset_modifier = DEPENDS_PAT_MOD;
9611                 break;
9612             excess_modifier:
9613                 RExC_parse++;
9614                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9615                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9616                 }
9617                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9618                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9619                                         *(RExC_parse - 1));
9620                 }
9621                 else {
9622                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9623                 }
9624                 /*NOTREACHED*/
9625             neg_modifier:
9626                 RExC_parse++;
9627                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9628                                     *(RExC_parse - 1));
9629                 /*NOTREACHED*/
9630             case ONCE_PAT_MOD: /* 'o' */
9631             case GLOBAL_PAT_MOD: /* 'g' */
9632                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9633                     const I32 wflagbit = *RExC_parse == 'o'
9634                                          ? WASTED_O
9635                                          : WASTED_G;
9636                     if (! (wastedflags & wflagbit) ) {
9637                         wastedflags |= wflagbit;
9638                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9639                         vWARN5(
9640                             RExC_parse + 1,
9641                             "Useless (%s%c) - %suse /%c modifier",
9642                             flagsp == &negflags ? "?-" : "?",
9643                             *RExC_parse,
9644                             flagsp == &negflags ? "don't " : "",
9645                             *RExC_parse
9646                         );
9647                     }
9648                 }
9649                 break;
9650
9651             case CONTINUE_PAT_MOD: /* 'c' */
9652                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9653                     if (! (wastedflags & WASTED_C) ) {
9654                         wastedflags |= WASTED_GC;
9655                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9656                         vWARN3(
9657                             RExC_parse + 1,
9658                             "Useless (%sc) - %suse /gc modifier",
9659                             flagsp == &negflags ? "?-" : "?",
9660                             flagsp == &negflags ? "don't " : ""
9661                         );
9662                     }
9663                 }
9664                 break;
9665             case KEEPCOPY_PAT_MOD: /* 'p' */
9666                 if (flagsp == &negflags) {
9667                     if (PASS2)
9668                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9669                 } else {
9670                     *flagsp |= RXf_PMf_KEEPCOPY;
9671                 }
9672                 break;
9673             case '-':
9674                 /* A flag is a default iff it is following a minus, so
9675                  * if there is a minus, it means will be trying to
9676                  * re-specify a default which is an error */
9677                 if (has_use_defaults || flagsp == &negflags) {
9678                     goto fail_modifiers;
9679                 }
9680                 flagsp = &negflags;
9681                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9682                 break;
9683             case ':':
9684             case ')':
9685                 RExC_flags |= posflags;
9686                 RExC_flags &= ~negflags;
9687                 set_regex_charset(&RExC_flags, cs);
9688                 if (RExC_flags & RXf_PMf_FOLD) {
9689                     RExC_contains_i = 1;
9690                 }
9691                 if (PASS2) {
9692                     STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9693                 }
9694                 return;
9695                 /*NOTREACHED*/
9696             default:
9697             fail_modifiers:
9698                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9699                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9700                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9701                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9702                 /*NOTREACHED*/
9703         }
9704
9705         ++RExC_parse;
9706     }
9707
9708     if (PASS2) {
9709         STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9710     }
9711 }
9712
9713 /*
9714  - reg - regular expression, i.e. main body or parenthesized thing
9715  *
9716  * Caller must absorb opening parenthesis.
9717  *
9718  * Combining parenthesis handling with the base level of regular expression
9719  * is a trifle forced, but the need to tie the tails of the branches to what
9720  * follows makes it hard to avoid.
9721  */
9722 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9723 #ifdef DEBUGGING
9724 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9725 #else
9726 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9727 #endif
9728
9729 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9730    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9731    needs to be restarted.
9732    Otherwise would only return NULL if regbranch() returns NULL, which
9733    cannot happen.  */
9734 STATIC regnode *
9735 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9736     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9737      * 2 is like 1, but indicates that nextchar() has been called to advance
9738      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9739      * this flag alerts us to the need to check for that */
9740 {
9741     regnode *ret;               /* Will be the head of the group. */
9742     regnode *br;
9743     regnode *lastbr;
9744     regnode *ender = NULL;
9745     I32 parno = 0;
9746     I32 flags;
9747     U32 oregflags = RExC_flags;
9748     bool have_branch = 0;
9749     bool is_open = 0;
9750     I32 freeze_paren = 0;
9751     I32 after_freeze = 0;
9752     I32 num; /* numeric backreferences */
9753
9754     char * parse_start = RExC_parse; /* MJD */
9755     char * const oregcomp_parse = RExC_parse;
9756
9757     GET_RE_DEBUG_FLAGS_DECL;
9758
9759     PERL_ARGS_ASSERT_REG;
9760     DEBUG_PARSE("reg ");
9761
9762     *flagp = 0;                         /* Tentatively. */
9763
9764
9765     /* Make an OPEN node, if parenthesized. */
9766     if (paren) {
9767
9768         /* Under /x, space and comments can be gobbled up between the '(' and
9769          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9770          * intervening space, as the sequence is a token, and a token should be
9771          * indivisible */
9772         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9773
9774         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9775             char *start_verb = RExC_parse;
9776             STRLEN verb_len = 0;
9777             char *start_arg = NULL;
9778             unsigned char op = 0;
9779             int argok = 1;
9780             int internal_argval = 0; /* internal_argval is only useful if
9781                                         !argok */
9782
9783             if (has_intervening_patws) {
9784                 RExC_parse++;
9785                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9786             }
9787             while ( *RExC_parse && *RExC_parse != ')' ) {
9788                 if ( *RExC_parse == ':' ) {
9789                     start_arg = RExC_parse + 1;
9790                     break;
9791                 }
9792                 RExC_parse++;
9793             }
9794             ++start_verb;
9795             verb_len = RExC_parse - start_verb;
9796             if ( start_arg ) {
9797                 RExC_parse++;
9798                 while ( *RExC_parse && *RExC_parse != ')' )
9799                     RExC_parse++;
9800                 if ( *RExC_parse != ')' )
9801                     vFAIL("Unterminated verb pattern argument");
9802                 if ( RExC_parse == start_arg )
9803                     start_arg = NULL;
9804             } else {
9805                 if ( *RExC_parse != ')' )
9806                     vFAIL("Unterminated verb pattern");
9807             }
9808
9809             switch ( *start_verb ) {
9810             case 'A':  /* (*ACCEPT) */
9811                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9812                     op = ACCEPT;
9813                     internal_argval = RExC_nestroot;
9814                 }
9815                 break;
9816             case 'C':  /* (*COMMIT) */
9817                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9818                     op = COMMIT;
9819                 break;
9820             case 'F':  /* (*FAIL) */
9821                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9822                     op = OPFAIL;
9823                     argok = 0;
9824                 }
9825                 break;
9826             case ':':  /* (*:NAME) */
9827             case 'M':  /* (*MARK:NAME) */
9828                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9829                     op = MARKPOINT;
9830                     argok = -1;
9831                 }
9832                 break;
9833             case 'P':  /* (*PRUNE) */
9834                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9835                     op = PRUNE;
9836                 break;
9837             case 'S':   /* (*SKIP) */
9838                 if ( memEQs(start_verb,verb_len,"SKIP") )
9839                     op = SKIP;
9840                 break;
9841             case 'T':  /* (*THEN) */
9842                 /* [19:06] <TimToady> :: is then */
9843                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9844                     op = CUTGROUP;
9845                     RExC_seen |= REG_CUTGROUP_SEEN;
9846                 }
9847                 break;
9848             }
9849             if ( ! op ) {
9850                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9851                 vFAIL2utf8f(
9852                     "Unknown verb pattern '%"UTF8f"'",
9853                     UTF8fARG(UTF, verb_len, start_verb));
9854             }
9855             if ( argok ) {
9856                 if ( start_arg && internal_argval ) {
9857                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9858                         verb_len, start_verb);
9859                 } else if ( argok < 0 && !start_arg ) {
9860                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9861                         verb_len, start_verb);
9862                 } else {
9863                     ret = reganode(pRExC_state, op, internal_argval);
9864                     if ( ! internal_argval && ! SIZE_ONLY ) {
9865                         if (start_arg) {
9866                             SV *sv = newSVpvn( start_arg,
9867                                                RExC_parse - start_arg);
9868                             ARG(ret) = add_data( pRExC_state,
9869                                                  STR_WITH_LEN("S"));
9870                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9871                             ret->flags = 0;
9872                         } else {
9873                             ret->flags = 1;
9874                         }
9875                     }
9876                 }
9877                 if (!internal_argval)
9878                     RExC_seen |= REG_VERBARG_SEEN;
9879             } else if ( start_arg ) {
9880                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9881                         verb_len, start_verb);
9882             } else {
9883                 ret = reg_node(pRExC_state, op);
9884             }
9885             nextchar(pRExC_state);
9886             return ret;
9887         }
9888         else if (*RExC_parse == '?') { /* (?...) */
9889             bool is_logical = 0;
9890             const char * const seqstart = RExC_parse;
9891             const char * endptr;
9892             if (has_intervening_patws) {
9893                 RExC_parse++;
9894                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9895             }
9896
9897             RExC_parse++;
9898             paren = *RExC_parse++;
9899             ret = NULL;                 /* For look-ahead/behind. */
9900             switch (paren) {
9901
9902             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9903                 paren = *RExC_parse++;
9904                 if ( paren == '<')         /* (?P<...>) named capture */
9905                     goto named_capture;
9906                 else if (paren == '>') {   /* (?P>name) named recursion */
9907                     goto named_recursion;
9908                 }
9909                 else if (paren == '=') {   /* (?P=...)  named backref */
9910                     /* this pretty much dupes the code for \k<NAME> in
9911                      * regatom(), if you change this make sure you change that
9912                      * */
9913                     char* name_start = RExC_parse;
9914                     U32 num = 0;
9915                     SV *sv_dat = reg_scan_name(pRExC_state,
9916                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9917                     if (RExC_parse == name_start || *RExC_parse != ')')
9918                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9919                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9920
9921                     if (!SIZE_ONLY) {
9922                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9923                         RExC_rxi->data->data[num]=(void*)sv_dat;
9924                         SvREFCNT_inc_simple_void(sv_dat);
9925                     }
9926                     RExC_sawback = 1;
9927                     ret = reganode(pRExC_state,
9928                                    ((! FOLD)
9929                                      ? NREF
9930                                      : (ASCII_FOLD_RESTRICTED)
9931                                        ? NREFFA
9932                                        : (AT_LEAST_UNI_SEMANTICS)
9933                                          ? NREFFU
9934                                          : (LOC)
9935                                            ? NREFFL
9936                                            : NREFF),
9937                                     num);
9938                     *flagp |= HASWIDTH;
9939
9940                     Set_Node_Offset(ret, parse_start+1);
9941                     Set_Node_Cur_Length(ret, parse_start);
9942
9943                     nextchar(pRExC_state);
9944                     return ret;
9945                 }
9946                 RExC_parse++;
9947                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9948                 vFAIL3("Sequence (%.*s...) not recognized",
9949                                 RExC_parse-seqstart, seqstart);
9950                 /*NOTREACHED*/
9951             case '<':           /* (?<...) */
9952                 if (*RExC_parse == '!')
9953                     paren = ',';
9954                 else if (*RExC_parse != '=')
9955               named_capture:
9956                 {               /* (?<...>) */
9957                     char *name_start;
9958                     SV *svname;
9959                     paren= '>';
9960             case '\'':          /* (?'...') */
9961                     name_start= RExC_parse;
9962                     svname = reg_scan_name(pRExC_state,
9963                         SIZE_ONLY    /* reverse test from the others */
9964                         ? REG_RSN_RETURN_NAME
9965                         : REG_RSN_RETURN_NULL);
9966                     if (RExC_parse == name_start || *RExC_parse != paren)
9967                         vFAIL2("Sequence (?%c... not terminated",
9968                             paren=='>' ? '<' : paren);
9969                     if (SIZE_ONLY) {
9970                         HE *he_str;
9971                         SV *sv_dat = NULL;
9972                         if (!svname) /* shouldn't happen */
9973                             Perl_croak(aTHX_
9974                                 "panic: reg_scan_name returned NULL");
9975                         if (!RExC_paren_names) {
9976                             RExC_paren_names= newHV();
9977                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9978 #ifdef DEBUGGING
9979                             RExC_paren_name_list= newAV();
9980                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9981 #endif
9982                         }
9983                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9984                         if ( he_str )
9985                             sv_dat = HeVAL(he_str);
9986                         if ( ! sv_dat ) {
9987                             /* croak baby croak */
9988                             Perl_croak(aTHX_
9989                                 "panic: paren_name hash element allocation failed");
9990                         } else if ( SvPOK(sv_dat) ) {
9991                             /* (?|...) can mean we have dupes so scan to check
9992                                its already been stored. Maybe a flag indicating
9993                                we are inside such a construct would be useful,
9994                                but the arrays are likely to be quite small, so
9995                                for now we punt -- dmq */
9996                             IV count = SvIV(sv_dat);
9997                             I32 *pv = (I32*)SvPVX(sv_dat);
9998                             IV i;
9999                             for ( i = 0 ; i < count ; i++ ) {
10000                                 if ( pv[i] == RExC_npar ) {
10001                                     count = 0;
10002                                     break;
10003                                 }
10004                             }
10005                             if ( count ) {
10006                                 pv = (I32*)SvGROW(sv_dat,
10007                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10008                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10009                                 pv[count] = RExC_npar;
10010                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10011                             }
10012                         } else {
10013                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10014                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10015                                                                 sizeof(I32));
10016                             SvIOK_on(sv_dat);
10017                             SvIV_set(sv_dat, 1);
10018                         }
10019 #ifdef DEBUGGING
10020                         /* Yes this does cause a memory leak in debugging Perls
10021                          * */
10022                         if (!av_store(RExC_paren_name_list,
10023                                       RExC_npar, SvREFCNT_inc(svname)))
10024                             SvREFCNT_dec_NN(svname);
10025 #endif
10026
10027                         /*sv_dump(sv_dat);*/
10028                     }
10029                     nextchar(pRExC_state);
10030                     paren = 1;
10031                     goto capturing_parens;
10032                 }
10033                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10034                 RExC_in_lookbehind++;
10035                 RExC_parse++;
10036                 /* FALLTHROUGH */
10037             case '=':           /* (?=...) */
10038                 RExC_seen_zerolen++;
10039                 break;
10040             case '!':           /* (?!...) */
10041                 RExC_seen_zerolen++;
10042                 if (*RExC_parse == ')') {
10043                     ret=reg_node(pRExC_state, OPFAIL);
10044                     nextchar(pRExC_state);
10045                     return ret;
10046                 }
10047                 break;
10048             case '|':           /* (?|...) */
10049                 /* branch reset, behave like a (?:...) except that
10050                    buffers in alternations share the same numbers */
10051                 paren = ':';
10052                 after_freeze = freeze_paren = RExC_npar;
10053                 break;
10054             case ':':           /* (?:...) */
10055             case '>':           /* (?>...) */
10056                 break;
10057             case '$':           /* (?$...) */
10058             case '@':           /* (?@...) */
10059                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10060                 break;
10061             case '0' :           /* (?0) */
10062             case 'R' :           /* (?R) */
10063                 if (*RExC_parse != ')')
10064                     FAIL("Sequence (?R) not terminated");
10065                 ret = reg_node(pRExC_state, GOSTART);
10066                     RExC_seen |= REG_GOSTART_SEEN;
10067                 *flagp |= POSTPONED;
10068                 nextchar(pRExC_state);
10069                 return ret;
10070                 /*notreached*/
10071             /* named and numeric backreferences */
10072             case '&':            /* (?&NAME) */
10073                 parse_start = RExC_parse - 1;
10074               named_recursion:
10075                 {
10076                     SV *sv_dat = reg_scan_name(pRExC_state,
10077                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10078                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10079                 }
10080                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10081                     vFAIL("Sequence (?&... not terminated");
10082                 goto gen_recurse_regop;
10083                 assert(0); /* NOT REACHED */
10084             case '+':
10085                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10086                     RExC_parse++;
10087                     vFAIL("Illegal pattern");
10088                 }
10089                 goto parse_recursion;
10090                 /* NOT REACHED*/
10091             case '-': /* (?-1) */
10092                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10093                     RExC_parse--; /* rewind to let it be handled later */
10094                     goto parse_flags;
10095                 }
10096                 /* FALLTHROUGH */
10097             case '1': case '2': case '3': case '4': /* (?1) */
10098             case '5': case '6': case '7': case '8': case '9':
10099                 RExC_parse--;
10100               parse_recursion:
10101                 {
10102                     bool is_neg = FALSE;
10103                     parse_start = RExC_parse - 1; /* MJD */
10104                     if (*RExC_parse == '-') {
10105                         RExC_parse++;
10106                         is_neg = TRUE;
10107                     }
10108                     num = grok_atou(RExC_parse, &endptr);
10109                     if (endptr)
10110                         RExC_parse = (char*)endptr;
10111                     if (is_neg) {
10112                         /* Some limit for num? */
10113                         num = -num;
10114                     }
10115                 }
10116                 if (*RExC_parse!=')')
10117                     vFAIL("Expecting close bracket");
10118
10119               gen_recurse_regop:
10120                 if ( paren == '-' ) {
10121                     /*
10122                     Diagram of capture buffer numbering.
10123                     Top line is the normal capture buffer numbers
10124                     Bottom line is the negative indexing as from
10125                     the X (the (?-2))
10126
10127                     +   1 2    3 4 5 X          6 7
10128                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10129                     -   5 4    3 2 1 X          x x
10130
10131                     */
10132                     num = RExC_npar + num;
10133                     if (num < 1)  {
10134                         RExC_parse++;
10135                         vFAIL("Reference to nonexistent group");
10136                     }
10137                 } else if ( paren == '+' ) {
10138                     num = RExC_npar + num - 1;
10139                 }
10140
10141                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10142                 if (!SIZE_ONLY) {
10143                     if (num > (I32)RExC_rx->nparens) {
10144                         RExC_parse++;
10145                         vFAIL("Reference to nonexistent group");
10146                     }
10147                     RExC_recurse_count++;
10148                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10149                         "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10150                               22, "|    |", 1 + depth * 2, "",
10151                               (UV)ARG(ret), (IV)ARG2L(ret)));
10152                 }
10153                 RExC_seen |= REG_RECURSE_SEEN;
10154                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10155                 Set_Node_Offset(ret, parse_start); /* MJD */
10156
10157                 *flagp |= POSTPONED;
10158                 nextchar(pRExC_state);
10159                 return ret;
10160
10161             assert(0); /* NOT REACHED */
10162
10163             case '?':           /* (??...) */
10164                 is_logical = 1;
10165                 if (*RExC_parse != '{') {
10166                     RExC_parse++;
10167                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10168                     vFAIL2utf8f(
10169                         "Sequence (%"UTF8f"...) not recognized",
10170                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10171                     /*NOTREACHED*/
10172                 }
10173                 *flagp |= POSTPONED;
10174                 paren = *RExC_parse++;
10175                 /* FALLTHROUGH */
10176             case '{':           /* (?{...}) */
10177             {
10178                 U32 n = 0;
10179                 struct reg_code_block *cb;
10180
10181                 RExC_seen_zerolen++;
10182
10183                 if (   !pRExC_state->num_code_blocks
10184                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
10185                     || pRExC_state->code_blocks[pRExC_state->code_index].start
10186                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10187                             - RExC_start)
10188                 ) {
10189                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
10190                         FAIL("panic: Sequence (?{...}): no code block found\n");
10191                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
10192                 }
10193                 /* this is a pre-compiled code block (?{...}) */
10194                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10195                 RExC_parse = RExC_start + cb->end;
10196                 if (!SIZE_ONLY) {
10197                     OP *o = cb->block;
10198                     if (cb->src_regex) {
10199                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10200                         RExC_rxi->data->data[n] =
10201                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
10202                         RExC_rxi->data->data[n+1] = (void*)o;
10203                     }
10204                     else {
10205                         n = add_data(pRExC_state,
10206                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10207                         RExC_rxi->data->data[n] = (void*)o;
10208                     }
10209                 }
10210                 pRExC_state->code_index++;
10211                 nextchar(pRExC_state);
10212
10213                 if (is_logical) {
10214                     regnode *eval;
10215                     ret = reg_node(pRExC_state, LOGICAL);
10216
10217                     eval = reg2Lanode(pRExC_state, EVAL,
10218                                        n,
10219
10220                                        /* for later propagation into (??{})
10221                                         * return value */
10222                                        RExC_flags & RXf_PMf_COMPILETIME
10223                                       );
10224                     if (!SIZE_ONLY) {
10225                         ret->flags = 2;
10226                     }
10227                     REGTAIL(pRExC_state, ret, eval);
10228                     /* deal with the length of this later - MJD */
10229                     return ret;
10230                 }
10231                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10232                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10233                 Set_Node_Offset(ret, parse_start);
10234                 return ret;
10235             }
10236             case '(':           /* (?(?{...})...) and (?(?=...)...) */
10237             {
10238                 int is_define= 0;
10239                 const int DEFINE_len = sizeof("DEFINE") - 1;
10240                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
10241                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10242                         || RExC_parse[1] == '<'
10243                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
10244                         I32 flag;
10245                         regnode *tail;
10246
10247                         ret = reg_node(pRExC_state, LOGICAL);
10248                         if (!SIZE_ONLY)
10249                             ret->flags = 1;
10250
10251                         tail = reg(pRExC_state, 1, &flag, depth+1);
10252                         if (flag & RESTART_UTF8) {
10253                             *flagp = RESTART_UTF8;
10254                             return NULL;
10255                         }
10256                         REGTAIL(pRExC_state, ret, tail);
10257                         goto insert_if;
10258                     }
10259                     /* Fall through to ‘Unknown switch condition’ at the
10260                        end of the if/else chain. */
10261                 }
10262                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10263                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10264                 {
10265                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
10266                     char *name_start= RExC_parse++;
10267                     U32 num = 0;
10268                     SV *sv_dat=reg_scan_name(pRExC_state,
10269                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10270                     if (RExC_parse == name_start || *RExC_parse != ch)
10271                         vFAIL2("Sequence (?(%c... not terminated",
10272                             (ch == '>' ? '<' : ch));
10273                     RExC_parse++;
10274                     if (!SIZE_ONLY) {
10275                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10276                         RExC_rxi->data->data[num]=(void*)sv_dat;
10277                         SvREFCNT_inc_simple_void(sv_dat);
10278                     }
10279                     ret = reganode(pRExC_state,NGROUPP,num);
10280                     goto insert_if_check_paren;
10281                 }
10282                 else if (strnEQ(RExC_parse, "DEFINE",
10283                                        MIN(DEFINE_len, RExC_end - RExC_parse)))
10284                 {
10285                     ret = reganode(pRExC_state,DEFINEP,0);
10286                     RExC_parse += DEFINE_len;
10287                     is_define = 1;
10288                     goto insert_if_check_paren;
10289                 }
10290                 else if (RExC_parse[0] == 'R') {
10291                     RExC_parse++;
10292                     parno = 0;
10293                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10294                         parno = grok_atou(RExC_parse, &endptr);
10295                         if (endptr)
10296                             RExC_parse = (char*)endptr;
10297                     } else if (RExC_parse[0] == '&') {
10298                         SV *sv_dat;
10299                         RExC_parse++;
10300                         sv_dat = reg_scan_name(pRExC_state,
10301                             SIZE_ONLY
10302                             ? REG_RSN_RETURN_NULL
10303                             : REG_RSN_RETURN_DATA);
10304                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10305                     }
10306                     ret = reganode(pRExC_state,INSUBP,parno);
10307                     goto insert_if_check_paren;
10308                 }
10309                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10310                     /* (?(1)...) */
10311                     char c;
10312                     char *tmp;
10313                     parno = grok_atou(RExC_parse, &endptr);
10314                     if (endptr)
10315                         RExC_parse = (char*)endptr;
10316                     ret = reganode(pRExC_state, GROUPP, parno);
10317
10318                  insert_if_check_paren:
10319                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10320                         /* nextchar also skips comments, so undo its work
10321                          * and skip over the the next character.
10322                          */
10323                         RExC_parse = tmp;
10324                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10325                         vFAIL("Switch condition not recognized");
10326                     }
10327                   insert_if:
10328                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10329                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10330                     if (br == NULL) {
10331                         if (flags & RESTART_UTF8) {
10332                             *flagp = RESTART_UTF8;
10333                             return NULL;
10334                         }
10335                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10336                               (UV) flags);
10337                     } else
10338                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10339                                                           LONGJMP, 0));
10340                     c = *nextchar(pRExC_state);
10341                     if (flags&HASWIDTH)
10342                         *flagp |= HASWIDTH;
10343                     if (c == '|') {
10344                         if (is_define)
10345                             vFAIL("(?(DEFINE)....) does not allow branches");
10346
10347                         /* Fake one for optimizer.  */
10348                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10349
10350                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10351                             if (flags & RESTART_UTF8) {
10352                                 *flagp = RESTART_UTF8;
10353                                 return NULL;
10354                             }
10355                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10356                                   (UV) flags);
10357                         }
10358                         REGTAIL(pRExC_state, ret, lastbr);
10359                         if (flags&HASWIDTH)
10360                             *flagp |= HASWIDTH;
10361                         c = *nextchar(pRExC_state);
10362                     }
10363                     else
10364                         lastbr = NULL;
10365                     if (c != ')') {
10366                         if (RExC_parse>RExC_end)
10367                             vFAIL("Switch (?(condition)... not terminated");
10368                         else
10369                             vFAIL("Switch (?(condition)... contains too many branches");
10370                     }
10371                     ender = reg_node(pRExC_state, TAIL);
10372                     REGTAIL(pRExC_state, br, ender);
10373                     if (lastbr) {
10374                         REGTAIL(pRExC_state, lastbr, ender);
10375                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10376                     }
10377                     else
10378                         REGTAIL(pRExC_state, ret, ender);
10379                     RExC_size++; /* XXX WHY do we need this?!!
10380                                     For large programs it seems to be required
10381                                     but I can't figure out why. -- dmq*/
10382                     return ret;
10383                 }
10384                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10385                 vFAIL("Unknown switch condition (?(...))");
10386             }
10387             case '[':           /* (?[ ... ]) */
10388                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10389                                          oregcomp_parse);
10390             case 0:
10391                 RExC_parse--; /* for vFAIL to print correctly */
10392                 vFAIL("Sequence (? incomplete");
10393                 break;
10394             default: /* e.g., (?i) */
10395                 --RExC_parse;
10396               parse_flags:
10397                 parse_lparen_question_flags(pRExC_state);
10398                 if (UCHARAT(RExC_parse) != ':') {
10399                     nextchar(pRExC_state);
10400                     *flagp = TRYAGAIN;
10401                     return NULL;
10402                 }
10403                 paren = ':';
10404                 nextchar(pRExC_state);
10405                 ret = NULL;
10406                 goto parse_rest;
10407             } /* end switch */
10408         }
10409         else {                  /* (...) */
10410           capturing_parens:
10411             parno = RExC_npar;
10412             RExC_npar++;
10413
10414             ret = reganode(pRExC_state, OPEN, parno);
10415             if (!SIZE_ONLY ){
10416                 if (!RExC_nestroot)
10417                     RExC_nestroot = parno;
10418                 if (RExC_seen & REG_RECURSE_SEEN
10419                     && !RExC_open_parens[parno-1])
10420                 {
10421                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10422                         "%*s%*s Setting open paren #%"IVdf" to %d\n",
10423                         22, "|    |", 1+2 * depth, "",
10424                         (IV)parno, REG_NODE_NUM(ret)));
10425                     RExC_open_parens[parno-1]= ret;
10426                 }
10427             }
10428             Set_Node_Length(ret, 1); /* MJD */
10429             Set_Node_Offset(ret, RExC_parse); /* MJD */
10430             is_open = 1;
10431         }
10432     }
10433     else                        /* ! paren */
10434         ret = NULL;
10435
10436    parse_rest:
10437     /* Pick up the branches, linking them together. */
10438     parse_start = RExC_parse;   /* MJD */
10439     br = regbranch(pRExC_state, &flags, 1,depth+1);
10440
10441     /*     branch_len = (paren != 0); */
10442
10443     if (br == NULL) {
10444         if (flags & RESTART_UTF8) {
10445             *flagp = RESTART_UTF8;
10446             return NULL;
10447         }
10448         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10449     }
10450     if (*RExC_parse == '|') {
10451         if (!SIZE_ONLY && RExC_extralen) {
10452             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10453         }
10454         else {                  /* MJD */
10455             reginsert(pRExC_state, BRANCH, br, depth+1);
10456             Set_Node_Length(br, paren != 0);
10457             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10458         }
10459         have_branch = 1;
10460         if (SIZE_ONLY)
10461             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10462     }
10463     else if (paren == ':') {
10464         *flagp |= flags&SIMPLE;
10465     }
10466     if (is_open) {                              /* Starts with OPEN. */
10467         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10468     }
10469     else if (paren != '?')              /* Not Conditional */
10470         ret = br;
10471     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10472     lastbr = br;
10473     while (*RExC_parse == '|') {
10474         if (!SIZE_ONLY && RExC_extralen) {
10475             ender = reganode(pRExC_state, LONGJMP,0);
10476
10477             /* Append to the previous. */
10478             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10479         }
10480         if (SIZE_ONLY)
10481             RExC_extralen += 2;         /* Account for LONGJMP. */
10482         nextchar(pRExC_state);
10483         if (freeze_paren) {
10484             if (RExC_npar > after_freeze)
10485                 after_freeze = RExC_npar;
10486             RExC_npar = freeze_paren;
10487         }
10488         br = regbranch(pRExC_state, &flags, 0, depth+1);
10489
10490         if (br == NULL) {
10491             if (flags & RESTART_UTF8) {
10492                 *flagp = RESTART_UTF8;
10493                 return NULL;
10494             }
10495             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10496         }
10497         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10498         lastbr = br;
10499         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10500     }
10501
10502     if (have_branch || paren != ':') {
10503         /* Make a closing node, and hook it on the end. */
10504         switch (paren) {
10505         case ':':
10506             ender = reg_node(pRExC_state, TAIL);
10507             break;
10508         case 1: case 2:
10509             ender = reganode(pRExC_state, CLOSE, parno);
10510             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10511                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10512                         "%*s%*s Setting close paren #%"IVdf" to %d\n",
10513                         22, "|    |", 1+2 * depth, "", (IV)parno, REG_NODE_NUM(ender)));
10514                 RExC_close_parens[parno-1]= ender;
10515                 if (RExC_nestroot == parno)
10516                     RExC_nestroot = 0;
10517             }
10518             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10519             Set_Node_Length(ender,1); /* MJD */
10520             break;
10521         case '<':
10522         case ',':
10523         case '=':
10524         case '!':
10525             *flagp &= ~HASWIDTH;
10526             /* FALLTHROUGH */
10527         case '>':
10528             ender = reg_node(pRExC_state, SUCCEED);
10529             break;
10530         case 0:
10531             ender = reg_node(pRExC_state, END);
10532             if (!SIZE_ONLY) {
10533                 assert(!RExC_opend); /* there can only be one! */
10534                 RExC_opend = ender;
10535             }
10536             break;
10537         }
10538         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10539             DEBUG_PARSE_MSG("lsbr");
10540             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10541             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10542             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10543                           SvPV_nolen_const(RExC_mysv1),
10544                           (IV)REG_NODE_NUM(lastbr),
10545                           SvPV_nolen_const(RExC_mysv2),
10546                           (IV)REG_NODE_NUM(ender),
10547                           (IV)(ender - lastbr)
10548             );
10549         });
10550         REGTAIL(pRExC_state, lastbr, ender);
10551
10552         if (have_branch && !SIZE_ONLY) {
10553             char is_nothing= 1;
10554             if (depth==1)
10555                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10556
10557             /* Hook the tails of the branches to the closing node. */
10558             for (br = ret; br; br = regnext(br)) {
10559                 const U8 op = PL_regkind[OP(br)];
10560                 if (op == BRANCH) {
10561                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10562                     if ( OP(NEXTOPER(br)) != NOTHING
10563                          || regnext(NEXTOPER(br)) != ender)
10564                         is_nothing= 0;
10565                 }
10566                 else if (op == BRANCHJ) {
10567                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10568                     /* for now we always disable this optimisation * /
10569                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10570                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10571                     */
10572                         is_nothing= 0;
10573                 }
10574             }
10575             if (is_nothing) {
10576                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10577                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10578                     DEBUG_PARSE_MSG("NADA");
10579                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10580                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10581                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10582                                   SvPV_nolen_const(RExC_mysv1),
10583                                   (IV)REG_NODE_NUM(ret),
10584                                   SvPV_nolen_const(RExC_mysv2),
10585                                   (IV)REG_NODE_NUM(ender),
10586                                   (IV)(ender - ret)
10587                     );
10588                 });
10589                 OP(br)= NOTHING;
10590                 if (OP(ender) == TAIL) {
10591                     NEXT_OFF(br)= 0;
10592                     RExC_emit= br + 1;
10593                 } else {
10594                     regnode *opt;
10595                     for ( opt= br + 1; opt < ender ; opt++ )
10596                         OP(opt)= OPTIMIZED;
10597                     NEXT_OFF(br)= ender - br;
10598                 }
10599             }
10600         }
10601     }
10602
10603     {
10604         const char *p;
10605         static const char parens[] = "=!<,>";
10606
10607         if (paren && (p = strchr(parens, paren))) {
10608             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10609             int flag = (p - parens) > 1;
10610
10611             if (paren == '>')
10612                 node = SUSPEND, flag = 0;
10613             reginsert(pRExC_state, node,ret, depth+1);
10614             Set_Node_Cur_Length(ret, parse_start);
10615             Set_Node_Offset(ret, parse_start + 1);
10616             ret->flags = flag;
10617             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10618         }
10619     }
10620
10621     /* Check for proper termination. */
10622     if (paren) {
10623         /* restore original flags, but keep (?p) */
10624         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10625         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10626             RExC_parse = oregcomp_parse;
10627             vFAIL("Unmatched (");
10628         }
10629     }
10630     else if (!paren && RExC_parse < RExC_end) {
10631         if (*RExC_parse == ')') {
10632             RExC_parse++;
10633             vFAIL("Unmatched )");
10634         }
10635         else
10636             FAIL("Junk on end of regexp");      /* "Can't happen". */
10637         assert(0); /* NOTREACHED */
10638     }
10639
10640     if (RExC_in_lookbehind) {
10641         RExC_in_lookbehind--;
10642     }
10643     if (after_freeze > RExC_npar)
10644         RExC_npar = after_freeze;
10645     return(ret);
10646 }
10647
10648 /*
10649  - regbranch - one alternative of an | operator
10650  *
10651  * Implements the concatenation operator.
10652  *
10653  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10654  * restarted.
10655  */
10656 STATIC regnode *
10657 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10658 {
10659     regnode *ret;
10660     regnode *chain = NULL;
10661     regnode *latest;
10662     I32 flags = 0, c = 0;
10663     GET_RE_DEBUG_FLAGS_DECL;
10664
10665     PERL_ARGS_ASSERT_REGBRANCH;
10666
10667     DEBUG_PARSE("brnc");
10668
10669     if (first)
10670         ret = NULL;
10671     else {
10672         if (!SIZE_ONLY && RExC_extralen)
10673             ret = reganode(pRExC_state, BRANCHJ,0);
10674         else {
10675             ret = reg_node(pRExC_state, BRANCH);
10676             Set_Node_Length(ret, 1);
10677         }
10678     }
10679
10680     if (!first && SIZE_ONLY)
10681         RExC_extralen += 1;                     /* BRANCHJ */
10682
10683     *flagp = WORST;                     /* Tentatively. */
10684
10685     RExC_parse--;
10686     nextchar(pRExC_state);
10687     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10688         flags &= ~TRYAGAIN;
10689         latest = regpiece(pRExC_state, &flags,depth+1);
10690         if (latest == NULL) {
10691             if (flags & TRYAGAIN)
10692                 continue;
10693             if (flags & RESTART_UTF8) {
10694                 *flagp = RESTART_UTF8;
10695                 return NULL;
10696             }
10697             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10698         }
10699         else if (ret == NULL)
10700             ret = latest;
10701         *flagp |= flags&(HASWIDTH|POSTPONED);
10702         if (chain == NULL)      /* First piece. */
10703             *flagp |= flags&SPSTART;
10704         else {
10705             RExC_naughty++;
10706             REGTAIL(pRExC_state, chain, latest);
10707         }
10708         chain = latest;
10709         c++;
10710     }
10711     if (chain == NULL) {        /* Loop ran zero times. */
10712         chain = reg_node(pRExC_state, NOTHING);
10713         if (ret == NULL)
10714             ret = chain;
10715     }
10716     if (c == 1) {
10717         *flagp |= flags&SIMPLE;
10718     }
10719
10720     return ret;
10721 }
10722
10723 /*
10724  - regpiece - something followed by possible [*+?]
10725  *
10726  * Note that the branching code sequences used for ? and the general cases
10727  * of * and + are somewhat optimized:  they use the same NOTHING node as
10728  * both the endmarker for their branch list and the body of the last branch.
10729  * It might seem that this node could be dispensed with entirely, but the
10730  * endmarker role is not redundant.
10731  *
10732  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10733  * TRYAGAIN.
10734  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10735  * restarted.
10736  */
10737 STATIC regnode *
10738 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10739 {
10740     regnode *ret;
10741     char op;
10742     char *next;
10743     I32 flags;
10744     const char * const origparse = RExC_parse;
10745     I32 min;
10746     I32 max = REG_INFTY;
10747 #ifdef RE_TRACK_PATTERN_OFFSETS
10748     char *parse_start;
10749 #endif
10750     const char *maxpos = NULL;
10751
10752     /* Save the original in case we change the emitted regop to a FAIL. */
10753     regnode * const orig_emit = RExC_emit;
10754
10755     GET_RE_DEBUG_FLAGS_DECL;
10756
10757     PERL_ARGS_ASSERT_REGPIECE;
10758
10759     DEBUG_PARSE("piec");
10760
10761     ret = regatom(pRExC_state, &flags,depth+1);
10762     if (ret == NULL) {
10763         if (flags & (TRYAGAIN|RESTART_UTF8))
10764             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10765         else
10766             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10767         return(NULL);
10768     }
10769
10770     op = *RExC_parse;
10771
10772     if (op == '{' && regcurly(RExC_parse)) {
10773         maxpos = NULL;
10774 #ifdef RE_TRACK_PATTERN_OFFSETS
10775         parse_start = RExC_parse; /* MJD */
10776 #endif
10777         next = RExC_parse + 1;
10778         while (isDIGIT(*next) || *next == ',') {
10779             if (*next == ',') {
10780                 if (maxpos)
10781                     break;
10782                 else
10783                     maxpos = next;
10784             }
10785             next++;
10786         }
10787         if (*next == '}') {             /* got one */
10788             const char* endptr;
10789             if (!maxpos)
10790                 maxpos = next;
10791             RExC_parse++;
10792             min = grok_atou(RExC_parse, &endptr);
10793             if (*maxpos == ',')
10794                 maxpos++;
10795             else
10796                 maxpos = RExC_parse;
10797             max = grok_atou(maxpos, &endptr);
10798             if (!max && *maxpos != '0')
10799                 max = REG_INFTY;                /* meaning "infinity" */
10800             else if (max >= REG_INFTY)
10801                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10802             RExC_parse = next;
10803             nextchar(pRExC_state);
10804             if (max < min) {    /* If can't match, warn and optimize to fail
10805                                    unconditionally */
10806                 if (SIZE_ONLY) {
10807
10808                     /* We can't back off the size because we have to reserve
10809                      * enough space for all the things we are about to throw
10810                      * away, but we can shrink it by the ammount we are about
10811                      * to re-use here */
10812                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10813                 }
10814                 else {
10815                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10816                     RExC_emit = orig_emit;
10817                 }
10818                 ret = reg_node(pRExC_state, OPFAIL);
10819                 return ret;
10820             }
10821             else if (min == max
10822                      && RExC_parse < RExC_end
10823                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10824             {
10825                 if (PASS2) {
10826                     ckWARN2reg(RExC_parse + 1,
10827                                "Useless use of greediness modifier '%c'",
10828                                *RExC_parse);
10829                 }
10830                 /* Absorb the modifier, so later code doesn't see nor use
10831                     * it */
10832                 nextchar(pRExC_state);
10833             }
10834
10835         do_curly:
10836             if ((flags&SIMPLE)) {
10837                 RExC_naughty += 2 + RExC_naughty / 2;
10838                 reginsert(pRExC_state, CURLY, ret, depth+1);
10839                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10840                 Set_Node_Cur_Length(ret, parse_start);
10841             }
10842             else {
10843                 regnode * const w = reg_node(pRExC_state, WHILEM);
10844
10845                 w->flags = 0;
10846                 REGTAIL(pRExC_state, ret, w);
10847                 if (!SIZE_ONLY && RExC_extralen) {
10848                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10849                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10850                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10851                 }
10852                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10853                                 /* MJD hk */
10854                 Set_Node_Offset(ret, parse_start+1);
10855                 Set_Node_Length(ret,
10856                                 op == '{' ? (RExC_parse - parse_start) : 1);
10857
10858                 if (!SIZE_ONLY && RExC_extralen)
10859                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10860                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10861                 if (SIZE_ONLY)
10862                     RExC_whilem_seen++, RExC_extralen += 3;
10863                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10864             }
10865             ret->flags = 0;
10866
10867             if (min > 0)
10868                 *flagp = WORST;
10869             if (max > 0)
10870                 *flagp |= HASWIDTH;
10871             if (!SIZE_ONLY) {
10872                 ARG1_SET(ret, (U16)min);
10873                 ARG2_SET(ret, (U16)max);
10874             }
10875             if (max == REG_INFTY)
10876                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10877
10878             goto nest_check;
10879         }
10880     }
10881
10882     if (!ISMULT1(op)) {
10883         *flagp = flags;
10884         return(ret);
10885     }
10886
10887 #if 0                           /* Now runtime fix should be reliable. */
10888
10889     /* if this is reinstated, don't forget to put this back into perldiag:
10890
10891             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10892
10893            (F) The part of the regexp subject to either the * or + quantifier
10894            could match an empty string. The {#} shows in the regular
10895            expression about where the problem was discovered.
10896
10897     */
10898
10899     if (!(flags&HASWIDTH) && op != '?')
10900       vFAIL("Regexp *+ operand could be empty");
10901 #endif
10902
10903 #ifdef RE_TRACK_PATTERN_OFFSETS
10904     parse_start = RExC_parse;
10905 #endif
10906     nextchar(pRExC_state);
10907
10908     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10909
10910     if (op == '*' && (flags&SIMPLE)) {
10911         reginsert(pRExC_state, STAR, ret, depth+1);
10912         ret->flags = 0;
10913         RExC_naughty += 4;
10914         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10915     }
10916     else if (op == '*') {
10917         min = 0;
10918         goto do_curly;
10919     }
10920     else if (op == '+' && (flags&SIMPLE)) {
10921         reginsert(pRExC_state, PLUS, ret, depth+1);
10922         ret->flags = 0;
10923         RExC_naughty += 3;
10924         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10925     }
10926     else if (op == '+') {
10927         min = 1;
10928         goto do_curly;
10929     }
10930     else if (op == '?') {
10931         min = 0; max = 1;
10932         goto do_curly;
10933     }
10934   nest_check:
10935     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10936         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10937         ckWARN2reg(RExC_parse,
10938                    "%"UTF8f" matches null string many times",
10939                    UTF8fARG(UTF, (RExC_parse >= origparse
10940                                  ? RExC_parse - origparse
10941                                  : 0),
10942                    origparse));
10943         (void)ReREFCNT_inc(RExC_rx_sv);
10944     }
10945
10946     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10947         nextchar(pRExC_state);
10948         reginsert(pRExC_state, MINMOD, ret, depth+1);
10949         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10950     }
10951     else
10952     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10953         regnode *ender;
10954         nextchar(pRExC_state);
10955         ender = reg_node(pRExC_state, SUCCEED);
10956         REGTAIL(pRExC_state, ret, ender);
10957         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10958         ret->flags = 0;
10959         ender = reg_node(pRExC_state, TAIL);
10960         REGTAIL(pRExC_state, ret, ender);
10961     }
10962
10963     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10964         RExC_parse++;
10965         vFAIL("Nested quantifiers");
10966     }
10967
10968     return(ret);
10969 }
10970
10971 STATIC STRLEN
10972 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10973                       UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10974     )
10975 {
10976
10977  /* This is expected to be called by a parser routine that has recognized '\N'
10978    and needs to handle the rest. RExC_parse is expected to point at the first
10979    char following the N at the time of the call.  On successful return,
10980    RExC_parse has been updated to point to just after the sequence identified
10981    by this routine, <*flagp> has been updated, and the non-NULL input pointers
10982    have been set appropriately.
10983
10984    The typical case for this is \N{some character name}.  This is usually
10985    called while parsing the input, filling in or ready to fill in an EXACTish
10986    node, and the code point for the character should be returned, so that it
10987    can be added to the node, and parsing continued with the next input
10988    character.  But it may be that instead of a single character the \N{}
10989    expands to more than one, a named sequence.  In this case any following
10990    quantifier applies to the whole sequence, and it is easier, given the code
10991    structure that calls this, to handle it from a different area of the code.
10992    For this reason, the input parameters can be set so that it returns valid
10993    only on one or the other of these cases.
10994
10995    Another possibility is for the input to be an empty \N{}, which for
10996    backwards compatibility we accept, but generate a NOTHING node which should
10997    later get optimized out.  This is handled from the area of code which can
10998    handle a named sequence, so if called with the parameters for the other, it
10999    fails.
11000
11001    Still another possibility is for the \N to mean [^\n], and not a single
11002    character or explicit sequence at all.  This is determined by context.
11003    Again, this is handled from the area of code which can handle a named
11004    sequence, so if called with the parameters for the other, it also fails.
11005
11006    And the final possibility is for the \N to be called from within a bracketed
11007    character class.  In this case the [^\n] meaning makes no sense, and so is
11008    an error.  Other anomalous situations are left to the calling code to handle.
11009
11010    For non-single-quoted regexes, the tokenizer has attempted to decide which
11011    of the above applies, and in the case of a named sequence, has converted it
11012    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11013    where c1... are the characters in the sequence.  For single-quoted regexes,
11014    the tokenizer passes the \N sequence through unchanged; this code will not
11015    attempt to determine this nor expand those, instead raising a syntax error.
11016    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11017    or there is no '}', it signals that this \N occurrence means to match a
11018    non-newline. (This mostly was done because of [perl #56444].)
11019
11020    The API is somewhat convoluted due to historical and the above reasons.
11021
11022    The function raises an error (via vFAIL), and doesn't return for various
11023    syntax errors.  For other failures, it returns (STRLEN) -1.  For successes,
11024    it returns a count of how many characters were accounted for by it.  (This
11025    can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11026    points in the sequence.  It sets <node_p>, <valuep>, and/or
11027    <substitute_parse> on success.
11028
11029    If <valuep> is non-null, it means the caller can accept an input sequence
11030    consisting of a just a single code point; <*valuep> is set to the value
11031    of the only or first code point in the input.
11032
11033    If <substitute_parse> is non-null, it means the caller can accept an input
11034    sequence consisting of one or more code points; <*substitute_parse> is a
11035    newly created mortal SV* in this case, containing \x{} escapes representing
11036    those code points.
11037
11038    Both <valuep> and <substitute_parse> can be non-NULL.
11039
11040    If <node_p> is non-null, <substitute_parse> must be NULL.  This signifies
11041    that the caller can accept any legal sequence other than a single code
11042    point.  To wit, <*node_p> is set as follows:
11043     1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11044     2) \N{}:              points to a new NOTHING node; return is 0
11045     3) otherwise:         points to a new EXACT node containing the resolved
11046                           string; return is the number of code points in the
11047                           string.  This will never be 1.
11048    Note that failure is returned for single code point sequences if <valuep> is
11049    null and <node_p> is not.
11050  */
11051
11052     char * endbrace;    /* '}' following the name */
11053     char* p;
11054     char *endchar;      /* Points to '.' or '}' ending cur char in the input
11055                            stream */
11056     bool has_multiple_chars; /* true if the input stream contains a sequence of
11057                                 more than one character */
11058     bool in_char_class = substitute_parse != NULL;
11059     STRLEN count = 0;   /* Number of characters in this sequence */
11060
11061     GET_RE_DEBUG_FLAGS_DECL;
11062
11063     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11064
11065     GET_RE_DEBUG_FLAGS;
11066
11067     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
11068     assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11069
11070     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11071      * modifier.  The other meaning does not, so use a temporary until we find
11072      * out which we are being called with */
11073     p = (RExC_flags & RXf_PMf_EXTENDED)
11074         ? regpatws(pRExC_state, RExC_parse,
11075                                 TRUE) /* means recognize comments */
11076         : RExC_parse;
11077
11078     /* Disambiguate between \N meaning a named character versus \N meaning
11079      * [^\n].  The former is assumed when it can't be the latter. */
11080     if (*p != '{' || regcurly(p)) {
11081         RExC_parse = p;
11082         if (! node_p) {
11083             /* no bare \N allowed in a charclass */
11084             if (in_char_class) {
11085                 vFAIL("\\N in a character class must be a named character: \\N{...}");
11086             }
11087             return (STRLEN) -1;
11088         }
11089         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
11090                            current char */
11091         nextchar(pRExC_state);
11092         *node_p = reg_node(pRExC_state, REG_ANY);
11093         *flagp |= HASWIDTH|SIMPLE;
11094         RExC_naughty++;
11095         Set_Node_Length(*node_p, 1); /* MJD */
11096         return 1;
11097     }
11098
11099     /* Here, we have decided it should be a named character or sequence */
11100
11101     /* The test above made sure that the next real character is a '{', but
11102      * under the /x modifier, it could be separated by space (or a comment and
11103      * \n) and this is not allowed (for consistency with \x{...} and the
11104      * tokenizer handling of \N{NAME}). */
11105     if (*RExC_parse != '{') {
11106         vFAIL("Missing braces on \\N{}");
11107     }
11108
11109     RExC_parse++;       /* Skip past the '{' */
11110
11111     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11112         || ! (endbrace == RExC_parse            /* nothing between the {} */
11113               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
11114                                                  */
11115                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
11116                                                      */
11117     {
11118         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
11119         vFAIL("\\N{NAME} must be resolved by the lexer");
11120     }
11121
11122     if (endbrace == RExC_parse) {   /* empty: \N{} */
11123         if (node_p) {
11124             *node_p = reg_node(pRExC_state,NOTHING);
11125         }
11126         else if (! in_char_class) {
11127             return (STRLEN) -1;
11128         }
11129         nextchar(pRExC_state);
11130         return 0;
11131     }
11132
11133     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11134     RExC_parse += 2;    /* Skip past the 'U+' */
11135
11136     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11137
11138     /* Code points are separated by dots.  If none, there is only one code
11139      * point, and is terminated by the brace */
11140     has_multiple_chars = (endchar < endbrace);
11141
11142     /* We get the first code point if we want it, and either there is only one,
11143      * or we can accept both cases of one and more than one */
11144     if (valuep && (substitute_parse || ! has_multiple_chars)) {
11145         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11146         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11147                            | PERL_SCAN_DISALLOW_PREFIX
11148
11149                              /* No errors in the first pass (See [perl
11150                               * #122671].)  We let the code below find the
11151                               * errors when there are multiple chars. */
11152                            | ((SIZE_ONLY || has_multiple_chars)
11153                               ? PERL_SCAN_SILENT_ILLDIGIT
11154                               : 0);
11155
11156         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11157
11158         /* The tokenizer should have guaranteed validity, but it's possible to
11159          * bypass it by using single quoting, so check.  Don't do the check
11160          * here when there are multiple chars; we do it below anyway. */
11161         if (! has_multiple_chars) {
11162             if (length_of_hex == 0
11163                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11164             {
11165                 RExC_parse += length_of_hex;    /* Includes all the valid */
11166                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
11167                                 ? UTF8SKIP(RExC_parse)
11168                                 : 1;
11169                 /* Guard against malformed utf8 */
11170                 if (RExC_parse >= endchar) {
11171                     RExC_parse = endchar;
11172                 }
11173                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11174             }
11175
11176             RExC_parse = endbrace + 1;
11177             return 1;
11178         }
11179     }
11180
11181     /* Here, we should have already handled the case where a single character
11182      * is expected and found.  So it is a failure if we aren't expecting
11183      * multiple chars and got them; or didn't get them but wanted them.  We
11184      * fail without advancing the parse, so that the caller can try again with
11185      * different acceptance criteria */
11186     if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11187         RExC_parse = p;
11188         return (STRLEN) -1;
11189     }
11190
11191     {
11192
11193         /* What is done here is to convert this to a sub-pattern of the form
11194          * \x{char1}\x{char2}...
11195          * and then either return it in <*substitute_parse> if non-null; or
11196          * call reg recursively to parse it (enclosing in "(?: ... )" ).  That
11197          * way, it retains its atomicness, while not having to worry about
11198          * special handling that some code points may have.  toke.c has
11199          * converted the original Unicode values to native, so that we can just
11200          * pass on the hex values unchanged.  We do have to set a flag to keep
11201          * recoding from happening in the recursion */
11202
11203         SV * dummy = NULL;
11204         STRLEN len;
11205         char *orig_end = RExC_end;
11206         I32 flags;
11207
11208         if (substitute_parse) {
11209             *substitute_parse = newSVpvs("");
11210         }
11211         else {
11212             substitute_parse = &dummy;
11213             *substitute_parse = newSVpvs("?:");
11214         }
11215         *substitute_parse = sv_2mortal(*substitute_parse);
11216
11217         while (RExC_parse < endbrace) {
11218
11219             /* Convert to notation the rest of the code understands */
11220             sv_catpv(*substitute_parse, "\\x{");
11221             sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11222             sv_catpv(*substitute_parse, "}");
11223
11224             /* Point to the beginning of the next character in the sequence. */
11225             RExC_parse = endchar + 1;
11226             endchar = RExC_parse + strcspn(RExC_parse, ".}");
11227
11228             count++;
11229         }
11230         if (! in_char_class) {
11231             sv_catpv(*substitute_parse, ")");
11232         }
11233
11234         RExC_parse = SvPV(*substitute_parse, len);
11235
11236         /* Don't allow empty number */
11237         if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11238             RExC_parse = endbrace;
11239             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11240         }
11241         RExC_end = RExC_parse + len;
11242
11243         /* The values are Unicode, and therefore not subject to recoding */
11244         RExC_override_recoding = 1;
11245
11246         if (node_p) {
11247             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11248                 if (flags & RESTART_UTF8) {
11249                     *flagp = RESTART_UTF8;
11250                     return (STRLEN) -1;
11251                 }
11252                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11253                     (UV) flags);
11254             }
11255             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11256         }
11257
11258         RExC_parse = endbrace;
11259         RExC_end = orig_end;
11260         RExC_override_recoding = 0;
11261
11262         nextchar(pRExC_state);
11263     }
11264
11265     return count;
11266 }
11267
11268
11269 /*
11270  * reg_recode
11271  *
11272  * It returns the code point in utf8 for the value in *encp.
11273  *    value: a code value in the source encoding
11274  *    encp:  a pointer to an Encode object
11275  *
11276  * If the result from Encode is not a single character,
11277  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11278  */
11279 STATIC UV
11280 S_reg_recode(pTHX_ const char value, SV **encp)
11281 {
11282     STRLEN numlen = 1;
11283     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11284     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11285     const STRLEN newlen = SvCUR(sv);
11286     UV uv = UNICODE_REPLACEMENT;
11287
11288     PERL_ARGS_ASSERT_REG_RECODE;
11289
11290     if (newlen)
11291         uv = SvUTF8(sv)
11292              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11293              : *(U8*)s;
11294
11295     if (!newlen || numlen != newlen) {
11296         uv = UNICODE_REPLACEMENT;
11297         *encp = NULL;
11298     }
11299     return uv;
11300 }
11301
11302 PERL_STATIC_INLINE U8
11303 S_compute_EXACTish(RExC_state_t *pRExC_state)
11304 {
11305     U8 op;
11306
11307     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11308
11309     if (! FOLD) {
11310         return EXACT;
11311     }
11312
11313     op = get_regex_charset(RExC_flags);
11314     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11315         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11316                  been, so there is no hole */
11317     }
11318
11319     return op + EXACTF;
11320 }
11321
11322 PERL_STATIC_INLINE void
11323 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11324                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11325                          bool downgradable)
11326 {
11327     /* This knows the details about sizing an EXACTish node, setting flags for
11328      * it (by setting <*flagp>, and potentially populating it with a single
11329      * character.
11330      *
11331      * If <len> (the length in bytes) is non-zero, this function assumes that
11332      * the node has already been populated, and just does the sizing.  In this
11333      * case <code_point> should be the final code point that has already been
11334      * placed into the node.  This value will be ignored except that under some
11335      * circumstances <*flagp> is set based on it.
11336      *
11337      * If <len> is zero, the function assumes that the node is to contain only
11338      * the single character given by <code_point> and calculates what <len>
11339      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11340      * additionally will populate the node's STRING with <code_point> or its
11341      * fold if folding.
11342      *
11343      * In both cases <*flagp> is appropriately set
11344      *
11345      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11346      * 255, must be folded (the former only when the rules indicate it can
11347      * match 'ss')
11348      *
11349      * When it does the populating, it looks at the flag 'downgradable'.  If
11350      * true with a node that folds, it checks if the single code point
11351      * participates in a fold, and if not downgrades the node to an EXACT.
11352      * This helps the optimizer */
11353
11354     bool len_passed_in = cBOOL(len != 0);
11355     U8 character[UTF8_MAXBYTES_CASE+1];
11356
11357     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11358
11359     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11360      * sizing difference, and is extra work that is thrown away */
11361     if (downgradable && ! PASS2) {
11362         downgradable = FALSE;
11363     }
11364
11365     if (! len_passed_in) {
11366         if (UTF) {
11367             if (UVCHR_IS_INVARIANT(code_point)) {
11368                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11369                     *character = (U8) code_point;
11370                 }
11371                 else { /* Here is /i and not /l. (toFOLD() is defined on just
11372                           ASCII, which isn't the same thing as INVARIANT on
11373                           EBCDIC, but it works there, as the extra invariants
11374                           fold to themselves) */
11375                     *character = toFOLD((U8) code_point);
11376
11377                     /* We can downgrade to an EXACT node if this character
11378                      * isn't a folding one.  Note that this assumes that
11379                      * nothing above Latin1 folds to some other invariant than
11380                      * one of these alphabetics; otherwise we would also have
11381                      * to check:
11382                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11383                      *      || ASCII_FOLD_RESTRICTED))
11384                      */
11385                     if (downgradable && PL_fold[code_point] == code_point) {
11386                         OP(node) = EXACT;
11387                     }
11388                 }
11389                 len = 1;
11390             }
11391             else if (FOLD && (! LOC
11392                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11393             {   /* Folding, and ok to do so now */
11394                 UV folded = _to_uni_fold_flags(
11395                                    code_point,
11396                                    character,
11397                                    &len,
11398                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11399                                                       ? FOLD_FLAGS_NOMIX_ASCII
11400                                                       : 0));
11401                 if (downgradable
11402                     && folded == code_point /* This quickly rules out many
11403                                                cases, avoiding the
11404                                                _invlist_contains_cp() overhead
11405                                                for those.  */
11406                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11407                 {
11408                     OP(node) = EXACT;
11409                 }
11410             }
11411             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11412
11413                 /* Not folding this cp, and can output it directly */
11414                 *character = UTF8_TWO_BYTE_HI(code_point);
11415                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11416                 len = 2;
11417             }
11418             else {
11419                 uvchr_to_utf8( character, code_point);
11420                 len = UTF8SKIP(character);
11421             }
11422         } /* Else pattern isn't UTF8.  */
11423         else if (! FOLD) {
11424             *character = (U8) code_point;
11425             len = 1;
11426         } /* Else is folded non-UTF8 */
11427         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11428
11429             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11430              * comments at join_exact()); */
11431             *character = (U8) code_point;
11432             len = 1;
11433
11434             /* Can turn into an EXACT node if we know the fold at compile time,
11435              * and it folds to itself and doesn't particpate in other folds */
11436             if (downgradable
11437                 && ! LOC
11438                 && PL_fold_latin1[code_point] == code_point
11439                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11440                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11441             {
11442                 OP(node) = EXACT;
11443             }
11444         } /* else is Sharp s.  May need to fold it */
11445         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11446             *character = 's';
11447             *(character + 1) = 's';
11448             len = 2;
11449         }
11450         else {
11451             *character = LATIN_SMALL_LETTER_SHARP_S;
11452             len = 1;
11453         }
11454     }
11455
11456     if (SIZE_ONLY) {
11457         RExC_size += STR_SZ(len);
11458     }
11459     else {
11460         RExC_emit += STR_SZ(len);
11461         STR_LEN(node) = len;
11462         if (! len_passed_in) {
11463             Copy((char *) character, STRING(node), len, char);
11464         }
11465     }
11466
11467     *flagp |= HASWIDTH;
11468
11469     /* A single character node is SIMPLE, except for the special-cased SHARP S
11470      * under /di. */
11471     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11472         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11473             || ! FOLD || ! DEPENDS_SEMANTICS))
11474     {
11475         *flagp |= SIMPLE;
11476     }
11477
11478     /* The OP may not be well defined in PASS1 */
11479     if (PASS2 && OP(node) == EXACTFL) {
11480         RExC_contains_locale = 1;
11481     }
11482 }
11483
11484
11485 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11486  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11487
11488 static I32
11489 S_backref_value(char *p)
11490 {
11491     const char* endptr;
11492     UV val = grok_atou(p, &endptr);
11493     if (endptr == p || endptr == NULL || val > I32_MAX)
11494         return I32_MAX;
11495     return (I32)val;
11496 }
11497
11498
11499 /*
11500  - regatom - the lowest level
11501
11502    Try to identify anything special at the start of the pattern. If there
11503    is, then handle it as required. This may involve generating a single regop,
11504    such as for an assertion; or it may involve recursing, such as to
11505    handle a () structure.
11506
11507    If the string doesn't start with something special then we gobble up
11508    as much literal text as we can.
11509
11510    Once we have been able to handle whatever type of thing started the
11511    sequence, we return.
11512
11513    Note: we have to be careful with escapes, as they can be both literal
11514    and special, and in the case of \10 and friends, context determines which.
11515
11516    A summary of the code structure is:
11517
11518    switch (first_byte) {
11519         cases for each special:
11520             handle this special;
11521             break;
11522         case '\\':
11523             switch (2nd byte) {
11524                 cases for each unambiguous special:
11525                     handle this special;
11526                     break;
11527                 cases for each ambigous special/literal:
11528                     disambiguate;
11529                     if (special)  handle here
11530                     else goto defchar;
11531                 default: // unambiguously literal:
11532                     goto defchar;
11533             }
11534         default:  // is a literal char
11535             // FALL THROUGH
11536         defchar:
11537             create EXACTish node for literal;
11538             while (more input and node isn't full) {
11539                 switch (input_byte) {
11540                    cases for each special;
11541                        make sure parse pointer is set so that the next call to
11542                            regatom will see this special first
11543                        goto loopdone; // EXACTish node terminated by prev. char
11544                    default:
11545                        append char to EXACTISH node;
11546                 }
11547                 get next input byte;
11548             }
11549         loopdone:
11550    }
11551    return the generated node;
11552
11553    Specifically there are two separate switches for handling
11554    escape sequences, with the one for handling literal escapes requiring
11555    a dummy entry for all of the special escapes that are actually handled
11556    by the other.
11557
11558    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11559    TRYAGAIN.
11560    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11561    restarted.
11562    Otherwise does not return NULL.
11563 */
11564
11565 STATIC regnode *
11566 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11567 {
11568     regnode *ret = NULL;
11569     I32 flags = 0;
11570     char *parse_start = RExC_parse;
11571     U8 op;
11572     int invert = 0;
11573     U8 arg;
11574
11575     GET_RE_DEBUG_FLAGS_DECL;
11576
11577     *flagp = WORST;             /* Tentatively. */
11578
11579     DEBUG_PARSE("atom");
11580
11581     PERL_ARGS_ASSERT_REGATOM;
11582
11583 tryagain:
11584     switch ((U8)*RExC_parse) {
11585     case '^':
11586         RExC_seen_zerolen++;
11587         nextchar(pRExC_state);
11588         if (RExC_flags & RXf_PMf_MULTILINE)
11589             ret = reg_node(pRExC_state, MBOL);
11590         else
11591             ret = reg_node(pRExC_state, SBOL);
11592         Set_Node_Length(ret, 1); /* MJD */
11593         break;
11594     case '$':
11595         nextchar(pRExC_state);
11596         if (*RExC_parse)
11597             RExC_seen_zerolen++;
11598         if (RExC_flags & RXf_PMf_MULTILINE)
11599             ret = reg_node(pRExC_state, MEOL);
11600         else
11601             ret = reg_node(pRExC_state, SEOL);
11602         Set_Node_Length(ret, 1); /* MJD */
11603         break;
11604     case '.':
11605         nextchar(pRExC_state);
11606         if (RExC_flags & RXf_PMf_SINGLELINE)
11607             ret = reg_node(pRExC_state, SANY);
11608         else
11609             ret = reg_node(pRExC_state, REG_ANY);
11610         *flagp |= HASWIDTH|SIMPLE;
11611         RExC_naughty++;
11612         Set_Node_Length(ret, 1); /* MJD */
11613         break;
11614     case '[':
11615     {
11616         char * const oregcomp_parse = ++RExC_parse;
11617         ret = regclass(pRExC_state, flagp,depth+1,
11618                        FALSE, /* means parse the whole char class */
11619                        TRUE, /* allow multi-char folds */
11620                        FALSE, /* don't silence non-portable warnings. */
11621                        NULL);
11622         if (*RExC_parse != ']') {
11623             RExC_parse = oregcomp_parse;
11624             vFAIL("Unmatched [");
11625         }
11626         if (ret == NULL) {
11627             if (*flagp & RESTART_UTF8)
11628                 return NULL;
11629             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11630                   (UV) *flagp);
11631         }
11632         nextchar(pRExC_state);
11633         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11634         break;
11635     }
11636     case '(':
11637         nextchar(pRExC_state);
11638         ret = reg(pRExC_state, 2, &flags,depth+1);
11639         if (ret == NULL) {
11640                 if (flags & TRYAGAIN) {
11641                     if (RExC_parse == RExC_end) {
11642                          /* Make parent create an empty node if needed. */
11643                         *flagp |= TRYAGAIN;
11644                         return(NULL);
11645                     }
11646                     goto tryagain;
11647                 }
11648                 if (flags & RESTART_UTF8) {
11649                     *flagp = RESTART_UTF8;
11650                     return NULL;
11651                 }
11652                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11653                                                                  (UV) flags);
11654         }
11655         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11656         break;
11657     case '|':
11658     case ')':
11659         if (flags & TRYAGAIN) {
11660             *flagp |= TRYAGAIN;
11661             return NULL;
11662         }
11663         vFAIL("Internal urp");
11664                                 /* Supposed to be caught earlier. */
11665         break;
11666     case '?':
11667     case '+':
11668     case '*':
11669         RExC_parse++;
11670         vFAIL("Quantifier follows nothing");
11671         break;
11672     case '\\':
11673         /* Special Escapes
11674
11675            This switch handles escape sequences that resolve to some kind
11676            of special regop and not to literal text. Escape sequnces that
11677            resolve to literal text are handled below in the switch marked
11678            "Literal Escapes".
11679
11680            Every entry in this switch *must* have a corresponding entry
11681            in the literal escape switch. However, the opposite is not
11682            required, as the default for this switch is to jump to the
11683            literal text handling code.
11684         */
11685         switch ((U8)*++RExC_parse) {
11686         /* Special Escapes */
11687         case 'A':
11688             RExC_seen_zerolen++;
11689             ret = reg_node(pRExC_state, SBOL);
11690             /* SBOL is shared with /^/ so we set the flags so we can tell
11691              * /\A/ from /^/ in split. We check ret because first pass we
11692              * have no regop struct to set the flags on. */
11693             if (PASS2)
11694                 ret->flags = 1;
11695             *flagp |= SIMPLE;
11696             goto finish_meta_pat;
11697         case 'G':
11698             ret = reg_node(pRExC_state, GPOS);
11699             RExC_seen |= REG_GPOS_SEEN;
11700             *flagp |= SIMPLE;
11701             goto finish_meta_pat;
11702         case 'K':
11703             RExC_seen_zerolen++;
11704             ret = reg_node(pRExC_state, KEEPS);
11705             *flagp |= SIMPLE;
11706             /* XXX:dmq : disabling in-place substitution seems to
11707              * be necessary here to avoid cases of memory corruption, as
11708              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11709              */
11710             RExC_seen |= REG_LOOKBEHIND_SEEN;
11711             goto finish_meta_pat;
11712         case 'Z':
11713             ret = reg_node(pRExC_state, SEOL);
11714             *flagp |= SIMPLE;
11715             RExC_seen_zerolen++;                /* Do not optimize RE away */
11716             goto finish_meta_pat;
11717         case 'z':
11718             ret = reg_node(pRExC_state, EOS);
11719             *flagp |= SIMPLE;
11720             RExC_seen_zerolen++;                /* Do not optimize RE away */
11721             goto finish_meta_pat;
11722         case 'C':
11723             ret = reg_node(pRExC_state, CANY);
11724             RExC_seen |= REG_CANY_SEEN;
11725             *flagp |= HASWIDTH|SIMPLE;
11726             if (PASS2) {
11727                 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11728             }
11729             goto finish_meta_pat;
11730         case 'X':
11731             ret = reg_node(pRExC_state, CLUMP);
11732             *flagp |= HASWIDTH;
11733             goto finish_meta_pat;
11734
11735         case 'W':
11736             invert = 1;
11737             /* FALLTHROUGH */
11738         case 'w':
11739             arg = ANYOF_WORDCHAR;
11740             goto join_posix;
11741
11742         case 'b':
11743             RExC_seen_zerolen++;
11744             RExC_seen |= REG_LOOKBEHIND_SEEN;
11745             op = BOUND + get_regex_charset(RExC_flags);
11746             if (op > BOUNDA) {  /* /aa is same as /a */
11747                 op = BOUNDA;
11748             }
11749             else if (op == BOUNDL) {
11750                 RExC_contains_locale = 1;
11751             }
11752             ret = reg_node(pRExC_state, op);
11753             FLAGS(ret) = get_regex_charset(RExC_flags);
11754             *flagp |= SIMPLE;
11755             if ((U8) *(RExC_parse + 1) == '{') {
11756                 /* diag_listed_as: Use "%s" instead of "%s" */
11757                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11758             }
11759             goto finish_meta_pat;
11760         case 'B':
11761             RExC_seen_zerolen++;
11762             RExC_seen |= REG_LOOKBEHIND_SEEN;
11763             op = NBOUND + get_regex_charset(RExC_flags);
11764             if (op > NBOUNDA) { /* /aa is same as /a */
11765                 op = NBOUNDA;
11766             }
11767             else if (op == NBOUNDL) {
11768                 RExC_contains_locale = 1;
11769             }
11770             ret = reg_node(pRExC_state, op);
11771             FLAGS(ret) = get_regex_charset(RExC_flags);
11772             *flagp |= SIMPLE;
11773             if ((U8) *(RExC_parse + 1) == '{') {
11774                 /* diag_listed_as: Use "%s" instead of "%s" */
11775                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11776             }
11777             goto finish_meta_pat;
11778
11779         case 'D':
11780             invert = 1;
11781             /* FALLTHROUGH */
11782         case 'd':
11783             arg = ANYOF_DIGIT;
11784             goto join_posix;
11785
11786         case 'R':
11787             ret = reg_node(pRExC_state, LNBREAK);
11788             *flagp |= HASWIDTH|SIMPLE;
11789             goto finish_meta_pat;
11790
11791         case 'H':
11792             invert = 1;
11793             /* FALLTHROUGH */
11794         case 'h':
11795             arg = ANYOF_BLANK;
11796             op = POSIXU;
11797             goto join_posix_op_known;
11798
11799         case 'V':
11800             invert = 1;
11801             /* FALLTHROUGH */
11802         case 'v':
11803             arg = ANYOF_VERTWS;
11804             op = POSIXU;
11805             goto join_posix_op_known;
11806
11807         case 'S':
11808             invert = 1;
11809             /* FALLTHROUGH */
11810         case 's':
11811             arg = ANYOF_SPACE;
11812
11813         join_posix:
11814
11815             op = POSIXD + get_regex_charset(RExC_flags);
11816             if (op > POSIXA) {  /* /aa is same as /a */
11817                 op = POSIXA;
11818             }
11819             else if (op == POSIXL) {
11820                 RExC_contains_locale = 1;
11821             }
11822
11823         join_posix_op_known:
11824
11825             if (invert) {
11826                 op += NPOSIXD - POSIXD;
11827             }
11828
11829             ret = reg_node(pRExC_state, op);
11830             if (! SIZE_ONLY) {
11831                 FLAGS(ret) = namedclass_to_classnum(arg);
11832             }
11833
11834             *flagp |= HASWIDTH|SIMPLE;
11835             /* FALLTHROUGH */
11836
11837          finish_meta_pat:
11838             nextchar(pRExC_state);
11839             Set_Node_Length(ret, 2); /* MJD */
11840             break;
11841         case 'p':
11842         case 'P':
11843             {
11844 #ifdef DEBUGGING
11845                 char* parse_start = RExC_parse - 2;
11846 #endif
11847
11848                 RExC_parse--;
11849
11850                 ret = regclass(pRExC_state, flagp,depth+1,
11851                                TRUE, /* means just parse this element */
11852                                FALSE, /* don't allow multi-char folds */
11853                                FALSE, /* don't silence non-portable warnings.
11854                                          It would be a bug if these returned
11855                                          non-portables */
11856                                NULL);
11857                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11858                    are allowed.  */
11859                 if (!ret)
11860                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11861                           (UV) *flagp);
11862
11863                 RExC_parse--;
11864
11865                 Set_Node_Offset(ret, parse_start + 2);
11866                 Set_Node_Cur_Length(ret, parse_start);
11867                 nextchar(pRExC_state);
11868             }
11869             break;
11870         case 'N':
11871             /* Handle \N and \N{NAME} with multiple code points here and not
11872              * below because it can be multicharacter. join_exact() will join
11873              * them up later on.  Also this makes sure that things like
11874              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11875              * The options to the grok function call causes it to fail if the
11876              * sequence is just a single code point.  We then go treat it as
11877              * just another character in the current EXACT node, and hence it
11878              * gets uniform treatment with all the other characters.  The
11879              * special treatment for quantifiers is not needed for such single
11880              * character sequences */
11881             ++RExC_parse;
11882             if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11883                                              depth, FALSE))
11884             {
11885                 if (*flagp & RESTART_UTF8)
11886                     return NULL;
11887                 RExC_parse--;
11888                 goto defchar;
11889             }
11890             break;
11891         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11892         parse_named_seq:
11893         {
11894             char ch= RExC_parse[1];
11895             if (ch != '<' && ch != '\'' && ch != '{') {
11896                 RExC_parse++;
11897                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11898                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11899             } else {
11900                 /* this pretty much dupes the code for (?P=...) in reg(), if
11901                    you change this make sure you change that */
11902                 char* name_start = (RExC_parse += 2);
11903                 U32 num = 0;
11904                 SV *sv_dat = reg_scan_name(pRExC_state,
11905                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11906                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11907                 if (RExC_parse == name_start || *RExC_parse != ch)
11908                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11909                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11910
11911                 if (!SIZE_ONLY) {
11912                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11913                     RExC_rxi->data->data[num]=(void*)sv_dat;
11914                     SvREFCNT_inc_simple_void(sv_dat);
11915                 }
11916
11917                 RExC_sawback = 1;
11918                 ret = reganode(pRExC_state,
11919                                ((! FOLD)
11920                                  ? NREF
11921                                  : (ASCII_FOLD_RESTRICTED)
11922                                    ? NREFFA
11923                                    : (AT_LEAST_UNI_SEMANTICS)
11924                                      ? NREFFU
11925                                      : (LOC)
11926                                        ? NREFFL
11927                                        : NREFF),
11928                                 num);
11929                 *flagp |= HASWIDTH;
11930
11931                 /* override incorrect value set in reganode MJD */
11932                 Set_Node_Offset(ret, parse_start+1);
11933                 Set_Node_Cur_Length(ret, parse_start);
11934                 nextchar(pRExC_state);
11935
11936             }
11937             break;
11938         }
11939         case 'g':
11940         case '1': case '2': case '3': case '4':
11941         case '5': case '6': case '7': case '8': case '9':
11942             {
11943                 I32 num;
11944                 bool hasbrace = 0;
11945
11946                 if (*RExC_parse == 'g') {
11947                     bool isrel = 0;
11948
11949                     RExC_parse++;
11950                     if (*RExC_parse == '{') {
11951                         RExC_parse++;
11952                         hasbrace = 1;
11953                     }
11954                     if (*RExC_parse == '-') {
11955                         RExC_parse++;
11956                         isrel = 1;
11957                     }
11958                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11959                         if (isrel) RExC_parse--;
11960                         RExC_parse -= 2;
11961                         goto parse_named_seq;
11962                     }
11963
11964                     num = S_backref_value(RExC_parse);
11965                     if (num == 0)
11966                         vFAIL("Reference to invalid group 0");
11967                     else if (num == I32_MAX) {
11968                          if (isDIGIT(*RExC_parse))
11969                             vFAIL("Reference to nonexistent group");
11970                         else
11971                             vFAIL("Unterminated \\g... pattern");
11972                     }
11973
11974                     if (isrel) {
11975                         num = RExC_npar - num;
11976                         if (num < 1)
11977                             vFAIL("Reference to nonexistent or unclosed group");
11978                     }
11979                 }
11980                 else {
11981                     num = S_backref_value(RExC_parse);
11982                     /* bare \NNN might be backref or octal - if it is larger than or equal
11983                      * RExC_npar then it is assumed to be and octal escape.
11984                      * Note RExC_npar is +1 from the actual number of parens*/
11985                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11986                             && *RExC_parse != '8' && *RExC_parse != '9'))
11987                     {
11988                         /* Probably a character specified in octal, e.g. \35 */
11989                         goto defchar;
11990                     }
11991                 }
11992
11993                 /* at this point RExC_parse definitely points to a backref
11994                  * number */
11995                 {
11996 #ifdef RE_TRACK_PATTERN_OFFSETS
11997                     char * const parse_start = RExC_parse - 1; /* MJD */
11998 #endif
11999                     while (isDIGIT(*RExC_parse))
12000                         RExC_parse++;
12001                     if (hasbrace) {
12002                         if (*RExC_parse != '}')
12003                             vFAIL("Unterminated \\g{...} pattern");
12004                         RExC_parse++;
12005                     }
12006                     if (!SIZE_ONLY) {
12007                         if (num > (I32)RExC_rx->nparens)
12008                             vFAIL("Reference to nonexistent group");
12009                     }
12010                     RExC_sawback = 1;
12011                     ret = reganode(pRExC_state,
12012                                    ((! FOLD)
12013                                      ? REF
12014                                      : (ASCII_FOLD_RESTRICTED)
12015                                        ? REFFA
12016                                        : (AT_LEAST_UNI_SEMANTICS)
12017                                          ? REFFU
12018                                          : (LOC)
12019                                            ? REFFL
12020                                            : REFF),
12021                                     num);
12022                     *flagp |= HASWIDTH;
12023
12024                     /* override incorrect value set in reganode MJD */
12025                     Set_Node_Offset(ret, parse_start+1);
12026                     Set_Node_Cur_Length(ret, parse_start);
12027                     RExC_parse--;
12028                     nextchar(pRExC_state);
12029                 }
12030             }
12031             break;
12032         case '\0':
12033             if (RExC_parse >= RExC_end)
12034                 FAIL("Trailing \\");
12035             /* FALLTHROUGH */
12036         default:
12037             /* Do not generate "unrecognized" warnings here, we fall
12038                back into the quick-grab loop below */
12039             parse_start--;
12040             goto defchar;
12041         }
12042         break;
12043
12044     case '#':
12045         if (RExC_flags & RXf_PMf_EXTENDED) {
12046             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12047             if (RExC_parse < RExC_end)
12048                 goto tryagain;
12049         }
12050         /* FALLTHROUGH */
12051
12052     default:
12053
12054             parse_start = RExC_parse - 1;
12055
12056             RExC_parse++;
12057
12058         defchar: {
12059             STRLEN len = 0;
12060             UV ender = 0;
12061             char *p;
12062             char *s;
12063 #define MAX_NODE_STRING_SIZE 127
12064             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12065             char *s0;
12066             U8 upper_parse = MAX_NODE_STRING_SIZE;
12067             U8 node_type = compute_EXACTish(pRExC_state);
12068             bool next_is_quantifier;
12069             char * oldp = NULL;
12070
12071             /* We can convert EXACTF nodes to EXACTFU if they contain only
12072              * characters that match identically regardless of the target
12073              * string's UTF8ness.  The reason to do this is that EXACTF is not
12074              * trie-able, EXACTFU is.
12075              *
12076              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12077              * contain only above-Latin1 characters (hence must be in UTF8),
12078              * which don't participate in folds with Latin1-range characters,
12079              * as the latter's folds aren't known until runtime.  (We don't
12080              * need to figure this out until pass 2) */
12081             bool maybe_exactfu = PASS2
12082                                && (node_type == EXACTF || node_type == EXACTFL);
12083
12084             /* If a folding node contains only code points that don't
12085              * participate in folds, it can be changed into an EXACT node,
12086              * which allows the optimizer more things to look for */
12087             bool maybe_exact;
12088
12089             ret = reg_node(pRExC_state, node_type);
12090
12091             /* In pass1, folded, we use a temporary buffer instead of the
12092              * actual node, as the node doesn't exist yet */
12093             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12094
12095             s0 = s;
12096
12097         reparse:
12098
12099             /* We do the EXACTFish to EXACT node only if folding.  (And we
12100              * don't need to figure this out until pass 2) */
12101             maybe_exact = FOLD && PASS2;
12102
12103             /* XXX The node can hold up to 255 bytes, yet this only goes to
12104              * 127.  I (khw) do not know why.  Keeping it somewhat less than
12105              * 255 allows us to not have to worry about overflow due to
12106              * converting to utf8 and fold expansion, but that value is
12107              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12108              * split up by this limit into a single one using the real max of
12109              * 255.  Even at 127, this breaks under rare circumstances.  If
12110              * folding, we do not want to split a node at a character that is a
12111              * non-final in a multi-char fold, as an input string could just
12112              * happen to want to match across the node boundary.  The join
12113              * would solve that problem if the join actually happens.  But a
12114              * series of more than two nodes in a row each of 127 would cause
12115              * the first join to succeed to get to 254, but then there wouldn't
12116              * be room for the next one, which could at be one of those split
12117              * multi-char folds.  I don't know of any fool-proof solution.  One
12118              * could back off to end with only a code point that isn't such a
12119              * non-final, but it is possible for there not to be any in the
12120              * entire node. */
12121             for (p = RExC_parse - 1;
12122                  len < upper_parse && p < RExC_end;
12123                  len++)
12124             {
12125                 oldp = p;
12126
12127                 if (RExC_flags & RXf_PMf_EXTENDED)
12128                     p = regpatws(pRExC_state, p,
12129                                           TRUE); /* means recognize comments */
12130                 switch ((U8)*p) {
12131                 case '^':
12132                 case '$':
12133                 case '.':
12134                 case '[':
12135                 case '(':
12136                 case ')':
12137                 case '|':
12138                     goto loopdone;
12139                 case '\\':
12140                     /* Literal Escapes Switch
12141
12142                        This switch is meant to handle escape sequences that
12143                        resolve to a literal character.
12144
12145                        Every escape sequence that represents something
12146                        else, like an assertion or a char class, is handled
12147                        in the switch marked 'Special Escapes' above in this
12148                        routine, but also has an entry here as anything that
12149                        isn't explicitly mentioned here will be treated as
12150                        an unescaped equivalent literal.
12151                     */
12152
12153                     switch ((U8)*++p) {
12154                     /* These are all the special escapes. */
12155                     case 'A':             /* Start assertion */
12156                     case 'b': case 'B':   /* Word-boundary assertion*/
12157                     case 'C':             /* Single char !DANGEROUS! */
12158                     case 'd': case 'D':   /* digit class */
12159                     case 'g': case 'G':   /* generic-backref, pos assertion */
12160                     case 'h': case 'H':   /* HORIZWS */
12161                     case 'k': case 'K':   /* named backref, keep marker */
12162                     case 'p': case 'P':   /* Unicode property */
12163                               case 'R':   /* LNBREAK */
12164                     case 's': case 'S':   /* space class */
12165                     case 'v': case 'V':   /* VERTWS */
12166                     case 'w': case 'W':   /* word class */
12167                     case 'X':             /* eXtended Unicode "combining
12168                                              character sequence" */
12169                     case 'z': case 'Z':   /* End of line/string assertion */
12170                         --p;
12171                         goto loopdone;
12172
12173                     /* Anything after here is an escape that resolves to a
12174                        literal. (Except digits, which may or may not)
12175                      */
12176                     case 'n':
12177                         ender = '\n';
12178                         p++;
12179                         break;
12180                     case 'N': /* Handle a single-code point named character. */
12181                         /* The options cause it to fail if a multiple code
12182                          * point sequence.  Handle those in the switch() above
12183                          * */
12184                         RExC_parse = p + 1;
12185                         if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12186                                                          &ender,
12187                                                          flagp,
12188                                                          depth,
12189                                                          FALSE
12190                         )) {
12191                             if (*flagp & RESTART_UTF8)
12192                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12193                             RExC_parse = p = oldp;
12194                             goto loopdone;
12195                         }
12196                         p = RExC_parse;
12197                         if (ender > 0xff) {
12198                             REQUIRE_UTF8;
12199                         }
12200                         break;
12201                     case 'r':
12202                         ender = '\r';
12203                         p++;
12204                         break;
12205                     case 't':
12206                         ender = '\t';
12207                         p++;
12208                         break;
12209                     case 'f':
12210                         ender = '\f';
12211                         p++;
12212                         break;
12213                     case 'e':
12214                         ender = ESC_NATIVE;
12215                         p++;
12216                         break;
12217                     case 'a':
12218                         ender = '\a';
12219                         p++;
12220                         break;
12221                     case 'o':
12222                         {
12223                             UV result;
12224                             const char* error_msg;
12225
12226                             bool valid = grok_bslash_o(&p,
12227                                                        &result,
12228                                                        &error_msg,
12229                                                        PASS2, /* out warnings */
12230                                                        FALSE, /* not strict */
12231                                                        TRUE, /* Output warnings
12232                                                                 for non-
12233                                                                 portables */
12234                                                        UTF);
12235                             if (! valid) {
12236                                 RExC_parse = p; /* going to die anyway; point
12237                                                    to exact spot of failure */
12238                                 vFAIL(error_msg);
12239                             }
12240                             ender = result;
12241                             if (PL_encoding && ender < 0x100) {
12242                                 goto recode_encoding;
12243                             }
12244                             if (ender > 0xff) {
12245                                 REQUIRE_UTF8;
12246                             }
12247                             break;
12248                         }
12249                     case 'x':
12250                         {
12251                             UV result = UV_MAX; /* initialize to erroneous
12252                                                    value */
12253                             const char* error_msg;
12254
12255                             bool valid = grok_bslash_x(&p,
12256                                                        &result,
12257                                                        &error_msg,
12258                                                        PASS2, /* out warnings */
12259                                                        FALSE, /* not strict */
12260                                                        TRUE, /* Output warnings
12261                                                                 for non-
12262                                                                 portables */
12263                                                        UTF);
12264                             if (! valid) {
12265                                 RExC_parse = p; /* going to die anyway; point
12266                                                    to exact spot of failure */
12267                                 vFAIL(error_msg);
12268                             }
12269                             ender = result;
12270
12271                             if (PL_encoding && ender < 0x100) {
12272                                 goto recode_encoding;
12273                             }
12274                             if (ender > 0xff) {
12275                                 REQUIRE_UTF8;
12276                             }
12277                             break;
12278                         }
12279                     case 'c':
12280                         p++;
12281                         ender = grok_bslash_c(*p++, PASS2);
12282                         break;
12283                     case '8': case '9': /* must be a backreference */
12284                         --p;
12285                         goto loopdone;
12286                     case '1': case '2': case '3':case '4':
12287                     case '5': case '6': case '7':
12288                         /* When we parse backslash escapes there is ambiguity
12289                          * between backreferences and octal escapes. Any escape
12290                          * from \1 - \9 is a backreference, any multi-digit
12291                          * escape which does not start with 0 and which when
12292                          * evaluated as decimal could refer to an already
12293                          * parsed capture buffer is a backslash. Anything else
12294                          * is octal.
12295                          *
12296                          * Note this implies that \118 could be interpreted as
12297                          * 118 OR as "\11" . "8" depending on whether there
12298                          * were 118 capture buffers defined already in the
12299                          * pattern.  */
12300
12301                         /* NOTE, RExC_npar is 1 more than the actual number of
12302                          * parens we have seen so far, hence the < RExC_npar below. */
12303
12304                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12305                         {  /* Not to be treated as an octal constant, go
12306                                    find backref */
12307                             --p;
12308                             goto loopdone;
12309                         }
12310                         /* FALLTHROUGH */
12311                     case '0':
12312                         {
12313                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12314                             STRLEN numlen = 3;
12315                             ender = grok_oct(p, &numlen, &flags, NULL);
12316                             if (ender > 0xff) {
12317                                 REQUIRE_UTF8;
12318                             }
12319                             p += numlen;
12320                             if (PASS2   /* like \08, \178 */
12321                                 && numlen < 3
12322                                 && p < RExC_end
12323                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12324                             {
12325                                 reg_warn_non_literal_string(
12326                                          p + 1,
12327                                          form_short_octal_warning(p, numlen));
12328                             }
12329                         }
12330                         if (PL_encoding && ender < 0x100)
12331                             goto recode_encoding;
12332                         break;
12333                     recode_encoding:
12334                         if (! RExC_override_recoding) {
12335                             SV* enc = PL_encoding;
12336                             ender = reg_recode((const char)(U8)ender, &enc);
12337                             if (!enc && PASS2)
12338                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12339                             REQUIRE_UTF8;
12340                         }
12341                         break;
12342                     case '\0':
12343                         if (p >= RExC_end)
12344                             FAIL("Trailing \\");
12345                         /* FALLTHROUGH */
12346                     default:
12347                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12348                             /* Include any { following the alpha to emphasize
12349                              * that it could be part of an escape at some point
12350                              * in the future */
12351                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12352                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12353                         }
12354                         goto normal_default;
12355                     } /* End of switch on '\' */
12356                     break;
12357                 case '{':
12358                     /* Currently we don't warn when the lbrace is at the start
12359                      * of a construct.  This catches it in the middle of a
12360                      * literal string, or when its the first thing after
12361                      * something like "\b" */
12362                     if (! SIZE_ONLY
12363                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12364                     {
12365                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12366                     }
12367                     /*FALLTHROUGH*/
12368                 default:    /* A literal character */
12369                   normal_default:
12370                     if (UTF8_IS_START(*p) && UTF) {
12371                         STRLEN numlen;
12372                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12373                                                &numlen, UTF8_ALLOW_DEFAULT);
12374                         p += numlen;
12375                     }
12376                     else
12377                         ender = (U8) *p++;
12378                     break;
12379                 } /* End of switch on the literal */
12380
12381                 /* Here, have looked at the literal character and <ender>
12382                  * contains its ordinal, <p> points to the character after it
12383                  */
12384
12385                 if ( RExC_flags & RXf_PMf_EXTENDED)
12386                     p = regpatws(pRExC_state, p,
12387                                           TRUE); /* means recognize comments */
12388
12389                 /* If the next thing is a quantifier, it applies to this
12390                  * character only, which means that this character has to be in
12391                  * its own node and can't just be appended to the string in an
12392                  * existing node, so if there are already other characters in
12393                  * the node, close the node with just them, and set up to do
12394                  * this character again next time through, when it will be the
12395                  * only thing in its new node */
12396                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12397                 {
12398                     p = oldp;
12399                     goto loopdone;
12400                 }
12401
12402                 if (! FOLD   /* The simple case, just append the literal */
12403                     || (LOC  /* Also don't fold for tricky chars under /l */
12404                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12405                 {
12406                     if (UTF) {
12407                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12408                         if (unilen > 0) {
12409                            s   += unilen;
12410                            len += unilen;
12411                         }
12412
12413                         /* The loop increments <len> each time, as all but this
12414                          * path (and one other) through it add a single byte to
12415                          * the EXACTish node.  But this one has changed len to
12416                          * be the correct final value, so subtract one to
12417                          * cancel out the increment that follows */
12418                         len--;
12419                     }
12420                     else {
12421                         REGC((char)ender, s++);
12422                     }
12423
12424                     /* Can get here if folding only if is one of the /l
12425                      * characters whose fold depends on the locale.  The
12426                      * occurrence of any of these indicate that we can't
12427                      * simplify things */
12428                     if (FOLD) {
12429                         maybe_exact = FALSE;
12430                         maybe_exactfu = FALSE;
12431                     }
12432                 }
12433                 else             /* FOLD */
12434                      if (! ( UTF
12435                         /* See comments for join_exact() as to why we fold this
12436                          * non-UTF at compile time */
12437                         || (node_type == EXACTFU
12438                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12439                 {
12440                     /* Here, are folding and are not UTF-8 encoded; therefore
12441                      * the character must be in the range 0-255, and is not /l
12442                      * (Not /l because we already handled these under /l in
12443                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12444                     if (IS_IN_SOME_FOLD_L1(ender)) {
12445                         maybe_exact = FALSE;
12446
12447                         /* See if the character's fold differs between /d and
12448                          * /u.  This includes the multi-char fold SHARP S to
12449                          * 'ss' */
12450                         if (maybe_exactfu
12451                             && (PL_fold[ender] != PL_fold_latin1[ender]
12452                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12453                                 || (len > 0
12454                                    && isALPHA_FOLD_EQ(ender, 's')
12455                                    && isALPHA_FOLD_EQ(*(s-1), 's'))))
12456                         {
12457                             maybe_exactfu = FALSE;
12458                         }
12459                     }
12460
12461                     /* Even when folding, we store just the input character, as
12462                      * we have an array that finds its fold quickly */
12463                     *(s++) = (char) ender;
12464                 }
12465                 else {  /* FOLD and UTF */
12466                     /* Unlike the non-fold case, we do actually have to
12467                      * calculate the results here in pass 1.  This is for two
12468                      * reasons, the folded length may be longer than the
12469                      * unfolded, and we have to calculate how many EXACTish
12470                      * nodes it will take; and we may run out of room in a node
12471                      * in the middle of a potential multi-char fold, and have
12472                      * to back off accordingly.  (Hence we can't use REGC for
12473                      * the simple case just below.) */
12474
12475                     UV folded;
12476                     if (isASCII_uni(ender)) {
12477                         folded = toFOLD(ender);
12478                         *(s)++ = (U8) folded;
12479                     }
12480                     else {
12481                         STRLEN foldlen;
12482
12483                         folded = _to_uni_fold_flags(
12484                                      ender,
12485                                      (U8 *) s,
12486                                      &foldlen,
12487                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12488                                                         ? FOLD_FLAGS_NOMIX_ASCII
12489                                                         : 0));
12490                         s += foldlen;
12491
12492                         /* The loop increments <len> each time, as all but this
12493                          * path (and one other) through it add a single byte to
12494                          * the EXACTish node.  But this one has changed len to
12495                          * be the correct final value, so subtract one to
12496                          * cancel out the increment that follows */
12497                         len += foldlen - 1;
12498                     }
12499                     /* If this node only contains non-folding code points so
12500                      * far, see if this new one is also non-folding */
12501                     if (maybe_exact) {
12502                         if (folded != ender) {
12503                             maybe_exact = FALSE;
12504                         }
12505                         else {
12506                             /* Here the fold is the original; we have to check
12507                              * further to see if anything folds to it */
12508                             if (_invlist_contains_cp(PL_utf8_foldable,
12509                                                         ender))
12510                             {
12511                                 maybe_exact = FALSE;
12512                             }
12513                         }
12514                     }
12515                     ender = folded;
12516                 }
12517
12518                 if (next_is_quantifier) {
12519
12520                     /* Here, the next input is a quantifier, and to get here,
12521                      * the current character is the only one in the node.
12522                      * Also, here <len> doesn't include the final byte for this
12523                      * character */
12524                     len++;
12525                     goto loopdone;
12526                 }
12527
12528             } /* End of loop through literal characters */
12529
12530             /* Here we have either exhausted the input or ran out of room in
12531              * the node.  (If we encountered a character that can't be in the
12532              * node, transfer is made directly to <loopdone>, and so we
12533              * wouldn't have fallen off the end of the loop.)  In the latter
12534              * case, we artificially have to split the node into two, because
12535              * we just don't have enough space to hold everything.  This
12536              * creates a problem if the final character participates in a
12537              * multi-character fold in the non-final position, as a match that
12538              * should have occurred won't, due to the way nodes are matched,
12539              * and our artificial boundary.  So back off until we find a non-
12540              * problematic character -- one that isn't at the beginning or
12541              * middle of such a fold.  (Either it doesn't participate in any
12542              * folds, or appears only in the final position of all the folds it
12543              * does participate in.)  A better solution with far fewer false
12544              * positives, and that would fill the nodes more completely, would
12545              * be to actually have available all the multi-character folds to
12546              * test against, and to back-off only far enough to be sure that
12547              * this node isn't ending with a partial one.  <upper_parse> is set
12548              * further below (if we need to reparse the node) to include just
12549              * up through that final non-problematic character that this code
12550              * identifies, so when it is set to less than the full node, we can
12551              * skip the rest of this */
12552             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12553
12554                 const STRLEN full_len = len;
12555
12556                 assert(len >= MAX_NODE_STRING_SIZE);
12557
12558                 /* Here, <s> points to the final byte of the final character.
12559                  * Look backwards through the string until find a non-
12560                  * problematic character */
12561
12562                 if (! UTF) {
12563
12564                     /* This has no multi-char folds to non-UTF characters */
12565                     if (ASCII_FOLD_RESTRICTED) {
12566                         goto loopdone;
12567                     }
12568
12569                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12570                     len = s - s0 + 1;
12571                 }
12572                 else {
12573                     if (!  PL_NonL1NonFinalFold) {
12574                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12575                                         NonL1_Perl_Non_Final_Folds_invlist);
12576                     }
12577
12578                     /* Point to the first byte of the final character */
12579                     s = (char *) utf8_hop((U8 *) s, -1);
12580
12581                     while (s >= s0) {   /* Search backwards until find
12582                                            non-problematic char */
12583                         if (UTF8_IS_INVARIANT(*s)) {
12584
12585                             /* There are no ascii characters that participate
12586                              * in multi-char folds under /aa.  In EBCDIC, the
12587                              * non-ascii invariants are all control characters,
12588                              * so don't ever participate in any folds. */
12589                             if (ASCII_FOLD_RESTRICTED
12590                                 || ! IS_NON_FINAL_FOLD(*s))
12591                             {
12592                                 break;
12593                             }
12594                         }
12595                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12596                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12597                                                                   *s, *(s+1))))
12598                             {
12599                                 break;
12600                             }
12601                         }
12602                         else if (! _invlist_contains_cp(
12603                                         PL_NonL1NonFinalFold,
12604                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12605                         {
12606                             break;
12607                         }
12608
12609                         /* Here, the current character is problematic in that
12610                          * it does occur in the non-final position of some
12611                          * fold, so try the character before it, but have to
12612                          * special case the very first byte in the string, so
12613                          * we don't read outside the string */
12614                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12615                     } /* End of loop backwards through the string */
12616
12617                     /* If there were only problematic characters in the string,
12618                      * <s> will point to before s0, in which case the length
12619                      * should be 0, otherwise include the length of the
12620                      * non-problematic character just found */
12621                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12622                 }
12623
12624                 /* Here, have found the final character, if any, that is
12625                  * non-problematic as far as ending the node without splitting
12626                  * it across a potential multi-char fold.  <len> contains the
12627                  * number of bytes in the node up-to and including that
12628                  * character, or is 0 if there is no such character, meaning
12629                  * the whole node contains only problematic characters.  In
12630                  * this case, give up and just take the node as-is.  We can't
12631                  * do any better */
12632                 if (len == 0) {
12633                     len = full_len;
12634
12635                     /* If the node ends in an 's' we make sure it stays EXACTF,
12636                      * as if it turns into an EXACTFU, it could later get
12637                      * joined with another 's' that would then wrongly match
12638                      * the sharp s */
12639                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12640                     {
12641                         maybe_exactfu = FALSE;
12642                     }
12643                 } else {
12644
12645                     /* Here, the node does contain some characters that aren't
12646                      * problematic.  If one such is the final character in the
12647                      * node, we are done */
12648                     if (len == full_len) {
12649                         goto loopdone;
12650                     }
12651                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12652
12653                         /* If the final character is problematic, but the
12654                          * penultimate is not, back-off that last character to
12655                          * later start a new node with it */
12656                         p = oldp;
12657                         goto loopdone;
12658                     }
12659
12660                     /* Here, the final non-problematic character is earlier
12661                      * in the input than the penultimate character.  What we do
12662                      * is reparse from the beginning, going up only as far as
12663                      * this final ok one, thus guaranteeing that the node ends
12664                      * in an acceptable character.  The reason we reparse is
12665                      * that we know how far in the character is, but we don't
12666                      * know how to correlate its position with the input parse.
12667                      * An alternate implementation would be to build that
12668                      * correlation as we go along during the original parse,
12669                      * but that would entail extra work for every node, whereas
12670                      * this code gets executed only when the string is too
12671                      * large for the node, and the final two characters are
12672                      * problematic, an infrequent occurrence.  Yet another
12673                      * possible strategy would be to save the tail of the
12674                      * string, and the next time regatom is called, initialize
12675                      * with that.  The problem with this is that unless you
12676                      * back off one more character, you won't be guaranteed
12677                      * regatom will get called again, unless regbranch,
12678                      * regpiece ... are also changed.  If you do back off that
12679                      * extra character, so that there is input guaranteed to
12680                      * force calling regatom, you can't handle the case where
12681                      * just the first character in the node is acceptable.  I
12682                      * (khw) decided to try this method which doesn't have that
12683                      * pitfall; if performance issues are found, we can do a
12684                      * combination of the current approach plus that one */
12685                     upper_parse = len;
12686                     len = 0;
12687                     s = s0;
12688                     goto reparse;
12689                 }
12690             }   /* End of verifying node ends with an appropriate char */
12691
12692         loopdone:   /* Jumped to when encounters something that shouldn't be in
12693                        the node */
12694
12695             /* I (khw) don't know if you can get here with zero length, but the
12696              * old code handled this situation by creating a zero-length EXACT
12697              * node.  Might as well be NOTHING instead */
12698             if (len == 0) {
12699                 OP(ret) = NOTHING;
12700             }
12701             else {
12702                 if (FOLD) {
12703                     /* If 'maybe_exact' is still set here, means there are no
12704                      * code points in the node that participate in folds;
12705                      * similarly for 'maybe_exactfu' and code points that match
12706                      * differently depending on UTF8ness of the target string
12707                      * (for /u), or depending on locale for /l */
12708                     if (maybe_exact) {
12709                         OP(ret) = EXACT;
12710                     }
12711                     else if (maybe_exactfu) {
12712                         OP(ret) = EXACTFU;
12713                     }
12714                 }
12715                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12716                                            FALSE /* Don't look to see if could
12717                                                     be turned into an EXACT
12718                                                     node, as we have already
12719                                                     computed that */
12720                                           );
12721             }
12722
12723             RExC_parse = p - 1;
12724             Set_Node_Cur_Length(ret, parse_start);
12725             nextchar(pRExC_state);
12726             {
12727                 /* len is STRLEN which is unsigned, need to copy to signed */
12728                 IV iv = len;
12729                 if (iv < 0)
12730                     vFAIL("Internal disaster");
12731             }
12732
12733         } /* End of label 'defchar:' */
12734         break;
12735     } /* End of giant switch on input character */
12736
12737     return(ret);
12738 }
12739
12740 STATIC char *
12741 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12742 {
12743     /* Returns the next non-pattern-white space, non-comment character (the
12744      * latter only if 'recognize_comment is true) in the string p, which is
12745      * ended by RExC_end.  See also reg_skipcomment */
12746     const char *e = RExC_end;
12747
12748     PERL_ARGS_ASSERT_REGPATWS;
12749
12750     while (p < e) {
12751         STRLEN len;
12752         if ((len = is_PATWS_safe(p, e, UTF))) {
12753             p += len;
12754         }
12755         else if (recognize_comment && *p == '#') {
12756             p = reg_skipcomment(pRExC_state, p);
12757         }
12758         else
12759             break;
12760     }
12761     return p;
12762 }
12763
12764 STATIC void
12765 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12766 {
12767     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12768      * sets up the bitmap and any flags, removing those code points from the
12769      * inversion list, setting it to NULL should it become completely empty */
12770
12771     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12772     assert(PL_regkind[OP(node)] == ANYOF);
12773
12774     ANYOF_BITMAP_ZERO(node);
12775     if (*invlist_ptr) {
12776
12777         /* This gets set if we actually need to modify things */
12778         bool change_invlist = FALSE;
12779
12780         UV start, end;
12781
12782         /* Start looking through *invlist_ptr */
12783         invlist_iterinit(*invlist_ptr);
12784         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12785             UV high;
12786             int i;
12787
12788             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12789                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12790             }
12791             else if (end >= NUM_ANYOF_CODE_POINTS) {
12792                 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12793             }
12794
12795             /* Quit if are above what we should change */
12796             if (start >= NUM_ANYOF_CODE_POINTS) {
12797                 break;
12798             }
12799
12800             change_invlist = TRUE;
12801
12802             /* Set all the bits in the range, up to the max that we are doing */
12803             high = (end < NUM_ANYOF_CODE_POINTS - 1)
12804                    ? end
12805                    : NUM_ANYOF_CODE_POINTS - 1;
12806             for (i = start; i <= (int) high; i++) {
12807                 if (! ANYOF_BITMAP_TEST(node, i)) {
12808                     ANYOF_BITMAP_SET(node, i);
12809                 }
12810             }
12811         }
12812         invlist_iterfinish(*invlist_ptr);
12813
12814         /* Done with loop; remove any code points that are in the bitmap from
12815          * *invlist_ptr; similarly for code points above the bitmap if we have
12816          * a flag to match all of them anyways */
12817         if (change_invlist) {
12818             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12819         }
12820         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12821             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12822         }
12823
12824         /* If have completely emptied it, remove it completely */
12825         if (_invlist_len(*invlist_ptr) == 0) {
12826             SvREFCNT_dec_NN(*invlist_ptr);
12827             *invlist_ptr = NULL;
12828         }
12829     }
12830 }
12831
12832 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12833    Character classes ([:foo:]) can also be negated ([:^foo:]).
12834    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12835    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12836    but trigger failures because they are currently unimplemented. */
12837
12838 #define POSIXCC_DONE(c)   ((c) == ':')
12839 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12840 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12841
12842 PERL_STATIC_INLINE I32
12843 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12844 {
12845     I32 namedclass = OOB_NAMEDCLASS;
12846
12847     PERL_ARGS_ASSERT_REGPPOSIXCC;
12848
12849     if (value == '[' && RExC_parse + 1 < RExC_end &&
12850         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12851         POSIXCC(UCHARAT(RExC_parse)))
12852     {
12853         const char c = UCHARAT(RExC_parse);
12854         char* const s = RExC_parse++;
12855
12856         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12857             RExC_parse++;
12858         if (RExC_parse == RExC_end) {
12859             if (strict) {
12860
12861                 /* Try to give a better location for the error (than the end of
12862                  * the string) by looking for the matching ']' */
12863                 RExC_parse = s;
12864                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12865                     RExC_parse++;
12866                 }
12867                 vFAIL2("Unmatched '%c' in POSIX class", c);
12868             }
12869             /* Grandfather lone [:, [=, [. */
12870             RExC_parse = s;
12871         }
12872         else {
12873             const char* const t = RExC_parse++; /* skip over the c */
12874             assert(*t == c);
12875
12876             if (UCHARAT(RExC_parse) == ']') {
12877                 const char *posixcc = s + 1;
12878                 RExC_parse++; /* skip over the ending ] */
12879
12880                 if (*s == ':') {
12881                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12882                     const I32 skip = t - posixcc;
12883
12884                     /* Initially switch on the length of the name.  */
12885                     switch (skip) {
12886                     case 4:
12887                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12888                                                           this is the Perl \w
12889                                                         */
12890                             namedclass = ANYOF_WORDCHAR;
12891                         break;
12892                     case 5:
12893                         /* Names all of length 5.  */
12894                         /* alnum alpha ascii blank cntrl digit graph lower
12895                            print punct space upper  */
12896                         /* Offset 4 gives the best switch position.  */
12897                         switch (posixcc[4]) {
12898                         case 'a':
12899                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12900                                 namedclass = ANYOF_ALPHA;
12901                             break;
12902                         case 'e':
12903                             if (memEQ(posixcc, "spac", 4)) /* space */
12904                                 namedclass = ANYOF_PSXSPC;
12905                             break;
12906                         case 'h':
12907                             if (memEQ(posixcc, "grap", 4)) /* graph */
12908                                 namedclass = ANYOF_GRAPH;
12909                             break;
12910                         case 'i':
12911                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12912                                 namedclass = ANYOF_ASCII;
12913                             break;
12914                         case 'k':
12915                             if (memEQ(posixcc, "blan", 4)) /* blank */
12916                                 namedclass = ANYOF_BLANK;
12917                             break;
12918                         case 'l':
12919                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12920                                 namedclass = ANYOF_CNTRL;
12921                             break;
12922                         case 'm':
12923                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12924                                 namedclass = ANYOF_ALPHANUMERIC;
12925                             break;
12926                         case 'r':
12927                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12928                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12929                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12930                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12931                             break;
12932                         case 't':
12933                             if (memEQ(posixcc, "digi", 4)) /* digit */
12934                                 namedclass = ANYOF_DIGIT;
12935                             else if (memEQ(posixcc, "prin", 4)) /* print */
12936                                 namedclass = ANYOF_PRINT;
12937                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12938                                 namedclass = ANYOF_PUNCT;
12939                             break;
12940                         }
12941                         break;
12942                     case 6:
12943                         if (memEQ(posixcc, "xdigit", 6))
12944                             namedclass = ANYOF_XDIGIT;
12945                         break;
12946                     }
12947
12948                     if (namedclass == OOB_NAMEDCLASS)
12949                         vFAIL2utf8f(
12950                             "POSIX class [:%"UTF8f":] unknown",
12951                             UTF8fARG(UTF, t - s - 1, s + 1));
12952
12953                     /* The #defines are structured so each complement is +1 to
12954                      * the normal one */
12955                     if (complement) {
12956                         namedclass++;
12957                     }
12958                     assert (posixcc[skip] == ':');
12959                     assert (posixcc[skip+1] == ']');
12960                 } else if (!SIZE_ONLY) {
12961                     /* [[=foo=]] and [[.foo.]] are still future. */
12962
12963                     /* adjust RExC_parse so the warning shows after
12964                        the class closes */
12965                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12966                         RExC_parse++;
12967                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12968                 }
12969             } else {
12970                 /* Maternal grandfather:
12971                  * "[:" ending in ":" but not in ":]" */
12972                 if (strict) {
12973                     vFAIL("Unmatched '[' in POSIX class");
12974                 }
12975
12976                 /* Grandfather lone [:, [=, [. */
12977                 RExC_parse = s;
12978             }
12979         }
12980     }
12981
12982     return namedclass;
12983 }
12984
12985 STATIC bool
12986 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12987 {
12988     /* This applies some heuristics at the current parse position (which should
12989      * be at a '[') to see if what follows might be intended to be a [:posix:]
12990      * class.  It returns true if it really is a posix class, of course, but it
12991      * also can return true if it thinks that what was intended was a posix
12992      * class that didn't quite make it.
12993      *
12994      * It will return true for
12995      *      [:alphanumerics:
12996      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12997      *                         ')' indicating the end of the (?[
12998      *      [:any garbage including %^&$ punctuation:]
12999      *
13000      * This is designed to be called only from S_handle_regex_sets; it could be
13001      * easily adapted to be called from the spot at the beginning of regclass()
13002      * that checks to see in a normal bracketed class if the surrounding []
13003      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
13004      * change long-standing behavior, so I (khw) didn't do that */
13005     char* p = RExC_parse + 1;
13006     char first_char = *p;
13007
13008     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13009
13010     assert(*(p - 1) == '[');
13011
13012     if (! POSIXCC(first_char)) {
13013         return FALSE;
13014     }
13015
13016     p++;
13017     while (p < RExC_end && isWORDCHAR(*p)) p++;
13018
13019     if (p >= RExC_end) {
13020         return FALSE;
13021     }
13022
13023     if (p - RExC_parse > 2    /* Got at least 1 word character */
13024         && (*p == first_char
13025             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13026     {
13027         return TRUE;
13028     }
13029
13030     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13031
13032     return (p
13033             && p - RExC_parse > 2 /* [:] evaluates to colon;
13034                                       [::] is a bad posix class. */
13035             && first_char == *(p - 1));
13036 }
13037
13038 STATIC regnode *
13039 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13040                     I32 *flagp, U32 depth,
13041                     char * const oregcomp_parse)
13042 {
13043     /* Handle the (?[...]) construct to do set operations */
13044
13045     U8 curchar;
13046     UV start, end;      /* End points of code point ranges */
13047     SV* result_string;
13048     char *save_end, *save_parse;
13049     SV* final;
13050     STRLEN len;
13051     regnode* node;
13052     AV* stack;
13053     const bool save_fold = FOLD;
13054
13055     GET_RE_DEBUG_FLAGS_DECL;
13056
13057     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13058
13059     if (LOC) {
13060         vFAIL("(?[...]) not valid in locale");
13061     }
13062     RExC_uni_semantics = 1;
13063
13064     /* This will return only an ANYOF regnode, or (unlikely) something smaller
13065      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
13066      * call regclass to handle '[]' so as to not have to reinvent its parsing
13067      * rules here (throwing away the size it computes each time).  And, we exit
13068      * upon an unescaped ']' that isn't one ending a regclass.  To do both
13069      * these things, we need to realize that something preceded by a backslash
13070      * is escaped, so we have to keep track of backslashes */
13071     if (PASS2) {
13072         Perl_ck_warner_d(aTHX_
13073             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13074             "The regex_sets feature is experimental" REPORT_LOCATION,
13075                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13076                 UTF8fARG(UTF,
13077                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13078                          RExC_precomp + (RExC_parse - RExC_precomp)));
13079     }
13080     else {
13081         UV depth = 0; /* how many nested (?[...]) constructs */
13082
13083         while (RExC_parse < RExC_end) {
13084             SV* current = NULL;
13085             RExC_parse = regpatws(pRExC_state, RExC_parse,
13086                                           TRUE); /* means recognize comments */
13087             switch (*RExC_parse) {
13088                 case '?':
13089                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
13090                     /* FALLTHROUGH */
13091                 default:
13092                     break;
13093                 case '\\':
13094                     /* Skip the next byte (which could cause us to end up in
13095                      * the middle of a UTF-8 character, but since none of those
13096                      * are confusable with anything we currently handle in this
13097                      * switch (invariants all), it's safe.  We'll just hit the
13098                      * default: case next time and keep on incrementing until
13099                      * we find one of the invariants we do handle. */
13100                     RExC_parse++;
13101                     break;
13102                 case '[':
13103                 {
13104                     /* If this looks like it is a [:posix:] class, leave the
13105                      * parse pointer at the '[' to fool regclass() into
13106                      * thinking it is part of a '[[:posix:]]'.  That function
13107                      * will use strict checking to force a syntax error if it
13108                      * doesn't work out to a legitimate class */
13109                     bool is_posix_class
13110                                     = could_it_be_a_POSIX_class(pRExC_state);
13111                     if (! is_posix_class) {
13112                         RExC_parse++;
13113                     }
13114
13115                     /* regclass() can only return RESTART_UTF8 if multi-char
13116                        folds are allowed.  */
13117                     if (!regclass(pRExC_state, flagp,depth+1,
13118                                   is_posix_class, /* parse the whole char
13119                                                      class only if not a
13120                                                      posix class */
13121                                   FALSE, /* don't allow multi-char folds */
13122                                   TRUE, /* silence non-portable warnings. */
13123                                   &current))
13124                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13125                               (UV) *flagp);
13126
13127                     /* function call leaves parse pointing to the ']', except
13128                      * if we faked it */
13129                     if (is_posix_class) {
13130                         RExC_parse--;
13131                     }
13132
13133                     SvREFCNT_dec(current);   /* In case it returned something */
13134                     break;
13135                 }
13136
13137                 case ']':
13138                     if (depth--) break;
13139                     RExC_parse++;
13140                     if (RExC_parse < RExC_end
13141                         && *RExC_parse == ')')
13142                     {
13143                         node = reganode(pRExC_state, ANYOF, 0);
13144                         RExC_size += ANYOF_SKIP;
13145                         nextchar(pRExC_state);
13146                         Set_Node_Length(node,
13147                                 RExC_parse - oregcomp_parse + 1); /* MJD */
13148                         return node;
13149                     }
13150                     goto no_close;
13151             }
13152             RExC_parse++;
13153         }
13154
13155         no_close:
13156         FAIL("Syntax error in (?[...])");
13157     }
13158
13159     /* Pass 2 only after this.  Everything in this construct is a
13160      * metacharacter.  Operands begin with either a '\' (for an escape
13161      * sequence), or a '[' for a bracketed character class.  Any other
13162      * character should be an operator, or parenthesis for grouping.  Both
13163      * types of operands are handled by calling regclass() to parse them.  It
13164      * is called with a parameter to indicate to return the computed inversion
13165      * list.  The parsing here is implemented via a stack.  Each entry on the
13166      * stack is a single character representing one of the operators, or the
13167      * '('; or else a pointer to an operand inversion list. */
13168
13169 #define IS_OPERAND(a)  (! SvIOK(a))
13170
13171     /* The stack starts empty.  It is a syntax error if the first thing parsed
13172      * is a binary operator; everything else is pushed on the stack.  When an
13173      * operand is parsed, the top of the stack is examined.  If it is a binary
13174      * operator, the item before it should be an operand, and both are replaced
13175      * by the result of doing that operation on the new operand and the one on
13176      * the stack.   Thus a sequence of binary operands is reduced to a single
13177      * one before the next one is parsed.
13178      *
13179      * A unary operator may immediately follow a binary in the input, for
13180      * example
13181      *      [a] + ! [b]
13182      * When an operand is parsed and the top of the stack is a unary operator,
13183      * the operation is performed, and then the stack is rechecked to see if
13184      * this new operand is part of a binary operation; if so, it is handled as
13185      * above.
13186      *
13187      * A '(' is simply pushed on the stack; it is valid only if the stack is
13188      * empty, or the top element of the stack is an operator or another '('
13189      * (for which the parenthesized expression will become an operand).  By the
13190      * time the corresponding ')' is parsed everything in between should have
13191      * been parsed and evaluated to a single operand (or else is a syntax
13192      * error), and is handled as a regular operand */
13193
13194     sv_2mortal((SV *)(stack = newAV()));
13195
13196     while (RExC_parse < RExC_end) {
13197         I32 top_index = av_tindex(stack);
13198         SV** top_ptr;
13199         SV* current = NULL;
13200
13201         /* Skip white space */
13202         RExC_parse = regpatws(pRExC_state, RExC_parse,
13203                                          TRUE /* means recognize comments */ );
13204         if (RExC_parse >= RExC_end) {
13205             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13206         }
13207         if ((curchar = UCHARAT(RExC_parse)) == ']') {
13208             break;
13209         }
13210
13211         switch (curchar) {
13212
13213             case '?':
13214                 if (av_tindex(stack) >= 0   /* This makes sure that we can
13215                                                safely subtract 1 from
13216                                                RExC_parse in the next clause.
13217                                                If we have something on the
13218                                                stack, we have parsed something
13219                                              */
13220                     && UCHARAT(RExC_parse - 1) == '('
13221                     && RExC_parse < RExC_end)
13222                 {
13223                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13224                      * This happens when we have some thing like
13225                      *
13226                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13227                      *   ...
13228                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13229                      *
13230                      * Here we would be handling the interpolated
13231                      * '$thai_or_lao'.  We handle this by a recursive call to
13232                      * ourselves which returns the inversion list the
13233                      * interpolated expression evaluates to.  We use the flags
13234                      * from the interpolated pattern. */
13235                     U32 save_flags = RExC_flags;
13236                     const char * const save_parse = ++RExC_parse;
13237
13238                     parse_lparen_question_flags(pRExC_state);
13239
13240                     if (RExC_parse == save_parse  /* Makes sure there was at
13241                                                      least one flag (or this
13242                                                      embedding wasn't compiled)
13243                                                    */
13244                         || RExC_parse >= RExC_end - 4
13245                         || UCHARAT(RExC_parse) != ':'
13246                         || UCHARAT(++RExC_parse) != '('
13247                         || UCHARAT(++RExC_parse) != '?'
13248                         || UCHARAT(++RExC_parse) != '[')
13249                     {
13250
13251                         /* In combination with the above, this moves the
13252                          * pointer to the point just after the first erroneous
13253                          * character (or if there are no flags, to where they
13254                          * should have been) */
13255                         if (RExC_parse >= RExC_end - 4) {
13256                             RExC_parse = RExC_end;
13257                         }
13258                         else if (RExC_parse != save_parse) {
13259                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13260                         }
13261                         vFAIL("Expecting '(?flags:(?[...'");
13262                     }
13263                     RExC_parse++;
13264                     (void) handle_regex_sets(pRExC_state, &current, flagp,
13265                                                     depth+1, oregcomp_parse);
13266
13267                     /* Here, 'current' contains the embedded expression's
13268                      * inversion list, and RExC_parse points to the trailing
13269                      * ']'; the next character should be the ')' which will be
13270                      * paired with the '(' that has been put on the stack, so
13271                      * the whole embedded expression reduces to '(operand)' */
13272                     RExC_parse++;
13273
13274                     RExC_flags = save_flags;
13275                     goto handle_operand;
13276                 }
13277                 /* FALLTHROUGH */
13278
13279             default:
13280                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13281                 vFAIL("Unexpected character");
13282
13283             case '\\':
13284                 /* regclass() can only return RESTART_UTF8 if multi-char
13285                    folds are allowed.  */
13286                 if (!regclass(pRExC_state, flagp,depth+1,
13287                               TRUE, /* means parse just the next thing */
13288                               FALSE, /* don't allow multi-char folds */
13289                               FALSE, /* don't silence non-portable warnings.  */
13290                               &current))
13291                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13292                           (UV) *flagp);
13293                 /* regclass() will return with parsing just the \ sequence,
13294                  * leaving the parse pointer at the next thing to parse */
13295                 RExC_parse--;
13296                 goto handle_operand;
13297
13298             case '[':   /* Is a bracketed character class */
13299             {
13300                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13301
13302                 if (! is_posix_class) {
13303                     RExC_parse++;
13304                 }
13305
13306                 /* regclass() can only return RESTART_UTF8 if multi-char
13307                    folds are allowed.  */
13308                 if(!regclass(pRExC_state, flagp,depth+1,
13309                              is_posix_class, /* parse the whole char class
13310                                                 only if not a posix class */
13311                              FALSE, /* don't allow multi-char folds */
13312                              FALSE, /* don't silence non-portable warnings.  */
13313                              &current))
13314                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13315                           (UV) *flagp);
13316                 /* function call leaves parse pointing to the ']', except if we
13317                  * faked it */
13318                 if (is_posix_class) {
13319                     RExC_parse--;
13320                 }
13321
13322                 goto handle_operand;
13323             }
13324
13325             case '&':
13326             case '|':
13327             case '+':
13328             case '-':
13329             case '^':
13330                 if (top_index < 0
13331                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13332                     || ! IS_OPERAND(*top_ptr))
13333                 {
13334                     RExC_parse++;
13335                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13336                 }
13337                 av_push(stack, newSVuv(curchar));
13338                 break;
13339
13340             case '!':
13341                 av_push(stack, newSVuv(curchar));
13342                 break;
13343
13344             case '(':
13345                 if (top_index >= 0) {
13346                     top_ptr = av_fetch(stack, top_index, FALSE);
13347                     assert(top_ptr);
13348                     if (IS_OPERAND(*top_ptr)) {
13349                         RExC_parse++;
13350                         vFAIL("Unexpected '(' with no preceding operator");
13351                     }
13352                 }
13353                 av_push(stack, newSVuv(curchar));
13354                 break;
13355
13356             case ')':
13357             {
13358                 SV* lparen;
13359                 if (top_index < 1
13360                     || ! (current = av_pop(stack))
13361                     || ! IS_OPERAND(current)
13362                     || ! (lparen = av_pop(stack))
13363                     || IS_OPERAND(lparen)
13364                     || SvUV(lparen) != '(')
13365                 {
13366                     SvREFCNT_dec(current);
13367                     RExC_parse++;
13368                     vFAIL("Unexpected ')'");
13369                 }
13370                 top_index -= 2;
13371                 SvREFCNT_dec_NN(lparen);
13372
13373                 /* FALLTHROUGH */
13374             }
13375
13376               handle_operand:
13377
13378                 /* Here, we have an operand to process, in 'current' */
13379
13380                 if (top_index < 0) {    /* Just push if stack is empty */
13381                     av_push(stack, current);
13382                 }
13383                 else {
13384                     SV* top = av_pop(stack);
13385                     SV *prev = NULL;
13386                     char current_operator;
13387
13388                     if (IS_OPERAND(top)) {
13389                         SvREFCNT_dec_NN(top);
13390                         SvREFCNT_dec_NN(current);
13391                         vFAIL("Operand with no preceding operator");
13392                     }
13393                     current_operator = (char) SvUV(top);
13394                     switch (current_operator) {
13395                         case '(':   /* Push the '(' back on followed by the new
13396                                        operand */
13397                             av_push(stack, top);
13398                             av_push(stack, current);
13399                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13400                                                    just after the 'break', so
13401                                                    it doesn't get wrongly freed
13402                                                  */
13403                             break;
13404
13405                         case '!':
13406                             _invlist_invert(current);
13407
13408                             /* Unlike binary operators, the top of the stack,
13409                              * now that this unary one has been popped off, may
13410                              * legally be an operator, and we now have operand
13411                              * for it. */
13412                             top_index--;
13413                             SvREFCNT_dec_NN(top);
13414                             goto handle_operand;
13415
13416                         case '&':
13417                             prev = av_pop(stack);
13418                             _invlist_intersection(prev,
13419                                                    current,
13420                                                    &current);
13421                             av_push(stack, current);
13422                             break;
13423
13424                         case '|':
13425                         case '+':
13426                             prev = av_pop(stack);
13427                             _invlist_union(prev, current, &current);
13428                             av_push(stack, current);
13429                             break;
13430
13431                         case '-':
13432                             prev = av_pop(stack);;
13433                             _invlist_subtract(prev, current, &current);
13434                             av_push(stack, current);
13435                             break;
13436
13437                         case '^':   /* The union minus the intersection */
13438                         {
13439                             SV* i = NULL;
13440                             SV* u = NULL;
13441                             SV* element;
13442
13443                             prev = av_pop(stack);
13444                             _invlist_union(prev, current, &u);
13445                             _invlist_intersection(prev, current, &i);
13446                             /* _invlist_subtract will overwrite current
13447                                 without freeing what it already contains */
13448                             element = current;
13449                             _invlist_subtract(u, i, &current);
13450                             av_push(stack, current);
13451                             SvREFCNT_dec_NN(i);
13452                             SvREFCNT_dec_NN(u);
13453                             SvREFCNT_dec_NN(element);
13454                             break;
13455                         }
13456
13457                         default:
13458                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13459                 }
13460                 SvREFCNT_dec_NN(top);
13461                 SvREFCNT_dec(prev);
13462             }
13463         }
13464
13465         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13466     }
13467
13468     if (av_tindex(stack) < 0   /* Was empty */
13469         || ((final = av_pop(stack)) == NULL)
13470         || ! IS_OPERAND(final)
13471         || av_tindex(stack) >= 0)  /* More left on stack */
13472     {
13473         vFAIL("Incomplete expression within '(?[ ])'");
13474     }
13475
13476     /* Here, 'final' is the resultant inversion list from evaluating the
13477      * expression.  Return it if so requested */
13478     if (return_invlist) {
13479         *return_invlist = final;
13480         return END;
13481     }
13482
13483     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13484      * expecting a string of ranges and individual code points */
13485     invlist_iterinit(final);
13486     result_string = newSVpvs("");
13487     while (invlist_iternext(final, &start, &end)) {
13488         if (start == end) {
13489             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13490         }
13491         else {
13492             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13493                                                      start,          end);
13494         }
13495     }
13496
13497     save_parse = RExC_parse;
13498     RExC_parse = SvPV(result_string, len);
13499     save_end = RExC_end;
13500     RExC_end = RExC_parse + len;
13501
13502     /* We turn off folding around the call, as the class we have constructed
13503      * already has all folding taken into consideration, and we don't want
13504      * regclass() to add to that */
13505     RExC_flags &= ~RXf_PMf_FOLD;
13506     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13507      */
13508     node = regclass(pRExC_state, flagp,depth+1,
13509                     FALSE, /* means parse the whole char class */
13510                     FALSE, /* don't allow multi-char folds */
13511                     TRUE, /* silence non-portable warnings.  The above may very
13512                              well have generated non-portable code points, but
13513                              they're valid on this machine */
13514                     NULL);
13515     if (!node)
13516         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13517                     PTR2UV(flagp));
13518     if (save_fold) {
13519         RExC_flags |= RXf_PMf_FOLD;
13520     }
13521     RExC_parse = save_parse + 1;
13522     RExC_end = save_end;
13523     SvREFCNT_dec_NN(final);
13524     SvREFCNT_dec_NN(result_string);
13525
13526     nextchar(pRExC_state);
13527     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13528     return node;
13529 }
13530 #undef IS_OPERAND
13531
13532 STATIC void
13533 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13534 {
13535     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13536      * innocent-looking character class, like /[ks]/i won't have to go out to
13537      * disk to find the possible matches.
13538      *
13539      * This should be called only for a Latin1-range code points, cp, which is
13540      * known to be involved in a simple fold with other code points above
13541      * Latin1.  It would give false results if /aa has been specified.
13542      * Multi-char folds are outside the scope of this, and must be handled
13543      * specially.
13544      *
13545      * XXX It would be better to generate these via regen, in case a new
13546      * version of the Unicode standard adds new mappings, though that is not
13547      * really likely, and may be caught by the default: case of the switch
13548      * below. */
13549
13550     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13551
13552     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13553
13554     switch (cp) {
13555         case 'k':
13556         case 'K':
13557           *invlist =
13558              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13559             break;
13560         case 's':
13561         case 'S':
13562           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13563             break;
13564         case MICRO_SIGN:
13565           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13566           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13567             break;
13568         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13569         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13570           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13571             break;
13572         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13573           *invlist = add_cp_to_invlist(*invlist,
13574                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13575             break;
13576         case LATIN_SMALL_LETTER_SHARP_S:
13577           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13578             break;
13579         default:
13580             /* Use deprecated warning to increase the chances of this being
13581              * output */
13582             if (PASS2) {
13583                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13584             }
13585             break;
13586     }
13587 }
13588
13589 STATIC AV *
13590 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13591 {
13592     /* This adds the string scalar <multi_string> to the array
13593      * <multi_char_matches>.  <multi_string> is known to have exactly
13594      * <cp_count> code points in it.  This is used when constructing a
13595      * bracketed character class and we find something that needs to match more
13596      * than a single character.
13597      *
13598      * <multi_char_matches> is actually an array of arrays.  Each top-level
13599      * element is an array that contains all the strings known so far that are
13600      * the same length.  And that length (in number of code points) is the same
13601      * as the index of the top-level array.  Hence, the [2] element is an
13602      * array, each element thereof is a string containing TWO code points;
13603      * while element [3] is for strings of THREE characters, and so on.  Since
13604      * this is for multi-char strings there can never be a [0] nor [1] element.
13605      *
13606      * When we rewrite the character class below, we will do so such that the
13607      * longest strings are written first, so that it prefers the longest
13608      * matching strings first.  This is done even if it turns out that any
13609      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
13610      * Christiansen has agreed that this is ok.  This makes the test for the
13611      * ligature 'ffi' come before the test for 'ff', for example */
13612
13613     AV* this_array;
13614     AV** this_array_ptr;
13615
13616     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13617
13618     if (! multi_char_matches) {
13619         multi_char_matches = newAV();
13620     }
13621
13622     if (av_exists(multi_char_matches, cp_count)) {
13623         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13624         this_array = *this_array_ptr;
13625     }
13626     else {
13627         this_array = newAV();
13628         av_store(multi_char_matches, cp_count,
13629                  (SV*) this_array);
13630     }
13631     av_push(this_array, multi_string);
13632
13633     return multi_char_matches;
13634 }
13635
13636 /* The names of properties whose definitions are not known at compile time are
13637  * stored in this SV, after a constant heading.  So if the length has been
13638  * changed since initialization, then there is a run-time definition. */
13639 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13640                                         (SvCUR(listsv) != initial_listsv_len)
13641
13642 STATIC regnode *
13643 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13644                  const bool stop_at_1,  /* Just parse the next thing, don't
13645                                            look for a full character class */
13646                  bool allow_multi_folds,
13647                  const bool silence_non_portable,   /* Don't output warnings
13648                                                        about too large
13649                                                        characters */
13650                  SV** ret_invlist)  /* Return an inversion list, not a node */
13651 {
13652     /* parse a bracketed class specification.  Most of these will produce an
13653      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13654      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13655      * under /i with multi-character folds: it will be rewritten following the
13656      * paradigm of this example, where the <multi-fold>s are characters which
13657      * fold to multiple character sequences:
13658      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13659      * gets effectively rewritten as:
13660      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13661      * reg() gets called (recursively) on the rewritten version, and this
13662      * function will return what it constructs.  (Actually the <multi-fold>s
13663      * aren't physically removed from the [abcdefghi], it's just that they are
13664      * ignored in the recursion by means of a flag:
13665      * <RExC_in_multi_char_class>.)
13666      *
13667      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13668      * characters, with the corresponding bit set if that character is in the
13669      * list.  For characters above this, a range list or swash is used.  There
13670      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13671      * determinable at compile time
13672      *
13673      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13674      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13675      */
13676
13677     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13678     IV range = 0;
13679     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13680     regnode *ret;
13681     STRLEN numlen;
13682     IV namedclass = OOB_NAMEDCLASS;
13683     char *rangebegin = NULL;
13684     bool need_class = 0;
13685     SV *listsv = NULL;
13686     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13687                                       than just initialized.  */
13688     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13689     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13690                                extended beyond the Latin1 range.  These have to
13691                                be kept separate from other code points for much
13692                                of this function because their handling  is
13693                                different under /i, and for most classes under
13694                                /d as well */
13695     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13696                                separate for a while from the non-complemented
13697                                versions because of complications with /d
13698                                matching */
13699     UV element_count = 0;   /* Number of distinct elements in the class.
13700                                Optimizations may be possible if this is tiny */
13701     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13702                                        character; used under /i */
13703     UV n;
13704     char * stop_ptr = RExC_end;    /* where to stop parsing */
13705     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13706                                                    space? */
13707     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13708
13709     /* Unicode properties are stored in a swash; this holds the current one
13710      * being parsed.  If this swash is the only above-latin1 component of the
13711      * character class, an optimization is to pass it directly on to the
13712      * execution engine.  Otherwise, it is set to NULL to indicate that there
13713      * are other things in the class that have to be dealt with at execution
13714      * time */
13715     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13716
13717     /* Set if a component of this character class is user-defined; just passed
13718      * on to the engine */
13719     bool has_user_defined_property = FALSE;
13720
13721     /* inversion list of code points this node matches only when the target
13722      * string is in UTF-8.  (Because is under /d) */
13723     SV* depends_list = NULL;
13724
13725     /* Inversion list of code points this node matches regardless of things
13726      * like locale, folding, utf8ness of the target string */
13727     SV* cp_list = NULL;
13728
13729     /* Like cp_list, but code points on this list need to be checked for things
13730      * that fold to/from them under /i */
13731     SV* cp_foldable_list = NULL;
13732
13733     /* Like cp_list, but code points on this list are valid only when the
13734      * runtime locale is UTF-8 */
13735     SV* only_utf8_locale_list = NULL;
13736
13737 #ifdef EBCDIC
13738     /* In a range, counts how many 0-2 of the ends of it came from literals,
13739      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13740     UV literal_endpoint = 0;
13741 #endif
13742     bool invert = FALSE;    /* Is this class to be complemented */
13743
13744     bool warn_super = ALWAYS_WARN_SUPER;
13745
13746     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13747         case we need to change the emitted regop to an EXACT. */
13748     const char * orig_parse = RExC_parse;
13749     const SSize_t orig_size = RExC_size;
13750     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13751     GET_RE_DEBUG_FLAGS_DECL;
13752
13753     PERL_ARGS_ASSERT_REGCLASS;
13754 #ifndef DEBUGGING
13755     PERL_UNUSED_ARG(depth);
13756 #endif
13757
13758     DEBUG_PARSE("clas");
13759
13760     /* Assume we are going to generate an ANYOF node. */
13761     ret = reganode(pRExC_state, ANYOF, 0);
13762
13763     if (SIZE_ONLY) {
13764         RExC_size += ANYOF_SKIP;
13765         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13766     }
13767     else {
13768         ANYOF_FLAGS(ret) = 0;
13769
13770         RExC_emit += ANYOF_SKIP;
13771         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13772         initial_listsv_len = SvCUR(listsv);
13773         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13774     }
13775
13776     if (skip_white) {
13777         RExC_parse = regpatws(pRExC_state, RExC_parse,
13778                               FALSE /* means don't recognize comments */ );
13779     }
13780
13781     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13782         RExC_parse++;
13783         invert = TRUE;
13784         allow_multi_folds = FALSE;
13785         RExC_naughty++;
13786         if (skip_white) {
13787             RExC_parse = regpatws(pRExC_state, RExC_parse,
13788                                   FALSE /* means don't recognize comments */ );
13789         }
13790     }
13791
13792     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13793     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13794         const char *s = RExC_parse;
13795         const char  c = *s++;
13796
13797         while (isWORDCHAR(*s))
13798             s++;
13799         if (*s && c == *s && s[1] == ']') {
13800             SAVEFREESV(RExC_rx_sv);
13801             ckWARN3reg(s+2,
13802                        "POSIX syntax [%c %c] belongs inside character classes",
13803                        c, c);
13804             (void)ReREFCNT_inc(RExC_rx_sv);
13805         }
13806     }
13807
13808     /* If the caller wants us to just parse a single element, accomplish this
13809      * by faking the loop ending condition */
13810     if (stop_at_1 && RExC_end > RExC_parse) {
13811         stop_ptr = RExC_parse + 1;
13812     }
13813
13814     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13815     if (UCHARAT(RExC_parse) == ']')
13816         goto charclassloop;
13817
13818     while (1) {
13819         if  (RExC_parse >= stop_ptr) {
13820             break;
13821         }
13822
13823         if (skip_white) {
13824             RExC_parse = regpatws(pRExC_state, RExC_parse,
13825                                   FALSE /* means don't recognize comments */ );
13826         }
13827
13828         if  (UCHARAT(RExC_parse) == ']') {
13829             break;
13830         }
13831
13832     charclassloop:
13833
13834         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13835         save_value = value;
13836         save_prevvalue = prevvalue;
13837
13838         if (!range) {
13839             rangebegin = RExC_parse;
13840             element_count++;
13841         }
13842         if (UTF) {
13843             value = utf8n_to_uvchr((U8*)RExC_parse,
13844                                    RExC_end - RExC_parse,
13845                                    &numlen, UTF8_ALLOW_DEFAULT);
13846             RExC_parse += numlen;
13847         }
13848         else
13849             value = UCHARAT(RExC_parse++);
13850
13851         if (value == '['
13852             && RExC_parse < RExC_end
13853             && POSIXCC(UCHARAT(RExC_parse)))
13854         {
13855             namedclass = regpposixcc(pRExC_state, value, strict);
13856         }
13857         else if (value != '\\') {
13858 #ifdef EBCDIC
13859             literal_endpoint++;
13860 #endif
13861         }
13862         else {
13863             /* Is a backslash; get the code point of the char after it */
13864             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13865                 value = utf8n_to_uvchr((U8*)RExC_parse,
13866                                    RExC_end - RExC_parse,
13867                                    &numlen, UTF8_ALLOW_DEFAULT);
13868                 RExC_parse += numlen;
13869             }
13870             else
13871                 value = UCHARAT(RExC_parse++);
13872
13873             /* Some compilers cannot handle switching on 64-bit integer
13874              * values, therefore value cannot be an UV.  Yes, this will
13875              * be a problem later if we want switch on Unicode.
13876              * A similar issue a little bit later when switching on
13877              * namedclass. --jhi */
13878
13879             /* If the \ is escaping white space when white space is being
13880              * skipped, it means that that white space is wanted literally, and
13881              * is already in 'value'.  Otherwise, need to translate the escape
13882              * into what it signifies. */
13883             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13884
13885             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13886             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13887             case 's':   namedclass = ANYOF_SPACE;       break;
13888             case 'S':   namedclass = ANYOF_NSPACE;      break;
13889             case 'd':   namedclass = ANYOF_DIGIT;       break;
13890             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13891             case 'v':   namedclass = ANYOF_VERTWS;      break;
13892             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13893             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13894             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13895             case 'N':  /* Handle \N{NAME} in class */
13896                 {
13897                     SV *as_text;
13898                     STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13899                                                     flagp, depth, &as_text);
13900                     if (*flagp & RESTART_UTF8)
13901                         FAIL("panic: grok_bslash_N set RESTART_UTF8");
13902                     if (cp_count != 1) {    /* The typical case drops through */
13903                         assert(cp_count != (STRLEN) -1);
13904                         if (cp_count == 0) {
13905                             if (strict) {
13906                                 RExC_parse++;   /* Position after the "}" */
13907                                 vFAIL("Zero length \\N{}");
13908                             }
13909                             else if (PASS2) {
13910                                 ckWARNreg(RExC_parse,
13911                                         "Ignoring zero length \\N{} in character class");
13912                             }
13913                         }
13914                         else { /* cp_count > 1 */
13915                             if (! RExC_in_multi_char_class) {
13916                                 if (invert || range || *RExC_parse == '-') {
13917                                     if (strict) {
13918                                         RExC_parse--;
13919                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13920                                     }
13921                                     else if (PASS2) {
13922                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13923                                     }
13924                                 }
13925                                 else {
13926                                     multi_char_matches
13927                                         = add_multi_match(multi_char_matches,
13928                                                           as_text,
13929                                                           cp_count);
13930                                 }
13931                                 break; /* <value> contains the first code
13932                                           point. Drop out of the switch to
13933                                           process it */
13934                             }
13935                         } /* End of cp_count != 1 */
13936
13937                         /* This element should not be processed further in this
13938                          * class */
13939                         element_count--;
13940                         value = save_value;
13941                         prevvalue = save_prevvalue;
13942                         continue;   /* Back to top of loop to get next char */
13943                     }
13944                     /* Here, is a single code point, and <value> contains it */
13945 #ifdef EBCDIC
13946                     /* We consider named characters to be literal characters */
13947                     literal_endpoint++;
13948 #endif
13949                 }
13950                 break;
13951             case 'p':
13952             case 'P':
13953                 {
13954                 char *e;
13955
13956                 /* We will handle any undefined properties ourselves */
13957                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13958                                        /* And we actually would prefer to get
13959                                         * the straight inversion list of the
13960                                         * swash, since we will be accessing it
13961                                         * anyway, to save a little time */
13962                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13963
13964                 if (RExC_parse >= RExC_end)
13965                     vFAIL2("Empty \\%c{}", (U8)value);
13966                 if (*RExC_parse == '{') {
13967                     const U8 c = (U8)value;
13968                     e = strchr(RExC_parse++, '}');
13969                     if (!e)
13970                         vFAIL2("Missing right brace on \\%c{}", c);
13971                     while (isSPACE(*RExC_parse))
13972                         RExC_parse++;
13973                     if (e == RExC_parse)
13974                         vFAIL2("Empty \\%c{}", c);
13975                     n = e - RExC_parse;
13976                     while (isSPACE(*(RExC_parse + n - 1)))
13977                         n--;
13978                 }
13979                 else {
13980                     e = RExC_parse;
13981                     n = 1;
13982                 }
13983                 if (!SIZE_ONLY) {
13984                     SV* invlist;
13985                     char* name;
13986
13987                     if (UCHARAT(RExC_parse) == '^') {
13988                          RExC_parse++;
13989                          n--;
13990                          /* toggle.  (The rhs xor gets the single bit that
13991                           * differs between P and p; the other xor inverts just
13992                           * that bit) */
13993                          value ^= 'P' ^ 'p';
13994
13995                          while (isSPACE(*RExC_parse)) {
13996                               RExC_parse++;
13997                               n--;
13998                          }
13999                     }
14000                     /* Try to get the definition of the property into
14001                      * <invlist>.  If /i is in effect, the effective property
14002                      * will have its name be <__NAME_i>.  The design is
14003                      * discussed in commit
14004                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14005                     name = savepv(Perl_form(aTHX_
14006                                           "%s%.*s%s\n",
14007                                           (FOLD) ? "__" : "",
14008                                           (int)n,
14009                                           RExC_parse,
14010                                           (FOLD) ? "_i" : ""
14011                                 ));
14012
14013                     /* Look up the property name, and get its swash and
14014                      * inversion list, if the property is found  */
14015                     if (swash) {
14016                         SvREFCNT_dec_NN(swash);
14017                     }
14018                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
14019                                              1, /* binary */
14020                                              0, /* not tr/// */
14021                                              NULL, /* No inversion list */
14022                                              &swash_init_flags
14023                                             );
14024                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14025                         HV* curpkg = (IN_PERL_COMPILETIME)
14026                                       ? PL_curstash
14027                                       : CopSTASH(PL_curcop);
14028                         if (swash) {
14029                             SvREFCNT_dec_NN(swash);
14030                             swash = NULL;
14031                         }
14032
14033                         /* Here didn't find it.  It could be a user-defined
14034                          * property that will be available at run-time.  If we
14035                          * accept only compile-time properties, is an error;
14036                          * otherwise add it to the list for run-time look up */
14037                         if (ret_invlist) {
14038                             RExC_parse = e + 1;
14039                             vFAIL2utf8f(
14040                                 "Property '%"UTF8f"' is unknown",
14041                                 UTF8fARG(UTF, n, name));
14042                         }
14043
14044                         /* If the property name doesn't already have a package
14045                          * name, add the current one to it so that it can be
14046                          * referred to outside it. [perl #121777] */
14047                         if (curpkg && ! instr(name, "::")) {
14048                             char* pkgname = HvNAME(curpkg);
14049                             if (strNE(pkgname, "main")) {
14050                                 char* full_name = Perl_form(aTHX_
14051                                                             "%s::%s",
14052                                                             pkgname,
14053                                                             name);
14054                                 n = strlen(full_name);
14055                                 Safefree(name);
14056                                 name = savepvn(full_name, n);
14057                             }
14058                         }
14059                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14060                                         (value == 'p' ? '+' : '!'),
14061                                         UTF8fARG(UTF, n, name));
14062                         has_user_defined_property = TRUE;
14063
14064                         /* We don't know yet, so have to assume that the
14065                          * property could match something in the Latin1 range,
14066                          * hence something that isn't utf8.  Note that this
14067                          * would cause things in <depends_list> to match
14068                          * inappropriately, except that any \p{}, including
14069                          * this one forces Unicode semantics, which means there
14070                          * is no <depends_list> */
14071                         ANYOF_FLAGS(ret)
14072                                       |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14073                     }
14074                     else {
14075
14076                         /* Here, did get the swash and its inversion list.  If
14077                          * the swash is from a user-defined property, then this
14078                          * whole character class should be regarded as such */
14079                         if (swash_init_flags
14080                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14081                         {
14082                             has_user_defined_property = TRUE;
14083                         }
14084                         else if
14085                             /* We warn on matching an above-Unicode code point
14086                              * if the match would return true, except don't
14087                              * warn for \p{All}, which has exactly one element
14088                              * = 0 */
14089                             (_invlist_contains_cp(invlist, 0x110000)
14090                                 && (! (_invlist_len(invlist) == 1
14091                                        && *invlist_array(invlist) == 0)))
14092                         {
14093                             warn_super = TRUE;
14094                         }
14095
14096
14097                         /* Invert if asking for the complement */
14098                         if (value == 'P') {
14099                             _invlist_union_complement_2nd(properties,
14100                                                           invlist,
14101                                                           &properties);
14102
14103                             /* The swash can't be used as-is, because we've
14104                              * inverted things; delay removing it to here after
14105                              * have copied its invlist above */
14106                             SvREFCNT_dec_NN(swash);
14107                             swash = NULL;
14108                         }
14109                         else {
14110                             _invlist_union(properties, invlist, &properties);
14111                         }
14112                     }
14113                     Safefree(name);
14114                 }
14115                 RExC_parse = e + 1;
14116                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
14117                                                 named */
14118
14119                 /* \p means they want Unicode semantics */
14120                 RExC_uni_semantics = 1;
14121                 }
14122                 break;
14123             case 'n':   value = '\n';                   break;
14124             case 'r':   value = '\r';                   break;
14125             case 't':   value = '\t';                   break;
14126             case 'f':   value = '\f';                   break;
14127             case 'b':   value = '\b';                   break;
14128             case 'e':   value = ESC_NATIVE;             break;
14129             case 'a':   value = '\a';                   break;
14130             case 'o':
14131                 RExC_parse--;   /* function expects to be pointed at the 'o' */
14132                 {
14133                     const char* error_msg;
14134                     bool valid = grok_bslash_o(&RExC_parse,
14135                                                &value,
14136                                                &error_msg,
14137                                                PASS2,   /* warnings only in
14138                                                            pass 2 */
14139                                                strict,
14140                                                silence_non_portable,
14141                                                UTF);
14142                     if (! valid) {
14143                         vFAIL(error_msg);
14144                     }
14145                 }
14146                 if (PL_encoding && value < 0x100) {
14147                     goto recode_encoding;
14148                 }
14149                 break;
14150             case 'x':
14151                 RExC_parse--;   /* function expects to be pointed at the 'x' */
14152                 {
14153                     const char* error_msg;
14154                     bool valid = grok_bslash_x(&RExC_parse,
14155                                                &value,
14156                                                &error_msg,
14157                                                PASS2, /* Output warnings */
14158                                                strict,
14159                                                silence_non_portable,
14160                                                UTF);
14161                     if (! valid) {
14162                         vFAIL(error_msg);
14163                     }
14164                 }
14165                 if (PL_encoding && value < 0x100)
14166                     goto recode_encoding;
14167                 break;
14168             case 'c':
14169                 value = grok_bslash_c(*RExC_parse++, PASS2);
14170                 break;
14171             case '0': case '1': case '2': case '3': case '4':
14172             case '5': case '6': case '7':
14173                 {
14174                     /* Take 1-3 octal digits */
14175                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14176                     numlen = (strict) ? 4 : 3;
14177                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14178                     RExC_parse += numlen;
14179                     if (numlen != 3) {
14180                         if (strict) {
14181                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14182                             vFAIL("Need exactly 3 octal digits");
14183                         }
14184                         else if (! SIZE_ONLY /* like \08, \178 */
14185                                  && numlen < 3
14186                                  && RExC_parse < RExC_end
14187                                  && isDIGIT(*RExC_parse)
14188                                  && ckWARN(WARN_REGEXP))
14189                         {
14190                             SAVEFREESV(RExC_rx_sv);
14191                             reg_warn_non_literal_string(
14192                                  RExC_parse + 1,
14193                                  form_short_octal_warning(RExC_parse, numlen));
14194                             (void)ReREFCNT_inc(RExC_rx_sv);
14195                         }
14196                     }
14197                     if (PL_encoding && value < 0x100)
14198                         goto recode_encoding;
14199                     break;
14200                 }
14201             recode_encoding:
14202                 if (! RExC_override_recoding) {
14203                     SV* enc = PL_encoding;
14204                     value = reg_recode((const char)(U8)value, &enc);
14205                     if (!enc) {
14206                         if (strict) {
14207                             vFAIL("Invalid escape in the specified encoding");
14208                         }
14209                         else if (PASS2) {
14210                             ckWARNreg(RExC_parse,
14211                                   "Invalid escape in the specified encoding");
14212                         }
14213                     }
14214                     break;
14215                 }
14216             default:
14217                 /* Allow \_ to not give an error */
14218                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14219                     if (strict) {
14220                         vFAIL2("Unrecognized escape \\%c in character class",
14221                                (int)value);
14222                     }
14223                     else {
14224                         SAVEFREESV(RExC_rx_sv);
14225                         ckWARN2reg(RExC_parse,
14226                             "Unrecognized escape \\%c in character class passed through",
14227                             (int)value);
14228                         (void)ReREFCNT_inc(RExC_rx_sv);
14229                     }
14230                 }
14231                 break;
14232             }   /* End of switch on char following backslash */
14233         } /* end of handling backslash escape sequences */
14234
14235         /* Here, we have the current token in 'value' */
14236
14237         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14238             U8 classnum;
14239
14240             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
14241              * literal, as is the character that began the false range, i.e.
14242              * the 'a' in the examples */
14243             if (range) {
14244                 if (!SIZE_ONLY) {
14245                     const int w = (RExC_parse >= rangebegin)
14246                                   ? RExC_parse - rangebegin
14247                                   : 0;
14248                     if (strict) {
14249                         vFAIL2utf8f(
14250                             "False [] range \"%"UTF8f"\"",
14251                             UTF8fARG(UTF, w, rangebegin));
14252                     }
14253                     else {
14254                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14255                         ckWARN2reg(RExC_parse,
14256                             "False [] range \"%"UTF8f"\"",
14257                             UTF8fARG(UTF, w, rangebegin));
14258                         (void)ReREFCNT_inc(RExC_rx_sv);
14259                         cp_list = add_cp_to_invlist(cp_list, '-');
14260                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14261                                                              prevvalue);
14262                     }
14263                 }
14264
14265                 range = 0; /* this was not a true range */
14266                 element_count += 2; /* So counts for three values */
14267             }
14268
14269             classnum = namedclass_to_classnum(namedclass);
14270
14271             if (LOC && namedclass < ANYOF_POSIXL_MAX
14272 #ifndef HAS_ISASCII
14273                 && classnum != _CC_ASCII
14274 #endif
14275             ) {
14276                 /* What the Posix classes (like \w, [:space:]) match in locale
14277                  * isn't knowable under locale until actual match time.  Room
14278                  * must be reserved (one time per outer bracketed class) to
14279                  * store such classes.  The space will contain a bit for each
14280                  * named class that is to be matched against.  This isn't
14281                  * needed for \p{} and pseudo-classes, as they are not affected
14282                  * by locale, and hence are dealt with separately */
14283                 if (! need_class) {
14284                     need_class = 1;
14285                     if (SIZE_ONLY) {
14286                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14287                     }
14288                     else {
14289                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14290                     }
14291                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14292                     ANYOF_POSIXL_ZERO(ret);
14293                 }
14294
14295                 /* Coverity thinks it is possible for this to be negative; both
14296                  * jhi and khw think it's not, but be safer */
14297                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14298                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14299
14300                 /* See if it already matches the complement of this POSIX
14301                  * class */
14302                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14303                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14304                                                             ? -1
14305                                                             : 1)))
14306                 {
14307                     posixl_matches_all = TRUE;
14308                     break;  /* No need to continue.  Since it matches both
14309                                e.g., \w and \W, it matches everything, and the
14310                                bracketed class can be optimized into qr/./s */
14311                 }
14312
14313                 /* Add this class to those that should be checked at runtime */
14314                 ANYOF_POSIXL_SET(ret, namedclass);
14315
14316                 /* The above-Latin1 characters are not subject to locale rules.
14317                  * Just add them, in the second pass, to the
14318                  * unconditionally-matched list */
14319                 if (! SIZE_ONLY) {
14320                     SV* scratch_list = NULL;
14321
14322                     /* Get the list of the above-Latin1 code points this
14323                      * matches */
14324                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14325                                           PL_XPosix_ptrs[classnum],
14326
14327                                           /* Odd numbers are complements, like
14328                                            * NDIGIT, NASCII, ... */
14329                                           namedclass % 2 != 0,
14330                                           &scratch_list);
14331                     /* Checking if 'cp_list' is NULL first saves an extra
14332                      * clone.  Its reference count will be decremented at the
14333                      * next union, etc, or if this is the only instance, at the
14334                      * end of the routine */
14335                     if (! cp_list) {
14336                         cp_list = scratch_list;
14337                     }
14338                     else {
14339                         _invlist_union(cp_list, scratch_list, &cp_list);
14340                         SvREFCNT_dec_NN(scratch_list);
14341                     }
14342                     continue;   /* Go get next character */
14343                 }
14344             }
14345             else if (! SIZE_ONLY) {
14346
14347                 /* Here, not in pass1 (in that pass we skip calculating the
14348                  * contents of this class), and is /l, or is a POSIX class for
14349                  * which /l doesn't matter (or is a Unicode property, which is
14350                  * skipped here). */
14351                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
14352                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14353
14354                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
14355                          * nor /l make a difference in what these match,
14356                          * therefore we just add what they match to cp_list. */
14357                         if (classnum != _CC_VERTSPACE) {
14358                             assert(   namedclass == ANYOF_HORIZWS
14359                                    || namedclass == ANYOF_NHORIZWS);
14360
14361                             /* It turns out that \h is just a synonym for
14362                              * XPosixBlank */
14363                             classnum = _CC_BLANK;
14364                         }
14365
14366                         _invlist_union_maybe_complement_2nd(
14367                                 cp_list,
14368                                 PL_XPosix_ptrs[classnum],
14369                                 namedclass % 2 != 0,    /* Complement if odd
14370                                                           (NHORIZWS, NVERTWS)
14371                                                         */
14372                                 &cp_list);
14373                     }
14374                 }
14375                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
14376                            complement and use nposixes */
14377                     SV** posixes_ptr = namedclass % 2 == 0
14378                                        ? &posixes
14379                                        : &nposixes;
14380                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
14381                     _invlist_union_maybe_complement_2nd(
14382                                                      *posixes_ptr,
14383                                                      *source_ptr,
14384                                                      namedclass % 2 != 0,
14385                                                      posixes_ptr);
14386                 }
14387             }
14388         } /* end of namedclass \blah */
14389
14390         if (skip_white) {
14391             RExC_parse = regpatws(pRExC_state, RExC_parse,
14392                                 FALSE /* means don't recognize comments */ );
14393         }
14394
14395         /* If 'range' is set, 'value' is the ending of a range--check its
14396          * validity.  (If value isn't a single code point in the case of a
14397          * range, we should have figured that out above in the code that
14398          * catches false ranges).  Later, we will handle each individual code
14399          * point in the range.  If 'range' isn't set, this could be the
14400          * beginning of a range, so check for that by looking ahead to see if
14401          * the next real character to be processed is the range indicator--the
14402          * minus sign */
14403
14404         if (range) {
14405             if (prevvalue > value) /* b-a */ {
14406                 const int w = RExC_parse - rangebegin;
14407                 vFAIL2utf8f(
14408                     "Invalid [] range \"%"UTF8f"\"",
14409                     UTF8fARG(UTF, w, rangebegin));
14410                 range = 0; /* not a valid range */
14411             }
14412         }
14413         else {
14414             prevvalue = value; /* save the beginning of the potential range */
14415             if (! stop_at_1     /* Can't be a range if parsing just one thing */
14416                 && *RExC_parse == '-')
14417             {
14418                 char* next_char_ptr = RExC_parse + 1;
14419                 if (skip_white) {   /* Get the next real char after the '-' */
14420                     next_char_ptr = regpatws(pRExC_state,
14421                                              RExC_parse + 1,
14422                                              FALSE); /* means don't recognize
14423                                                         comments */
14424                 }
14425
14426                 /* If the '-' is at the end of the class (just before the ']',
14427                  * it is a literal minus; otherwise it is a range */
14428                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14429                     RExC_parse = next_char_ptr;
14430
14431                     /* a bad range like \w-, [:word:]- ? */
14432                     if (namedclass > OOB_NAMEDCLASS) {
14433                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14434                             const int w = RExC_parse >= rangebegin
14435                                           ?  RExC_parse - rangebegin
14436                                           : 0;
14437                             if (strict) {
14438                                 vFAIL4("False [] range \"%*.*s\"",
14439                                     w, w, rangebegin);
14440                             }
14441                             else if (PASS2) {
14442                                 vWARN4(RExC_parse,
14443                                     "False [] range \"%*.*s\"",
14444                                     w, w, rangebegin);
14445                             }
14446                         }
14447                         if (!SIZE_ONLY) {
14448                             cp_list = add_cp_to_invlist(cp_list, '-');
14449                         }
14450                         element_count++;
14451                     } else
14452                         range = 1;      /* yeah, it's a range! */
14453                     continue;   /* but do it the next time */
14454                 }
14455             }
14456         }
14457
14458         if (namedclass > OOB_NAMEDCLASS) {
14459             continue;
14460         }
14461
14462         /* Here, we have a single value, and <prevvalue> is the beginning of
14463          * the range, if any; or <value> if not */
14464
14465         /* non-Latin1 code point implies unicode semantics.  Must be set in
14466          * pass1 so is there for the whole of pass 2 */
14467         if (value > 255) {
14468             RExC_uni_semantics = 1;
14469         }
14470
14471         /* Ready to process either the single value, or the completed range.
14472          * For single-valued non-inverted ranges, we consider the possibility
14473          * of multi-char folds.  (We made a conscious decision to not do this
14474          * for the other cases because it can often lead to non-intuitive
14475          * results.  For example, you have the peculiar case that:
14476          *  "s s" =~ /^[^\xDF]+$/i => Y
14477          *  "ss"  =~ /^[^\xDF]+$/i => N
14478          *
14479          * See [perl #89750] */
14480         if (FOLD && allow_multi_folds && value == prevvalue) {
14481             if (value == LATIN_SMALL_LETTER_SHARP_S
14482                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14483                                                         value)))
14484             {
14485                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14486
14487                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14488                 STRLEN foldlen;
14489
14490                 UV folded = _to_uni_fold_flags(
14491                                 value,
14492                                 foldbuf,
14493                                 &foldlen,
14494                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14495                                                    ? FOLD_FLAGS_NOMIX_ASCII
14496                                                    : 0)
14497                                 );
14498
14499                 /* Here, <folded> should be the first character of the
14500                  * multi-char fold of <value>, with <foldbuf> containing the
14501                  * whole thing.  But, if this fold is not allowed (because of
14502                  * the flags), <fold> will be the same as <value>, and should
14503                  * be processed like any other character, so skip the special
14504                  * handling */
14505                 if (folded != value) {
14506
14507                     /* Skip if we are recursed, currently parsing the class
14508                      * again.  Otherwise add this character to the list of
14509                      * multi-char folds. */
14510                     if (! RExC_in_multi_char_class) {
14511                         STRLEN cp_count = utf8_length(foldbuf,
14512                                                       foldbuf + foldlen);
14513                         SV* multi_fold = sv_2mortal(newSVpvs(""));
14514
14515                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14516
14517                         multi_char_matches
14518                                         = add_multi_match(multi_char_matches,
14519                                                           multi_fold,
14520                                                           cp_count);
14521
14522                     }
14523
14524                     /* This element should not be processed further in this
14525                      * class */
14526                     element_count--;
14527                     value = save_value;
14528                     prevvalue = save_prevvalue;
14529                     continue;
14530                 }
14531             }
14532         }
14533
14534         /* Deal with this element of the class */
14535         if (! SIZE_ONLY) {
14536 #ifndef EBCDIC
14537             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14538                                                      prevvalue, value);
14539 #else
14540             SV* this_range = _new_invlist(1);
14541             _append_range_to_invlist(this_range, prevvalue, value);
14542
14543             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14544              * If this range was specified using something like 'i-j', we want
14545              * to include only the 'i' and the 'j', and not anything in
14546              * between, so exclude non-ASCII, non-alphabetics from it.
14547              * However, if the range was specified with something like
14548              * [\x89-\x91] or [\x89-j], all code points within it should be
14549              * included.  literal_endpoint==2 means both ends of the range used
14550              * a literal character, not \x{foo} */
14551             if (literal_endpoint == 2
14552                 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14553                     || (isUPPER_A(prevvalue) && isUPPER_A(value))))
14554             {
14555                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14556                                       &this_range);
14557
14558                 /* Since 'this_range' now only contains ascii, the intersection
14559                  * of it with anything will still yield only ascii */
14560                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14561                                       &this_range);
14562             }
14563             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14564             literal_endpoint = 0;
14565             SvREFCNT_dec_NN(this_range);
14566 #endif
14567         }
14568
14569         range = 0; /* this range (if it was one) is done now */
14570     } /* End of loop through all the text within the brackets */
14571
14572     /* If anything in the class expands to more than one character, we have to
14573      * deal with them by building up a substitute parse string, and recursively
14574      * calling reg() on it, instead of proceeding */
14575     if (multi_char_matches) {
14576         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14577         I32 cp_count;
14578         STRLEN len;
14579         char *save_end = RExC_end;
14580         char *save_parse = RExC_parse;
14581         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14582                                        a "|" */
14583         I32 reg_flags;
14584
14585         assert(! invert);
14586 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14587            because too confusing */
14588         if (invert) {
14589             sv_catpv(substitute_parse, "(?:");
14590         }
14591 #endif
14592
14593         /* Look at the longest folds first */
14594         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14595
14596             if (av_exists(multi_char_matches, cp_count)) {
14597                 AV** this_array_ptr;
14598                 SV* this_sequence;
14599
14600                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14601                                                  cp_count, FALSE);
14602                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14603                                                                 &PL_sv_undef)
14604                 {
14605                     if (! first_time) {
14606                         sv_catpv(substitute_parse, "|");
14607                     }
14608                     first_time = FALSE;
14609
14610                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14611                 }
14612             }
14613         }
14614
14615         /* If the character class contains anything else besides these
14616          * multi-character folds, have to include it in recursive parsing */
14617         if (element_count) {
14618             sv_catpv(substitute_parse, "|[");
14619             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14620             sv_catpv(substitute_parse, "]");
14621         }
14622
14623         sv_catpv(substitute_parse, ")");
14624 #if 0
14625         if (invert) {
14626             /* This is a way to get the parse to skip forward a whole named
14627              * sequence instead of matching the 2nd character when it fails the
14628              * first */
14629             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14630         }
14631 #endif
14632
14633         RExC_parse = SvPV(substitute_parse, len);
14634         RExC_end = RExC_parse + len;
14635         RExC_in_multi_char_class = 1;
14636         RExC_override_recoding = 1;
14637         RExC_emit = (regnode *)orig_emit;
14638
14639         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14640
14641         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14642
14643         RExC_parse = save_parse;
14644         RExC_end = save_end;
14645         RExC_in_multi_char_class = 0;
14646         RExC_override_recoding = 0;
14647         SvREFCNT_dec_NN(multi_char_matches);
14648         return ret;
14649     }
14650
14651     /* Here, we've gone through the entire class and dealt with multi-char
14652      * folds.  We are now in a position that we can do some checks to see if we
14653      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14654      * Currently we only do two checks:
14655      * 1) is in the unlikely event that the user has specified both, eg. \w and
14656      *    \W under /l, then the class matches everything.  (This optimization
14657      *    is done only to make the optimizer code run later work.)
14658      * 2) if the character class contains only a single element (including a
14659      *    single range), we see if there is an equivalent node for it.
14660      * Other checks are possible */
14661     if (! ret_invlist   /* Can't optimize if returning the constructed
14662                            inversion list */
14663         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14664     {
14665         U8 op = END;
14666         U8 arg = 0;
14667
14668         if (UNLIKELY(posixl_matches_all)) {
14669             op = SANY;
14670         }
14671         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14672                                                    \w or [:digit:] or \p{foo}
14673                                                  */
14674
14675             /* All named classes are mapped into POSIXish nodes, with its FLAG
14676              * argument giving which class it is */
14677             switch ((I32)namedclass) {
14678                 case ANYOF_UNIPROP:
14679                     break;
14680
14681                 /* These don't depend on the charset modifiers.  They always
14682                  * match under /u rules */
14683                 case ANYOF_NHORIZWS:
14684                 case ANYOF_HORIZWS:
14685                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14686                     /* FALLTHROUGH */
14687
14688                 case ANYOF_NVERTWS:
14689                 case ANYOF_VERTWS:
14690                     op = POSIXU;
14691                     goto join_posix;
14692
14693                 /* The actual POSIXish node for all the rest depends on the
14694                  * charset modifier.  The ones in the first set depend only on
14695                  * ASCII or, if available on this platform, locale */
14696                 case ANYOF_ASCII:
14697                 case ANYOF_NASCII:
14698 #ifdef HAS_ISASCII
14699                     op = (LOC) ? POSIXL : POSIXA;
14700 #else
14701                     op = POSIXA;
14702 #endif
14703                     goto join_posix;
14704
14705                 case ANYOF_NCASED:
14706                 case ANYOF_LOWER:
14707                 case ANYOF_NLOWER:
14708                 case ANYOF_UPPER:
14709                 case ANYOF_NUPPER:
14710                     /* under /a could be alpha */
14711                     if (FOLD) {
14712                         if (ASCII_RESTRICTED) {
14713                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14714                         }
14715                         else if (! LOC) {
14716                             break;
14717                         }
14718                     }
14719                     /* FALLTHROUGH */
14720
14721                 /* The rest have more possibilities depending on the charset.
14722                  * We take advantage of the enum ordering of the charset
14723                  * modifiers to get the exact node type, */
14724                 default:
14725                     op = POSIXD + get_regex_charset(RExC_flags);
14726                     if (op > POSIXA) { /* /aa is same as /a */
14727                         op = POSIXA;
14728                     }
14729
14730                 join_posix:
14731                     /* The odd numbered ones are the complements of the
14732                      * next-lower even number one */
14733                     if (namedclass % 2 == 1) {
14734                         invert = ! invert;
14735                         namedclass--;
14736                     }
14737                     arg = namedclass_to_classnum(namedclass);
14738                     break;
14739             }
14740         }
14741         else if (value == prevvalue) {
14742
14743             /* Here, the class consists of just a single code point */
14744
14745             if (invert) {
14746                 if (! LOC && value == '\n') {
14747                     op = REG_ANY; /* Optimize [^\n] */
14748                     *flagp |= HASWIDTH|SIMPLE;
14749                     RExC_naughty++;
14750                 }
14751             }
14752             else if (value < 256 || UTF) {
14753
14754                 /* Optimize a single value into an EXACTish node, but not if it
14755                  * would require converting the pattern to UTF-8. */
14756                 op = compute_EXACTish(pRExC_state);
14757             }
14758         } /* Otherwise is a range */
14759         else if (! LOC) {   /* locale could vary these */
14760             if (prevvalue == '0') {
14761                 if (value == '9') {
14762                     arg = _CC_DIGIT;
14763                     op = POSIXA;
14764                 }
14765             }
14766             else if (prevvalue == 'A') {
14767                 if (value == 'Z'
14768 #ifdef EBCDIC
14769                     && literal_endpoint == 2
14770 #endif
14771                 ) {
14772                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14773                     op = POSIXA;
14774                 }
14775             }
14776             else if (prevvalue == 'a') {
14777                 if (value == 'z'
14778 #ifdef EBCDIC
14779                     && literal_endpoint == 2
14780 #endif
14781                 ) {
14782                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14783                     op = POSIXA;
14784                 }
14785             }
14786         }
14787
14788         /* Here, we have changed <op> away from its initial value iff we found
14789          * an optimization */
14790         if (op != END) {
14791
14792             /* Throw away this ANYOF regnode, and emit the calculated one,
14793              * which should correspond to the beginning, not current, state of
14794              * the parse */
14795             const char * cur_parse = RExC_parse;
14796             RExC_parse = (char *)orig_parse;
14797             if ( SIZE_ONLY) {
14798                 if (! LOC) {
14799
14800                     /* To get locale nodes to not use the full ANYOF size would
14801                      * require moving the code above that writes the portions
14802                      * of it that aren't in other nodes to after this point.
14803                      * e.g.  ANYOF_POSIXL_SET */
14804                     RExC_size = orig_size;
14805                 }
14806             }
14807             else {
14808                 RExC_emit = (regnode *)orig_emit;
14809                 if (PL_regkind[op] == POSIXD) {
14810                     if (op == POSIXL) {
14811                         RExC_contains_locale = 1;
14812                     }
14813                     if (invert) {
14814                         op += NPOSIXD - POSIXD;
14815                     }
14816                 }
14817             }
14818
14819             ret = reg_node(pRExC_state, op);
14820
14821             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14822                 if (! SIZE_ONLY) {
14823                     FLAGS(ret) = arg;
14824                 }
14825                 *flagp |= HASWIDTH|SIMPLE;
14826             }
14827             else if (PL_regkind[op] == EXACT) {
14828                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14829                                            TRUE /* downgradable to EXACT */
14830                                            );
14831             }
14832
14833             RExC_parse = (char *) cur_parse;
14834
14835             SvREFCNT_dec(posixes);
14836             SvREFCNT_dec(nposixes);
14837             SvREFCNT_dec(cp_list);
14838             SvREFCNT_dec(cp_foldable_list);
14839             return ret;
14840         }
14841     }
14842
14843     if (SIZE_ONLY)
14844         return ret;
14845     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14846
14847     /* If folding, we calculate all characters that could fold to or from the
14848      * ones already on the list */
14849     if (cp_foldable_list) {
14850         if (FOLD) {
14851             UV start, end;      /* End points of code point ranges */
14852
14853             SV* fold_intersection = NULL;
14854             SV** use_list;
14855
14856             /* Our calculated list will be for Unicode rules.  For locale
14857              * matching, we have to keep a separate list that is consulted at
14858              * runtime only when the locale indicates Unicode rules.  For
14859              * non-locale, we just use to the general list */
14860             if (LOC) {
14861                 use_list = &only_utf8_locale_list;
14862             }
14863             else {
14864                 use_list = &cp_list;
14865             }
14866
14867             /* Only the characters in this class that participate in folds need
14868              * be checked.  Get the intersection of this class and all the
14869              * possible characters that are foldable.  This can quickly narrow
14870              * down a large class */
14871             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14872                                   &fold_intersection);
14873
14874             /* The folds for all the Latin1 characters are hard-coded into this
14875              * program, but we have to go out to disk to get the others. */
14876             if (invlist_highest(cp_foldable_list) >= 256) {
14877
14878                 /* This is a hash that for a particular fold gives all
14879                  * characters that are involved in it */
14880                 if (! PL_utf8_foldclosures) {
14881                     _load_PL_utf8_foldclosures();
14882                 }
14883             }
14884
14885             /* Now look at the foldable characters in this class individually */
14886             invlist_iterinit(fold_intersection);
14887             while (invlist_iternext(fold_intersection, &start, &end)) {
14888                 UV j;
14889
14890                 /* Look at every character in the range */
14891                 for (j = start; j <= end; j++) {
14892                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14893                     STRLEN foldlen;
14894                     SV** listp;
14895
14896                     if (j < 256) {
14897
14898                         if (IS_IN_SOME_FOLD_L1(j)) {
14899
14900                             /* ASCII is always matched; non-ASCII is matched
14901                              * only under Unicode rules (which could happen
14902                              * under /l if the locale is a UTF-8 one */
14903                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14904                                 *use_list = add_cp_to_invlist(*use_list,
14905                                                             PL_fold_latin1[j]);
14906                             }
14907                             else {
14908                                 depends_list =
14909                                  add_cp_to_invlist(depends_list,
14910                                                    PL_fold_latin1[j]);
14911                             }
14912                         }
14913
14914                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14915                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14916                         {
14917                             add_above_Latin1_folds(pRExC_state,
14918                                                    (U8) j,
14919                                                    use_list);
14920                         }
14921                         continue;
14922                     }
14923
14924                     /* Here is an above Latin1 character.  We don't have the
14925                      * rules hard-coded for it.  First, get its fold.  This is
14926                      * the simple fold, as the multi-character folds have been
14927                      * handled earlier and separated out */
14928                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14929                                                         (ASCII_FOLD_RESTRICTED)
14930                                                         ? FOLD_FLAGS_NOMIX_ASCII
14931                                                         : 0);
14932
14933                     /* Single character fold of above Latin1.  Add everything in
14934                     * its fold closure to the list that this node should match.
14935                     * The fold closures data structure is a hash with the keys
14936                     * being the UTF-8 of every character that is folded to, like
14937                     * 'k', and the values each an array of all code points that
14938                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14939                     * Multi-character folds are not included */
14940                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14941                                         (char *) foldbuf, foldlen, FALSE)))
14942                     {
14943                         AV* list = (AV*) *listp;
14944                         IV k;
14945                         for (k = 0; k <= av_tindex(list); k++) {
14946                             SV** c_p = av_fetch(list, k, FALSE);
14947                             UV c;
14948                             assert(c_p);
14949
14950                             c = SvUV(*c_p);
14951
14952                             /* /aa doesn't allow folds between ASCII and non- */
14953                             if ((ASCII_FOLD_RESTRICTED
14954                                 && (isASCII(c) != isASCII(j))))
14955                             {
14956                                 continue;
14957                             }
14958
14959                             /* Folds under /l which cross the 255/256 boundary
14960                              * are added to a separate list.  (These are valid
14961                              * only when the locale is UTF-8.) */
14962                             if (c < 256 && LOC) {
14963                                 *use_list = add_cp_to_invlist(*use_list, c);
14964                                 continue;
14965                             }
14966
14967                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14968                             {
14969                                 cp_list = add_cp_to_invlist(cp_list, c);
14970                             }
14971                             else {
14972                                 /* Similarly folds involving non-ascii Latin1
14973                                 * characters under /d are added to their list */
14974                                 depends_list = add_cp_to_invlist(depends_list,
14975                                                                  c);
14976                             }
14977                         }
14978                     }
14979                 }
14980             }
14981             SvREFCNT_dec_NN(fold_intersection);
14982         }
14983
14984         /* Now that we have finished adding all the folds, there is no reason
14985          * to keep the foldable list separate */
14986         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14987         SvREFCNT_dec_NN(cp_foldable_list);
14988     }
14989
14990     /* And combine the result (if any) with any inversion list from posix
14991      * classes.  The lists are kept separate up to now because we don't want to
14992      * fold the classes (folding of those is automatically handled by the swash
14993      * fetching code) */
14994     if (posixes || nposixes) {
14995         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14996             /* Under /a and /aa, nothing above ASCII matches these */
14997             _invlist_intersection(posixes,
14998                                   PL_XPosix_ptrs[_CC_ASCII],
14999                                   &posixes);
15000         }
15001         if (nposixes) {
15002             if (DEPENDS_SEMANTICS) {
15003                 /* Under /d, everything in the upper half of the Latin1 range
15004                  * matches these complements */
15005                 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15006             }
15007             else if (AT_LEAST_ASCII_RESTRICTED) {
15008                 /* Under /a and /aa, everything above ASCII matches these
15009                  * complements */
15010                 _invlist_union_complement_2nd(nposixes,
15011                                               PL_XPosix_ptrs[_CC_ASCII],
15012                                               &nposixes);
15013             }
15014             if (posixes) {
15015                 _invlist_union(posixes, nposixes, &posixes);
15016                 SvREFCNT_dec_NN(nposixes);
15017             }
15018             else {
15019                 posixes = nposixes;
15020             }
15021         }
15022         if (! DEPENDS_SEMANTICS) {
15023             if (cp_list) {
15024                 _invlist_union(cp_list, posixes, &cp_list);
15025                 SvREFCNT_dec_NN(posixes);
15026             }
15027             else {
15028                 cp_list = posixes;
15029             }
15030         }
15031         else {
15032             /* Under /d, we put into a separate list the Latin1 things that
15033              * match only when the target string is utf8 */
15034             SV* nonascii_but_latin1_properties = NULL;
15035             _invlist_intersection(posixes, PL_UpperLatin1,
15036                                   &nonascii_but_latin1_properties);
15037             _invlist_subtract(posixes, nonascii_but_latin1_properties,
15038                               &posixes);
15039             if (cp_list) {
15040                 _invlist_union(cp_list, posixes, &cp_list);
15041                 SvREFCNT_dec_NN(posixes);
15042             }
15043             else {
15044                 cp_list = posixes;
15045             }
15046
15047             if (depends_list) {
15048                 _invlist_union(depends_list, nonascii_but_latin1_properties,
15049                                &depends_list);
15050                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15051             }
15052             else {
15053                 depends_list = nonascii_but_latin1_properties;
15054             }
15055         }
15056     }
15057
15058     /* And combine the result (if any) with any inversion list from properties.
15059      * The lists are kept separate up to now so that we can distinguish the two
15060      * in regards to matching above-Unicode.  A run-time warning is generated
15061      * if a Unicode property is matched against a non-Unicode code point. But,
15062      * we allow user-defined properties to match anything, without any warning,
15063      * and we also suppress the warning if there is a portion of the character
15064      * class that isn't a Unicode property, and which matches above Unicode, \W
15065      * or [\x{110000}] for example.
15066      * (Note that in this case, unlike the Posix one above, there is no
15067      * <depends_list>, because having a Unicode property forces Unicode
15068      * semantics */
15069     if (properties) {
15070         if (cp_list) {
15071
15072             /* If it matters to the final outcome, see if a non-property
15073              * component of the class matches above Unicode.  If so, the
15074              * warning gets suppressed.  This is true even if just a single
15075              * such code point is specified, as though not strictly correct if
15076              * another such code point is matched against, the fact that they
15077              * are using above-Unicode code points indicates they should know
15078              * the issues involved */
15079             if (warn_super) {
15080                 warn_super = ! (invert
15081                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15082             }
15083
15084             _invlist_union(properties, cp_list, &cp_list);
15085             SvREFCNT_dec_NN(properties);
15086         }
15087         else {
15088             cp_list = properties;
15089         }
15090
15091         if (warn_super) {
15092             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15093         }
15094     }
15095
15096     /* Here, we have calculated what code points should be in the character
15097      * class.
15098      *
15099      * Now we can see about various optimizations.  Fold calculation (which we
15100      * did above) needs to take place before inversion.  Otherwise /[^k]/i
15101      * would invert to include K, which under /i would match k, which it
15102      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
15103      * folded until runtime */
15104
15105     /* If we didn't do folding, it's because some information isn't available
15106      * until runtime; set the run-time fold flag for these.  (We don't have to
15107      * worry about properties folding, as that is taken care of by the swash
15108      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
15109      * locales, or the class matches at least one 0-255 range code point */
15110     if (LOC && FOLD) {
15111         if (only_utf8_locale_list) {
15112             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15113         }
15114         else if (cp_list) { /* Look to see if there a 0-255 code point is in
15115                                the list */
15116             UV start, end;
15117             invlist_iterinit(cp_list);
15118             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15119                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15120             }
15121             invlist_iterfinish(cp_list);
15122         }
15123     }
15124
15125     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15126      * at compile time.  Besides not inverting folded locale now, we can't
15127      * invert if there are things such as \w, which aren't known until runtime
15128      * */
15129     if (cp_list
15130         && invert
15131         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15132         && ! depends_list
15133         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15134     {
15135         _invlist_invert(cp_list);
15136
15137         /* Any swash can't be used as-is, because we've inverted things */
15138         if (swash) {
15139             SvREFCNT_dec_NN(swash);
15140             swash = NULL;
15141         }
15142
15143         /* Clear the invert flag since have just done it here */
15144         invert = FALSE;
15145     }
15146
15147     if (ret_invlist) {
15148         *ret_invlist = cp_list;
15149         SvREFCNT_dec(swash);
15150
15151         /* Discard the generated node */
15152         if (SIZE_ONLY) {
15153             RExC_size = orig_size;
15154         }
15155         else {
15156             RExC_emit = orig_emit;
15157         }
15158         return orig_emit;
15159     }
15160
15161     /* Some character classes are equivalent to other nodes.  Such nodes take
15162      * up less room and generally fewer operations to execute than ANYOF nodes.
15163      * Above, we checked for and optimized into some such equivalents for
15164      * certain common classes that are easy to test.  Getting to this point in
15165      * the code means that the class didn't get optimized there.  Since this
15166      * code is only executed in Pass 2, it is too late to save space--it has
15167      * been allocated in Pass 1, and currently isn't given back.  But turning
15168      * things into an EXACTish node can allow the optimizer to join it to any
15169      * adjacent such nodes.  And if the class is equivalent to things like /./,
15170      * expensive run-time swashes can be avoided.  Now that we have more
15171      * complete information, we can find things necessarily missed by the
15172      * earlier code.  I (khw) am not sure how much to look for here.  It would
15173      * be easy, but perhaps too slow, to check any candidates against all the
15174      * node types they could possibly match using _invlistEQ(). */
15175
15176     if (cp_list
15177         && ! invert
15178         && ! depends_list
15179         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15180         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15181
15182            /* We don't optimize if we are supposed to make sure all non-Unicode
15183             * code points raise a warning, as only ANYOF nodes have this check.
15184             * */
15185         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15186     {
15187         UV start, end;
15188         U8 op = END;  /* The optimzation node-type */
15189         const char * cur_parse= RExC_parse;
15190
15191         invlist_iterinit(cp_list);
15192         if (! invlist_iternext(cp_list, &start, &end)) {
15193
15194             /* Here, the list is empty.  This happens, for example, when a
15195              * Unicode property is the only thing in the character class, and
15196              * it doesn't match anything.  (perluniprops.pod notes such
15197              * properties) */
15198             op = OPFAIL;
15199             *flagp |= HASWIDTH|SIMPLE;
15200         }
15201         else if (start == end) {    /* The range is a single code point */
15202             if (! invlist_iternext(cp_list, &start, &end)
15203
15204                     /* Don't do this optimization if it would require changing
15205                      * the pattern to UTF-8 */
15206                 && (start < 256 || UTF))
15207             {
15208                 /* Here, the list contains a single code point.  Can optimize
15209                  * into an EXACTish node */
15210
15211                 value = start;
15212
15213                 if (! FOLD) {
15214                     op = EXACT;
15215                 }
15216                 else if (LOC) {
15217
15218                     /* A locale node under folding with one code point can be
15219                      * an EXACTFL, as its fold won't be calculated until
15220                      * runtime */
15221                     op = EXACTFL;
15222                 }
15223                 else {
15224
15225                     /* Here, we are generally folding, but there is only one
15226                      * code point to match.  If we have to, we use an EXACT
15227                      * node, but it would be better for joining with adjacent
15228                      * nodes in the optimization pass if we used the same
15229                      * EXACTFish node that any such are likely to be.  We can
15230                      * do this iff the code point doesn't participate in any
15231                      * folds.  For example, an EXACTF of a colon is the same as
15232                      * an EXACT one, since nothing folds to or from a colon. */
15233                     if (value < 256) {
15234                         if (IS_IN_SOME_FOLD_L1(value)) {
15235                             op = EXACT;
15236                         }
15237                     }
15238                     else {
15239                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15240                             op = EXACT;
15241                         }
15242                     }
15243
15244                     /* If we haven't found the node type, above, it means we
15245                      * can use the prevailing one */
15246                     if (op == END) {
15247                         op = compute_EXACTish(pRExC_state);
15248                     }
15249                 }
15250             }
15251         }
15252         else if (start == 0) {
15253             if (end == UV_MAX) {
15254                 op = SANY;
15255                 *flagp |= HASWIDTH|SIMPLE;
15256                 RExC_naughty++;
15257             }
15258             else if (end == '\n' - 1
15259                     && invlist_iternext(cp_list, &start, &end)
15260                     && start == '\n' + 1 && end == UV_MAX)
15261             {
15262                 op = REG_ANY;
15263                 *flagp |= HASWIDTH|SIMPLE;
15264                 RExC_naughty++;
15265             }
15266         }
15267         invlist_iterfinish(cp_list);
15268
15269         if (op != END) {
15270             RExC_parse = (char *)orig_parse;
15271             RExC_emit = (regnode *)orig_emit;
15272
15273             ret = reg_node(pRExC_state, op);
15274
15275             RExC_parse = (char *)cur_parse;
15276
15277             if (PL_regkind[op] == EXACT) {
15278                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15279                                            TRUE /* downgradable to EXACT */
15280                                           );
15281             }
15282
15283             SvREFCNT_dec_NN(cp_list);
15284             return ret;
15285         }
15286     }
15287
15288     /* Here, <cp_list> contains all the code points we can determine at
15289      * compile time that match under all conditions.  Go through it, and
15290      * for things that belong in the bitmap, put them there, and delete from
15291      * <cp_list>.  While we are at it, see if everything above 255 is in the
15292      * list, and if so, set a flag to speed up execution */
15293
15294     populate_ANYOF_from_invlist(ret, &cp_list);
15295
15296     if (invert) {
15297         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15298     }
15299
15300     /* Here, the bitmap has been populated with all the Latin1 code points that
15301      * always match.  Can now add to the overall list those that match only
15302      * when the target string is UTF-8 (<depends_list>). */
15303     if (depends_list) {
15304         if (cp_list) {
15305             _invlist_union(cp_list, depends_list, &cp_list);
15306             SvREFCNT_dec_NN(depends_list);
15307         }
15308         else {
15309             cp_list = depends_list;
15310         }
15311         ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15312     }
15313
15314     /* If there is a swash and more than one element, we can't use the swash in
15315      * the optimization below. */
15316     if (swash && element_count > 1) {
15317         SvREFCNT_dec_NN(swash);
15318         swash = NULL;
15319     }
15320
15321     /* Note that the optimization of using 'swash' if it is the only thing in
15322      * the class doesn't have us change swash at all, so it can include things
15323      * that are also in the bitmap; otherwise we have purposely deleted that
15324      * duplicate information */
15325     set_ANYOF_arg(pRExC_state, ret, cp_list,
15326                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15327                    ? listsv : NULL,
15328                   only_utf8_locale_list,
15329                   swash, has_user_defined_property);
15330
15331     *flagp |= HASWIDTH|SIMPLE;
15332
15333     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15334         RExC_contains_locale = 1;
15335     }
15336
15337     return ret;
15338 }
15339
15340 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15341
15342 STATIC void
15343 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15344                 regnode* const node,
15345                 SV* const cp_list,
15346                 SV* const runtime_defns,
15347                 SV* const only_utf8_locale_list,
15348                 SV* const swash,
15349                 const bool has_user_defined_property)
15350 {
15351     /* Sets the arg field of an ANYOF-type node 'node', using information about
15352      * the node passed-in.  If there is nothing outside the node's bitmap, the
15353      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
15354      * the count returned by add_data(), having allocated and stored an array,
15355      * av, that that count references, as follows:
15356      *  av[0] stores the character class description in its textual form.
15357      *        This is used later (regexec.c:Perl_regclass_swash()) to
15358      *        initialize the appropriate swash, and is also useful for dumping
15359      *        the regnode.  This is set to &PL_sv_undef if the textual
15360      *        description is not needed at run-time (as happens if the other
15361      *        elements completely define the class)
15362      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15363      *        computed from av[0].  But if no further computation need be done,
15364      *        the swash is stored here now (and av[0] is &PL_sv_undef).
15365      *  av[2] stores the inversion list of code points that match only if the
15366      *        current locale is UTF-8
15367      *  av[3] stores the cp_list inversion list for use in addition or instead
15368      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15369      *        (Otherwise everything needed is already in av[0] and av[1])
15370      *  av[4] is set if any component of the class is from a user-defined
15371      *        property; used only if av[3] exists */
15372
15373     UV n;
15374
15375     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15376
15377     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15378         assert(! (ANYOF_FLAGS(node)
15379                   & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15380                      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15381         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15382     }
15383     else {
15384         AV * const av = newAV();
15385         SV *rv;
15386
15387         assert(ANYOF_FLAGS(node)
15388                & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15389                   |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15390
15391         av_store(av, 0, (runtime_defns)
15392                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15393         if (swash) {
15394             assert(cp_list);
15395             av_store(av, 1, swash);
15396             SvREFCNT_dec_NN(cp_list);
15397         }
15398         else {
15399             av_store(av, 1, &PL_sv_undef);
15400             if (cp_list) {
15401                 av_store(av, 3, cp_list);
15402                 av_store(av, 4, newSVuv(has_user_defined_property));
15403             }
15404         }
15405
15406         if (only_utf8_locale_list) {
15407             av_store(av, 2, only_utf8_locale_list);
15408         }
15409         else {
15410             av_store(av, 2, &PL_sv_undef);
15411         }
15412
15413         rv = newRV_noinc(MUTABLE_SV(av));
15414         n = add_data(pRExC_state, STR_WITH_LEN("s"));
15415         RExC_rxi->data->data[n] = (void*)rv;
15416         ARG_SET(node, n);
15417     }
15418 }
15419
15420 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15421 SV *
15422 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15423                                         const regnode* node,
15424                                         bool doinit,
15425                                         SV** listsvp,
15426                                         SV** only_utf8_locale_ptr,
15427                                         SV*  exclude_list)
15428
15429 {
15430     /* For internal core use only.
15431      * Returns the swash for the input 'node' in the regex 'prog'.
15432      * If <doinit> is 'true', will attempt to create the swash if not already
15433      *    done.
15434      * If <listsvp> is non-null, will return the printable contents of the
15435      *    swash.  This can be used to get debugging information even before the
15436      *    swash exists, by calling this function with 'doinit' set to false, in
15437      *    which case the components that will be used to eventually create the
15438      *    swash are returned  (in a printable form).
15439      * If <exclude_list> is not NULL, it is an inversion list of things to
15440      *    exclude from what's returned in <listsvp>.
15441      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
15442      * that, in spite of this function's name, the swash it returns may include
15443      * the bitmap data as well */
15444
15445     SV *sw  = NULL;
15446     SV *si  = NULL;         /* Input swash initialization string */
15447     SV*  invlist = NULL;
15448
15449     RXi_GET_DECL(prog,progi);
15450     const struct reg_data * const data = prog ? progi->data : NULL;
15451
15452     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15453
15454     assert(ANYOF_FLAGS(node)
15455         & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15456            |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15457
15458     if (data && data->count) {
15459         const U32 n = ARG(node);
15460
15461         if (data->what[n] == 's') {
15462             SV * const rv = MUTABLE_SV(data->data[n]);
15463             AV * const av = MUTABLE_AV(SvRV(rv));
15464             SV **const ary = AvARRAY(av);
15465             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15466
15467             si = *ary;  /* ary[0] = the string to initialize the swash with */
15468
15469             /* Elements 3 and 4 are either both present or both absent. [3] is
15470              * any inversion list generated at compile time; [4] indicates if
15471              * that inversion list has any user-defined properties in it. */
15472             if (av_tindex(av) >= 2) {
15473                 if (only_utf8_locale_ptr
15474                     && ary[2]
15475                     && ary[2] != &PL_sv_undef)
15476                 {
15477                     *only_utf8_locale_ptr = ary[2];
15478                 }
15479                 else {
15480                     assert(only_utf8_locale_ptr);
15481                     *only_utf8_locale_ptr = NULL;
15482                 }
15483
15484                 if (av_tindex(av) >= 3) {
15485                     invlist = ary[3];
15486                     if (SvUV(ary[4])) {
15487                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15488                     }
15489                 }
15490                 else {
15491                     invlist = NULL;
15492                 }
15493             }
15494
15495             /* Element [1] is reserved for the set-up swash.  If already there,
15496              * return it; if not, create it and store it there */
15497             if (ary[1] && SvROK(ary[1])) {
15498                 sw = ary[1];
15499             }
15500             else if (doinit && ((si && si != &PL_sv_undef)
15501                                  || (invlist && invlist != &PL_sv_undef))) {
15502                 assert(si);
15503                 sw = _core_swash_init("utf8", /* the utf8 package */
15504                                       "", /* nameless */
15505                                       si,
15506                                       1, /* binary */
15507                                       0, /* not from tr/// */
15508                                       invlist,
15509                                       &swash_init_flags);
15510                 (void)av_store(av, 1, sw);
15511             }
15512         }
15513     }
15514
15515     /* If requested, return a printable version of what this swash matches */
15516     if (listsvp) {
15517         SV* matches_string = newSVpvs("");
15518
15519         /* The swash should be used, if possible, to get the data, as it
15520          * contains the resolved data.  But this function can be called at
15521          * compile-time, before everything gets resolved, in which case we
15522          * return the currently best available information, which is the string
15523          * that will eventually be used to do that resolving, 'si' */
15524         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15525             && (si && si != &PL_sv_undef))
15526         {
15527             sv_catsv(matches_string, si);
15528         }
15529
15530         /* Add the inversion list to whatever we have.  This may have come from
15531          * the swash, or from an input parameter */
15532         if (invlist) {
15533             if (exclude_list) {
15534                 SV* clone = invlist_clone(invlist);
15535                 _invlist_subtract(clone, exclude_list, &clone);
15536                 sv_catsv(matches_string, _invlist_contents(clone));
15537                 SvREFCNT_dec_NN(clone);
15538             }
15539             else {
15540                 sv_catsv(matches_string, _invlist_contents(invlist));
15541             }
15542         }
15543         *listsvp = matches_string;
15544     }
15545
15546     return sw;
15547 }
15548 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15549
15550 /* reg_skipcomment()
15551
15552    Absorbs an /x style # comment from the input stream,
15553    returning a pointer to the first character beyond the comment, or if the
15554    comment terminates the pattern without anything following it, this returns
15555    one past the final character of the pattern (in other words, RExC_end) and
15556    sets the REG_RUN_ON_COMMENT_SEEN flag.
15557
15558    Note it's the callers responsibility to ensure that we are
15559    actually in /x mode
15560
15561 */
15562
15563 PERL_STATIC_INLINE char*
15564 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15565 {
15566     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15567
15568     assert(*p == '#');
15569
15570     while (p < RExC_end) {
15571         if (*(++p) == '\n') {
15572             return p+1;
15573         }
15574     }
15575
15576     /* we ran off the end of the pattern without ending the comment, so we have
15577      * to add an \n when wrapping */
15578     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15579     return p;
15580 }
15581
15582 /* nextchar()
15583
15584    Advances the parse position, and optionally absorbs
15585    "whitespace" from the inputstream.
15586
15587    Without /x "whitespace" means (?#...) style comments only,
15588    with /x this means (?#...) and # comments and whitespace proper.
15589
15590    Returns the RExC_parse point from BEFORE the scan occurs.
15591
15592    This is the /x friendly way of saying RExC_parse++.
15593 */
15594
15595 STATIC char*
15596 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15597 {
15598     char* const retval = RExC_parse++;
15599
15600     PERL_ARGS_ASSERT_NEXTCHAR;
15601
15602     for (;;) {
15603         if (RExC_end - RExC_parse >= 3
15604             && *RExC_parse == '('
15605             && RExC_parse[1] == '?'
15606             && RExC_parse[2] == '#')
15607         {
15608             while (*RExC_parse != ')') {
15609                 if (RExC_parse == RExC_end)
15610                     FAIL("Sequence (?#... not terminated");
15611                 RExC_parse++;
15612             }
15613             RExC_parse++;
15614             continue;
15615         }
15616         if (RExC_flags & RXf_PMf_EXTENDED) {
15617             char * p = regpatws(pRExC_state, RExC_parse,
15618                                           TRUE); /* means recognize comments */
15619             if (p != RExC_parse) {
15620                 RExC_parse = p;
15621                 continue;
15622             }
15623         }
15624         return retval;
15625     }
15626 }
15627
15628 STATIC regnode *
15629 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15630 {
15631     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15632      * space.  In pass1, it aligns and increments RExC_size; in pass2,
15633      * RExC_emit */
15634
15635     regnode * const ret = RExC_emit;
15636     GET_RE_DEBUG_FLAGS_DECL;
15637
15638     PERL_ARGS_ASSERT_REGNODE_GUTS;
15639
15640     assert(extra_size >= regarglen[op]);
15641
15642     if (SIZE_ONLY) {
15643         SIZE_ALIGN(RExC_size);
15644         RExC_size += 1 + extra_size;
15645         return(ret);
15646     }
15647     if (RExC_emit >= RExC_emit_bound)
15648         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15649                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15650
15651     NODE_ALIGN_FILL(ret);
15652 #ifndef RE_TRACK_PATTERN_OFFSETS
15653     PERL_UNUSED_ARG(name);
15654 #else
15655     if (RExC_offsets) {         /* MJD */
15656         MJD_OFFSET_DEBUG(
15657               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15658               name, __LINE__,
15659               PL_reg_name[op],
15660               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15661                 ? "Overwriting end of array!\n" : "OK",
15662               (UV)(RExC_emit - RExC_emit_start),
15663               (UV)(RExC_parse - RExC_start),
15664               (UV)RExC_offsets[0]));
15665         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15666     }
15667 #endif
15668     return(ret);
15669 }
15670
15671 /*
15672 - reg_node - emit a node
15673 */
15674 STATIC regnode *                        /* Location. */
15675 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15676 {
15677     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15678
15679     PERL_ARGS_ASSERT_REG_NODE;
15680
15681     assert(regarglen[op] == 0);
15682
15683     if (PASS2) {
15684         regnode *ptr = ret;
15685         FILL_ADVANCE_NODE(ptr, op);
15686         RExC_emit = ptr;
15687     }
15688     return(ret);
15689 }
15690
15691 /*
15692 - reganode - emit a node with an argument
15693 */
15694 STATIC regnode *                        /* Location. */
15695 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15696 {
15697     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15698
15699     PERL_ARGS_ASSERT_REGANODE;
15700
15701     assert(regarglen[op] == 1);
15702
15703     if (PASS2) {
15704         regnode *ptr = ret;
15705         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15706         RExC_emit = ptr;
15707     }
15708     return(ret);
15709 }
15710
15711 STATIC regnode *
15712 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15713 {
15714     /* emit a node with U32 and I32 arguments */
15715
15716     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15717
15718     PERL_ARGS_ASSERT_REG2LANODE;
15719
15720     assert(regarglen[op] == 2);
15721
15722     if (PASS2) {
15723         regnode *ptr = ret;
15724         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15725         RExC_emit = ptr;
15726     }
15727     return(ret);
15728 }
15729
15730 /*
15731 - reguni - emit (if appropriate) a Unicode character
15732 */
15733 PERL_STATIC_INLINE STRLEN
15734 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15735 {
15736     PERL_ARGS_ASSERT_REGUNI;
15737
15738     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15739 }
15740
15741 /*
15742 - reginsert - insert an operator in front of already-emitted operand
15743 *
15744 * Means relocating the operand.
15745 */
15746 STATIC void
15747 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15748 {
15749     regnode *src;
15750     regnode *dst;
15751     regnode *place;
15752     const int offset = regarglen[(U8)op];
15753     const int size = NODE_STEP_REGNODE + offset;
15754     GET_RE_DEBUG_FLAGS_DECL;
15755
15756     PERL_ARGS_ASSERT_REGINSERT;
15757     PERL_UNUSED_CONTEXT;
15758     PERL_UNUSED_ARG(depth);
15759 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15760     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15761     if (SIZE_ONLY) {
15762         RExC_size += size;
15763         return;
15764     }
15765
15766     src = RExC_emit;
15767     RExC_emit += size;
15768     dst = RExC_emit;
15769     if (RExC_open_parens) {
15770         int paren;
15771         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15772         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15773             if ( RExC_open_parens[paren] >= opnd ) {
15774                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15775                 RExC_open_parens[paren] += size;
15776             } else {
15777                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15778             }
15779             if ( RExC_close_parens[paren] >= opnd ) {
15780                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15781                 RExC_close_parens[paren] += size;
15782             } else {
15783                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15784             }
15785         }
15786     }
15787
15788     while (src > opnd) {
15789         StructCopy(--src, --dst, regnode);
15790 #ifdef RE_TRACK_PATTERN_OFFSETS
15791         if (RExC_offsets) {     /* MJD 20010112 */
15792             MJD_OFFSET_DEBUG(
15793                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15794                   "reg_insert",
15795                   __LINE__,
15796                   PL_reg_name[op],
15797                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15798                     ? "Overwriting end of array!\n" : "OK",
15799                   (UV)(src - RExC_emit_start),
15800                   (UV)(dst - RExC_emit_start),
15801                   (UV)RExC_offsets[0]));
15802             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15803             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15804         }
15805 #endif
15806     }
15807
15808
15809     place = opnd;               /* Op node, where operand used to be. */
15810 #ifdef RE_TRACK_PATTERN_OFFSETS
15811     if (RExC_offsets) {         /* MJD */
15812         MJD_OFFSET_DEBUG(
15813               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15814               "reginsert",
15815               __LINE__,
15816               PL_reg_name[op],
15817               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15818               ? "Overwriting end of array!\n" : "OK",
15819               (UV)(place - RExC_emit_start),
15820               (UV)(RExC_parse - RExC_start),
15821               (UV)RExC_offsets[0]));
15822         Set_Node_Offset(place, RExC_parse);
15823         Set_Node_Length(place, 1);
15824     }
15825 #endif
15826     src = NEXTOPER(place);
15827     FILL_ADVANCE_NODE(place, op);
15828     Zero(src, offset, regnode);
15829 }
15830
15831 /*
15832 - regtail - set the next-pointer at the end of a node chain of p to val.
15833 - SEE ALSO: regtail_study
15834 */
15835 /* TODO: All three parms should be const */
15836 STATIC void
15837 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15838                 const regnode *val,U32 depth)
15839 {
15840     regnode *scan;
15841     GET_RE_DEBUG_FLAGS_DECL;
15842
15843     PERL_ARGS_ASSERT_REGTAIL;
15844 #ifndef DEBUGGING
15845     PERL_UNUSED_ARG(depth);
15846 #endif
15847
15848     if (SIZE_ONLY)
15849         return;
15850
15851     /* Find last node. */
15852     scan = p;
15853     for (;;) {
15854         regnode * const temp = regnext(scan);
15855         DEBUG_PARSE_r({
15856             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15857             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15858             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15859                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15860                     (temp == NULL ? "->" : ""),
15861                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15862             );
15863         });
15864         if (temp == NULL)
15865             break;
15866         scan = temp;
15867     }
15868
15869     if (reg_off_by_arg[OP(scan)]) {
15870         ARG_SET(scan, val - scan);
15871     }
15872     else {
15873         NEXT_OFF(scan) = val - scan;
15874     }
15875 }
15876
15877 #ifdef DEBUGGING
15878 /*
15879 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15880 - Look for optimizable sequences at the same time.
15881 - currently only looks for EXACT chains.
15882
15883 This is experimental code. The idea is to use this routine to perform
15884 in place optimizations on branches and groups as they are constructed,
15885 with the long term intention of removing optimization from study_chunk so
15886 that it is purely analytical.
15887
15888 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15889 to control which is which.
15890
15891 */
15892 /* TODO: All four parms should be const */
15893
15894 STATIC U8
15895 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15896                       const regnode *val,U32 depth)
15897 {
15898     regnode *scan;
15899     U8 exact = PSEUDO;
15900 #ifdef EXPERIMENTAL_INPLACESCAN
15901     I32 min = 0;
15902 #endif
15903     GET_RE_DEBUG_FLAGS_DECL;
15904
15905     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15906
15907
15908     if (SIZE_ONLY)
15909         return exact;
15910
15911     /* Find last node. */
15912
15913     scan = p;
15914     for (;;) {
15915         regnode * const temp = regnext(scan);
15916 #ifdef EXPERIMENTAL_INPLACESCAN
15917         if (PL_regkind[OP(scan)] == EXACT) {
15918             bool unfolded_multi_char;   /* Unexamined in this routine */
15919             if (join_exact(pRExC_state, scan, &min,
15920                            &unfolded_multi_char, 1, val, depth+1))
15921                 return EXACT;
15922         }
15923 #endif
15924         if ( exact ) {
15925             switch (OP(scan)) {
15926                 case EXACT:
15927                 case EXACTF:
15928                 case EXACTFA_NO_TRIE:
15929                 case EXACTFA:
15930                 case EXACTFU:
15931                 case EXACTFU_SS:
15932                 case EXACTFL:
15933                         if( exact == PSEUDO )
15934                             exact= OP(scan);
15935                         else if ( exact != OP(scan) )
15936                             exact= 0;
15937                 case NOTHING:
15938                     break;
15939                 default:
15940                     exact= 0;
15941             }
15942         }
15943         DEBUG_PARSE_r({
15944             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15945             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15946             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15947                 SvPV_nolen_const(RExC_mysv),
15948                 REG_NODE_NUM(scan),
15949                 PL_reg_name[exact]);
15950         });
15951         if (temp == NULL)
15952             break;
15953         scan = temp;
15954     }
15955     DEBUG_PARSE_r({
15956         DEBUG_PARSE_MSG("");
15957         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
15958         PerlIO_printf(Perl_debug_log,
15959                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15960                       SvPV_nolen_const(RExC_mysv),
15961                       (IV)REG_NODE_NUM(val),
15962                       (IV)(val - scan)
15963         );
15964     });
15965     if (reg_off_by_arg[OP(scan)]) {
15966         ARG_SET(scan, val - scan);
15967     }
15968     else {
15969         NEXT_OFF(scan) = val - scan;
15970     }
15971
15972     return exact;
15973 }
15974 #endif
15975
15976 /*
15977  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15978  */
15979 #ifdef DEBUGGING
15980
15981 static void
15982 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15983 {
15984     int bit;
15985     int set=0;
15986
15987     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15988
15989     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15990         if (flags & (1<<bit)) {
15991             if (!set++ && lead)
15992                 PerlIO_printf(Perl_debug_log, "%s",lead);
15993             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15994         }
15995     }
15996     if (lead)  {
15997         if (set)
15998             PerlIO_printf(Perl_debug_log, "\n");
15999         else
16000             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16001     }
16002 }
16003
16004 static void
16005 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16006 {
16007     int bit;
16008     int set=0;
16009     regex_charset cs;
16010
16011     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16012
16013     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16014         if (flags & (1<<bit)) {
16015             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
16016                 continue;
16017             }
16018             if (!set++ && lead)
16019                 PerlIO_printf(Perl_debug_log, "%s",lead);
16020             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16021         }
16022     }
16023     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16024             if (!set++ && lead) {
16025                 PerlIO_printf(Perl_debug_log, "%s",lead);
16026             }
16027             switch (cs) {
16028                 case REGEX_UNICODE_CHARSET:
16029                     PerlIO_printf(Perl_debug_log, "UNICODE");
16030                     break;
16031                 case REGEX_LOCALE_CHARSET:
16032                     PerlIO_printf(Perl_debug_log, "LOCALE");
16033                     break;
16034                 case REGEX_ASCII_RESTRICTED_CHARSET:
16035                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16036                     break;
16037                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16038                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16039                     break;
16040                 default:
16041                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16042                     break;
16043             }
16044     }
16045     if (lead)  {
16046         if (set)
16047             PerlIO_printf(Perl_debug_log, "\n");
16048         else
16049             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16050     }
16051 }
16052 #endif
16053
16054 void
16055 Perl_regdump(pTHX_ const regexp *r)
16056 {
16057 #ifdef DEBUGGING
16058     SV * const sv = sv_newmortal();
16059     SV *dsv= sv_newmortal();
16060     RXi_GET_DECL(r,ri);
16061     GET_RE_DEBUG_FLAGS_DECL;
16062
16063     PERL_ARGS_ASSERT_REGDUMP;
16064
16065     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16066
16067     /* Header fields of interest. */
16068     if (r->anchored_substr) {
16069         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16070             RE_SV_DUMPLEN(r->anchored_substr), 30);
16071         PerlIO_printf(Perl_debug_log,
16072                       "anchored %s%s at %"IVdf" ",
16073                       s, RE_SV_TAIL(r->anchored_substr),
16074                       (IV)r->anchored_offset);
16075     } else if (r->anchored_utf8) {
16076         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16077             RE_SV_DUMPLEN(r->anchored_utf8), 30);
16078         PerlIO_printf(Perl_debug_log,
16079                       "anchored utf8 %s%s at %"IVdf" ",
16080                       s, RE_SV_TAIL(r->anchored_utf8),
16081                       (IV)r->anchored_offset);
16082     }
16083     if (r->float_substr) {
16084         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16085             RE_SV_DUMPLEN(r->float_substr), 30);
16086         PerlIO_printf(Perl_debug_log,
16087                       "floating %s%s at %"IVdf"..%"UVuf" ",
16088                       s, RE_SV_TAIL(r->float_substr),
16089                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16090     } else if (r->float_utf8) {
16091         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16092             RE_SV_DUMPLEN(r->float_utf8), 30);
16093         PerlIO_printf(Perl_debug_log,
16094                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16095                       s, RE_SV_TAIL(r->float_utf8),
16096                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16097     }
16098     if (r->check_substr || r->check_utf8)
16099         PerlIO_printf(Perl_debug_log,
16100                       (const char *)
16101                       (r->check_substr == r->float_substr
16102                        && r->check_utf8 == r->float_utf8
16103                        ? "(checking floating" : "(checking anchored"));
16104     if (r->intflags & PREGf_NOSCAN)
16105         PerlIO_printf(Perl_debug_log, " noscan");
16106     if (r->extflags & RXf_CHECK_ALL)
16107         PerlIO_printf(Perl_debug_log, " isall");
16108     if (r->check_substr || r->check_utf8)
16109         PerlIO_printf(Perl_debug_log, ") ");
16110
16111     if (ri->regstclass) {
16112         regprop(r, sv, ri->regstclass, NULL, NULL);
16113         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16114     }
16115     if (r->intflags & PREGf_ANCH) {
16116         PerlIO_printf(Perl_debug_log, "anchored");
16117         if (r->intflags & PREGf_ANCH_MBOL)
16118             PerlIO_printf(Perl_debug_log, "(MBOL)");
16119         if (r->intflags & PREGf_ANCH_SBOL)
16120             PerlIO_printf(Perl_debug_log, "(SBOL)");
16121         if (r->intflags & PREGf_ANCH_GPOS)
16122             PerlIO_printf(Perl_debug_log, "(GPOS)");
16123         PerlIO_putc(Perl_debug_log, ' ');
16124     }
16125     if (r->intflags & PREGf_GPOS_SEEN)
16126         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16127     if (r->intflags & PREGf_SKIP)
16128         PerlIO_printf(Perl_debug_log, "plus ");
16129     if (r->intflags & PREGf_IMPLICIT)
16130         PerlIO_printf(Perl_debug_log, "implicit ");
16131     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16132     if (r->extflags & RXf_EVAL_SEEN)
16133         PerlIO_printf(Perl_debug_log, "with eval ");
16134     PerlIO_printf(Perl_debug_log, "\n");
16135     DEBUG_FLAGS_r({
16136         regdump_extflags("r->extflags: ",r->extflags);
16137         regdump_intflags("r->intflags: ",r->intflags);
16138     });
16139 #else
16140     PERL_ARGS_ASSERT_REGDUMP;
16141     PERL_UNUSED_CONTEXT;
16142     PERL_UNUSED_ARG(r);
16143 #endif  /* DEBUGGING */
16144 }
16145
16146 /*
16147 - regprop - printable representation of opcode, with run time support
16148 */
16149
16150 void
16151 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16152 {
16153 #ifdef DEBUGGING
16154     int k;
16155
16156     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16157     static const char * const anyofs[] = {
16158 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16159     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
16160     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
16161     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
16162     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
16163     || _CC_VERTSPACE != 16
16164   #error Need to adjust order of anyofs[]
16165 #endif
16166         "\\w",
16167         "\\W",
16168         "\\d",
16169         "\\D",
16170         "[:alpha:]",
16171         "[:^alpha:]",
16172         "[:lower:]",
16173         "[:^lower:]",
16174         "[:upper:]",
16175         "[:^upper:]",
16176         "[:punct:]",
16177         "[:^punct:]",
16178         "[:print:]",
16179         "[:^print:]",
16180         "[:alnum:]",
16181         "[:^alnum:]",
16182         "[:graph:]",
16183         "[:^graph:]",
16184         "[:cased:]",
16185         "[:^cased:]",
16186         "\\s",
16187         "\\S",
16188         "[:blank:]",
16189         "[:^blank:]",
16190         "[:xdigit:]",
16191         "[:^xdigit:]",
16192         "[:space:]",
16193         "[:^space:]",
16194         "[:cntrl:]",
16195         "[:^cntrl:]",
16196         "[:ascii:]",
16197         "[:^ascii:]",
16198         "\\v",
16199         "\\V"
16200     };
16201     RXi_GET_DECL(prog,progi);
16202     GET_RE_DEBUG_FLAGS_DECL;
16203
16204     PERL_ARGS_ASSERT_REGPROP;
16205
16206     sv_setpvn(sv, "", 0);
16207
16208     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
16209         /* It would be nice to FAIL() here, but this may be called from
16210            regexec.c, and it would be hard to supply pRExC_state. */
16211         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16212                                               (int)OP(o), (int)REGNODE_MAX);
16213     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16214
16215     k = PL_regkind[OP(o)];
16216
16217     if (k == EXACT) {
16218         sv_catpvs(sv, " ");
16219         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16220          * is a crude hack but it may be the best for now since
16221          * we have no flag "this EXACTish node was UTF-8"
16222          * --jhi */
16223         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16224                   PERL_PV_ESCAPE_UNI_DETECT |
16225                   PERL_PV_ESCAPE_NONASCII   |
16226                   PERL_PV_PRETTY_ELLIPSES   |
16227                   PERL_PV_PRETTY_LTGT       |
16228                   PERL_PV_PRETTY_NOCLEAR
16229                   );
16230     } else if (k == TRIE) {
16231         /* print the details of the trie in dumpuntil instead, as
16232          * progi->data isn't available here */
16233         const char op = OP(o);
16234         const U32 n = ARG(o);
16235         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16236                (reg_ac_data *)progi->data->data[n] :
16237                NULL;
16238         const reg_trie_data * const trie
16239             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16240
16241         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16242         DEBUG_TRIE_COMPILE_r(
16243           Perl_sv_catpvf(aTHX_ sv,
16244             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16245             (UV)trie->startstate,
16246             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16247             (UV)trie->wordcount,
16248             (UV)trie->minlen,
16249             (UV)trie->maxlen,
16250             (UV)TRIE_CHARCOUNT(trie),
16251             (UV)trie->uniquecharcount
16252           );
16253         );
16254         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16255             sv_catpvs(sv, "[");
16256             (void) put_charclass_bitmap_innards(sv,
16257                                                 (IS_ANYOF_TRIE(op))
16258                                                  ? ANYOF_BITMAP(o)
16259                                                  : TRIE_BITMAP(trie),
16260                                                 NULL);
16261             sv_catpvs(sv, "]");
16262         }
16263
16264     } else if (k == CURLY) {
16265         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16266             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16267         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16268     }
16269     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
16270         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16271     else if (k == REF || k == OPEN || k == CLOSE
16272              || k == GROUPP || OP(o)==ACCEPT)
16273     {
16274         AV *name_list= NULL;
16275         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
16276         if ( RXp_PAREN_NAMES(prog) ) {
16277             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16278         } else if ( pRExC_state ) {
16279             name_list= RExC_paren_name_list;
16280         }
16281         if (name_list) {
16282             if ( k != REF || (OP(o) < NREF)) {
16283                 SV **name= av_fetch(name_list, ARG(o), 0 );
16284                 if (name)
16285                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16286             }
16287             else {
16288                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16289                 I32 *nums=(I32*)SvPVX(sv_dat);
16290                 SV **name= av_fetch(name_list, nums[0], 0 );
16291                 I32 n;
16292                 if (name) {
16293                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
16294                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16295                                     (n ? "," : ""), (IV)nums[n]);
16296                     }
16297                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16298                 }
16299             }
16300         }
16301         if ( k == REF && reginfo) {
16302             U32 n = ARG(o);  /* which paren pair */
16303             I32 ln = prog->offs[n].start;
16304             if (prog->lastparen < n || ln == -1)
16305                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16306             else if (ln == prog->offs[n].end)
16307                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16308             else {
16309                 const char *s = reginfo->strbeg + ln;
16310                 Perl_sv_catpvf(aTHX_ sv, ": ");
16311                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16312                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16313             }
16314         }
16315     } else if (k == GOSUB) {
16316         AV *name_list= NULL;
16317         if ( RXp_PAREN_NAMES(prog) ) {
16318             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16319         } else if ( pRExC_state ) {
16320             name_list= RExC_paren_name_list;
16321         }
16322
16323         /* Paren and offset */
16324         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16325         if (name_list) {
16326             SV **name= av_fetch(name_list, ARG(o), 0 );
16327             if (name)
16328                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16329         }
16330     }
16331     else if (k == VERB) {
16332         if (!o->flags)
16333             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16334                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16335     } else if (k == LOGICAL)
16336         /* 2: embedded, otherwise 1 */
16337         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16338     else if (k == ANYOF) {
16339         const U8 flags = ANYOF_FLAGS(o);
16340         int do_sep = 0;
16341         SV* bitmap_invlist;  /* Will hold what the bit map contains */
16342
16343
16344         if (flags & ANYOF_LOCALE_FLAGS)
16345             sv_catpvs(sv, "{loc}");
16346         if (flags & ANYOF_LOC_FOLD)
16347             sv_catpvs(sv, "{i}");
16348         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16349         if (flags & ANYOF_INVERT)
16350             sv_catpvs(sv, "^");
16351
16352         /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16353          * */
16354         do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16355                                                             &bitmap_invlist);
16356
16357         /* output any special charclass tests (used entirely under use
16358          * locale) * */
16359         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16360             int i;
16361             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16362                 if (ANYOF_POSIXL_TEST(o,i)) {
16363                     sv_catpv(sv, anyofs[i]);
16364                     do_sep = 1;
16365                 }
16366             }
16367         }
16368
16369         if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16370                       |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16371                       |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16372                       |ANYOF_LOC_FOLD)))
16373         {
16374             if (do_sep) {
16375                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16376                 if (flags & ANYOF_INVERT)
16377                     /*make sure the invert info is in each */
16378                     sv_catpvs(sv, "^");
16379             }
16380
16381             if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16382                 sv_catpvs(sv, "{non-utf8-latin1-all}");
16383             }
16384
16385             /* output information about the unicode matching */
16386             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16387                 sv_catpvs(sv, "{above_bitmap_all}");
16388             else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16389                 SV *lv; /* Set if there is something outside the bit map. */
16390                 bool byte_output = FALSE;   /* If something in the bitmap has
16391                                                been output */
16392                 SV *only_utf8_locale;
16393
16394                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
16395                  * is used to guarantee that nothing in the bitmap gets
16396                  * returned */
16397                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16398                                                     &lv, &only_utf8_locale,
16399                                                     bitmap_invlist);
16400                 if (lv && lv != &PL_sv_undef) {
16401                     char *s = savesvpv(lv);
16402                     char * const origs = s;
16403
16404                     while (*s && *s != '\n')
16405                         s++;
16406
16407                     if (*s == '\n') {
16408                         const char * const t = ++s;
16409
16410                         if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16411                             sv_catpvs(sv, "{outside bitmap}");
16412                         }
16413                         else {
16414                             sv_catpvs(sv, "{utf8}");
16415                         }
16416
16417                         if (byte_output) {
16418                             sv_catpvs(sv, " ");
16419                         }
16420
16421                         while (*s) {
16422                             if (*s == '\n') {
16423
16424                                 /* Truncate very long output */
16425                                 if (s - origs > 256) {
16426                                     Perl_sv_catpvf(aTHX_ sv,
16427                                                 "%.*s...",
16428                                                 (int) (s - origs - 1),
16429                                                 t);
16430                                     goto out_dump;
16431                                 }
16432                                 *s = ' ';
16433                             }
16434                             else if (*s == '\t') {
16435                                 *s = '-';
16436                             }
16437                             s++;
16438                         }
16439                         if (s[-1] == ' ')
16440                             s[-1] = 0;
16441
16442                         sv_catpv(sv, t);
16443                     }
16444
16445                 out_dump:
16446
16447                     Safefree(origs);
16448                     SvREFCNT_dec_NN(lv);
16449                 }
16450
16451                 if ((flags & ANYOF_LOC_FOLD)
16452                      && only_utf8_locale
16453                      && only_utf8_locale != &PL_sv_undef)
16454                 {
16455                     UV start, end;
16456                     int max_entries = 256;
16457
16458                     sv_catpvs(sv, "{utf8 locale}");
16459                     invlist_iterinit(only_utf8_locale);
16460                     while (invlist_iternext(only_utf8_locale,
16461                                             &start, &end)) {
16462                         put_range(sv, start, end, FALSE);
16463                         max_entries --;
16464                         if (max_entries < 0) {
16465                             sv_catpvs(sv, "...");
16466                             break;
16467                         }
16468                     }
16469                     invlist_iterfinish(only_utf8_locale);
16470                 }
16471             }
16472         }
16473         SvREFCNT_dec(bitmap_invlist);
16474
16475
16476         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16477     }
16478     else if (k == POSIXD || k == NPOSIXD) {
16479         U8 index = FLAGS(o) * 2;
16480         if (index < C_ARRAY_LENGTH(anyofs)) {
16481             if (*anyofs[index] != '[')  {
16482                 sv_catpv(sv, "[");
16483             }
16484             sv_catpv(sv, anyofs[index]);
16485             if (*anyofs[index] != '[')  {
16486                 sv_catpv(sv, "]");
16487             }
16488         }
16489         else {
16490             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16491         }
16492     }
16493     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16494         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16495     else if (OP(o) == SBOL)
16496         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16497 #else
16498     PERL_UNUSED_CONTEXT;
16499     PERL_UNUSED_ARG(sv);
16500     PERL_UNUSED_ARG(o);
16501     PERL_UNUSED_ARG(prog);
16502     PERL_UNUSED_ARG(reginfo);
16503 #endif  /* DEBUGGING */
16504 }
16505
16506
16507
16508 SV *
16509 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16510 {                               /* Assume that RE_INTUIT is set */
16511     struct regexp *const prog = ReANY(r);
16512     GET_RE_DEBUG_FLAGS_DECL;
16513
16514     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16515     PERL_UNUSED_CONTEXT;
16516
16517     DEBUG_COMPILE_r(
16518         {
16519             const char * const s = SvPV_nolen_const(prog->check_substr
16520                       ? prog->check_substr : prog->check_utf8);
16521
16522             if (!PL_colorset) reginitcolors();
16523             PerlIO_printf(Perl_debug_log,
16524                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16525                       PL_colors[4],
16526                       prog->check_substr ? "" : "utf8 ",
16527                       PL_colors[5],PL_colors[0],
16528                       s,
16529                       PL_colors[1],
16530                       (strlen(s) > 60 ? "..." : ""));
16531         } );
16532
16533     return prog->check_substr ? prog->check_substr : prog->check_utf8;
16534 }
16535
16536 /*
16537    pregfree()
16538
16539    handles refcounting and freeing the perl core regexp structure. When
16540    it is necessary to actually free the structure the first thing it
16541    does is call the 'free' method of the regexp_engine associated to
16542    the regexp, allowing the handling of the void *pprivate; member
16543    first. (This routine is not overridable by extensions, which is why
16544    the extensions free is called first.)
16545
16546    See regdupe and regdupe_internal if you change anything here.
16547 */
16548 #ifndef PERL_IN_XSUB_RE
16549 void
16550 Perl_pregfree(pTHX_ REGEXP *r)
16551 {
16552     SvREFCNT_dec(r);
16553 }
16554
16555 void
16556 Perl_pregfree2(pTHX_ REGEXP *rx)
16557 {
16558     struct regexp *const r = ReANY(rx);
16559     GET_RE_DEBUG_FLAGS_DECL;
16560
16561     PERL_ARGS_ASSERT_PREGFREE2;
16562
16563     if (r->mother_re) {
16564         ReREFCNT_dec(r->mother_re);
16565     } else {
16566         CALLREGFREE_PVT(rx); /* free the private data */
16567         SvREFCNT_dec(RXp_PAREN_NAMES(r));
16568         Safefree(r->xpv_len_u.xpvlenu_pv);
16569     }
16570     if (r->substrs) {
16571         SvREFCNT_dec(r->anchored_substr);
16572         SvREFCNT_dec(r->anchored_utf8);
16573         SvREFCNT_dec(r->float_substr);
16574         SvREFCNT_dec(r->float_utf8);
16575         Safefree(r->substrs);
16576     }
16577     RX_MATCH_COPY_FREE(rx);
16578 #ifdef PERL_ANY_COW
16579     SvREFCNT_dec(r->saved_copy);
16580 #endif
16581     Safefree(r->offs);
16582     SvREFCNT_dec(r->qr_anoncv);
16583     rx->sv_u.svu_rx = 0;
16584 }
16585
16586 /*  reg_temp_copy()
16587
16588     This is a hacky workaround to the structural issue of match results
16589     being stored in the regexp structure which is in turn stored in
16590     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16591     could be PL_curpm in multiple contexts, and could require multiple
16592     result sets being associated with the pattern simultaneously, such
16593     as when doing a recursive match with (??{$qr})
16594
16595     The solution is to make a lightweight copy of the regexp structure
16596     when a qr// is returned from the code executed by (??{$qr}) this
16597     lightweight copy doesn't actually own any of its data except for
16598     the starp/end and the actual regexp structure itself.
16599
16600 */
16601
16602
16603 REGEXP *
16604 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16605 {
16606     struct regexp *ret;
16607     struct regexp *const r = ReANY(rx);
16608     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16609
16610     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16611
16612     if (!ret_x)
16613         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16614     else {
16615         SvOK_off((SV *)ret_x);
16616         if (islv) {
16617             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16618                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16619                made both spots point to the same regexp body.) */
16620             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16621             assert(!SvPVX(ret_x));
16622             ret_x->sv_u.svu_rx = temp->sv_any;
16623             temp->sv_any = NULL;
16624             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16625             SvREFCNT_dec_NN(temp);
16626             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16627                ing below will not set it. */
16628             SvCUR_set(ret_x, SvCUR(rx));
16629         }
16630     }
16631     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16632        sv_force_normal(sv) is called.  */
16633     SvFAKE_on(ret_x);
16634     ret = ReANY(ret_x);
16635
16636     SvFLAGS(ret_x) |= SvUTF8(rx);
16637     /* We share the same string buffer as the original regexp, on which we
16638        hold a reference count, incremented when mother_re is set below.
16639        The string pointer is copied here, being part of the regexp struct.
16640      */
16641     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16642            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16643     if (r->offs) {
16644         const I32 npar = r->nparens+1;
16645         Newx(ret->offs, npar, regexp_paren_pair);
16646         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16647     }
16648     if (r->substrs) {
16649         Newx(ret->substrs, 1, struct reg_substr_data);
16650         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16651
16652         SvREFCNT_inc_void(ret->anchored_substr);
16653         SvREFCNT_inc_void(ret->anchored_utf8);
16654         SvREFCNT_inc_void(ret->float_substr);
16655         SvREFCNT_inc_void(ret->float_utf8);
16656
16657         /* check_substr and check_utf8, if non-NULL, point to either their
16658            anchored or float namesakes, and don't hold a second reference.  */
16659     }
16660     RX_MATCH_COPIED_off(ret_x);
16661 #ifdef PERL_ANY_COW
16662     ret->saved_copy = NULL;
16663 #endif
16664     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16665     SvREFCNT_inc_void(ret->qr_anoncv);
16666
16667     return ret_x;
16668 }
16669 #endif
16670
16671 /* regfree_internal()
16672
16673    Free the private data in a regexp. This is overloadable by
16674    extensions. Perl takes care of the regexp structure in pregfree(),
16675    this covers the *pprivate pointer which technically perl doesn't
16676    know about, however of course we have to handle the
16677    regexp_internal structure when no extension is in use.
16678
16679    Note this is called before freeing anything in the regexp
16680    structure.
16681  */
16682
16683 void
16684 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16685 {
16686     struct regexp *const r = ReANY(rx);
16687     RXi_GET_DECL(r,ri);
16688     GET_RE_DEBUG_FLAGS_DECL;
16689
16690     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16691
16692     DEBUG_COMPILE_r({
16693         if (!PL_colorset)
16694             reginitcolors();
16695         {
16696             SV *dsv= sv_newmortal();
16697             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16698                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16699             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16700                 PL_colors[4],PL_colors[5],s);
16701         }
16702     });
16703 #ifdef RE_TRACK_PATTERN_OFFSETS
16704     if (ri->u.offsets)
16705         Safefree(ri->u.offsets);             /* 20010421 MJD */
16706 #endif
16707     if (ri->code_blocks) {
16708         int n;
16709         for (n = 0; n < ri->num_code_blocks; n++)
16710             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16711         Safefree(ri->code_blocks);
16712     }
16713
16714     if (ri->data) {
16715         int n = ri->data->count;
16716
16717         while (--n >= 0) {
16718           /* If you add a ->what type here, update the comment in regcomp.h */
16719             switch (ri->data->what[n]) {
16720             case 'a':
16721             case 'r':
16722             case 's':
16723             case 'S':
16724             case 'u':
16725                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16726                 break;
16727             case 'f':
16728                 Safefree(ri->data->data[n]);
16729                 break;
16730             case 'l':
16731             case 'L':
16732                 break;
16733             case 'T':
16734                 { /* Aho Corasick add-on structure for a trie node.
16735                      Used in stclass optimization only */
16736                     U32 refcount;
16737                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16738 #ifdef USE_ITHREADS
16739                     dVAR;
16740 #endif
16741                     OP_REFCNT_LOCK;
16742                     refcount = --aho->refcount;
16743                     OP_REFCNT_UNLOCK;
16744                     if ( !refcount ) {
16745                         PerlMemShared_free(aho->states);
16746                         PerlMemShared_free(aho->fail);
16747                          /* do this last!!!! */
16748                         PerlMemShared_free(ri->data->data[n]);
16749                         /* we should only ever get called once, so
16750                          * assert as much, and also guard the free
16751                          * which /might/ happen twice. At the least
16752                          * it will make code anlyzers happy and it
16753                          * doesn't cost much. - Yves */
16754                         assert(ri->regstclass);
16755                         if (ri->regstclass) {
16756                             PerlMemShared_free(ri->regstclass);
16757                             ri->regstclass = 0;
16758                         }
16759                     }
16760                 }
16761                 break;
16762             case 't':
16763                 {
16764                     /* trie structure. */
16765                     U32 refcount;
16766                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16767 #ifdef USE_ITHREADS
16768                     dVAR;
16769 #endif
16770                     OP_REFCNT_LOCK;
16771                     refcount = --trie->refcount;
16772                     OP_REFCNT_UNLOCK;
16773                     if ( !refcount ) {
16774                         PerlMemShared_free(trie->charmap);
16775                         PerlMemShared_free(trie->states);
16776                         PerlMemShared_free(trie->trans);
16777                         if (trie->bitmap)
16778                             PerlMemShared_free(trie->bitmap);
16779                         if (trie->jump)
16780                             PerlMemShared_free(trie->jump);
16781                         PerlMemShared_free(trie->wordinfo);
16782                         /* do this last!!!! */
16783                         PerlMemShared_free(ri->data->data[n]);
16784                     }
16785                 }
16786                 break;
16787             default:
16788                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16789                                                     ri->data->what[n]);
16790             }
16791         }
16792         Safefree(ri->data->what);
16793         Safefree(ri->data);
16794     }
16795
16796     Safefree(ri);
16797 }
16798
16799 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16800 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16801 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16802
16803 /*
16804    re_dup - duplicate a regexp.
16805
16806    This routine is expected to clone a given regexp structure. It is only
16807    compiled under USE_ITHREADS.
16808
16809    After all of the core data stored in struct regexp is duplicated
16810    the regexp_engine.dupe method is used to copy any private data
16811    stored in the *pprivate pointer. This allows extensions to handle
16812    any duplication it needs to do.
16813
16814    See pregfree() and regfree_internal() if you change anything here.
16815 */
16816 #if defined(USE_ITHREADS)
16817 #ifndef PERL_IN_XSUB_RE
16818 void
16819 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16820 {
16821     dVAR;
16822     I32 npar;
16823     const struct regexp *r = ReANY(sstr);
16824     struct regexp *ret = ReANY(dstr);
16825
16826     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16827
16828     npar = r->nparens+1;
16829     Newx(ret->offs, npar, regexp_paren_pair);
16830     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16831
16832     if (ret->substrs) {
16833         /* Do it this way to avoid reading from *r after the StructCopy().
16834            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16835            cache, it doesn't matter.  */
16836         const bool anchored = r->check_substr
16837             ? r->check_substr == r->anchored_substr
16838             : r->check_utf8 == r->anchored_utf8;
16839         Newx(ret->substrs, 1, struct reg_substr_data);
16840         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16841
16842         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16843         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16844         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16845         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16846
16847         /* check_substr and check_utf8, if non-NULL, point to either their
16848            anchored or float namesakes, and don't hold a second reference.  */
16849
16850         if (ret->check_substr) {
16851             if (anchored) {
16852                 assert(r->check_utf8 == r->anchored_utf8);
16853                 ret->check_substr = ret->anchored_substr;
16854                 ret->check_utf8 = ret->anchored_utf8;
16855             } else {
16856                 assert(r->check_substr == r->float_substr);
16857                 assert(r->check_utf8 == r->float_utf8);
16858                 ret->check_substr = ret->float_substr;
16859                 ret->check_utf8 = ret->float_utf8;
16860             }
16861         } else if (ret->check_utf8) {
16862             if (anchored) {
16863                 ret->check_utf8 = ret->anchored_utf8;
16864             } else {
16865                 ret->check_utf8 = ret->float_utf8;
16866             }
16867         }
16868     }
16869
16870     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16871     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16872
16873     if (ret->pprivate)
16874         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16875
16876     if (RX_MATCH_COPIED(dstr))
16877         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16878     else
16879         ret->subbeg = NULL;
16880 #ifdef PERL_ANY_COW
16881     ret->saved_copy = NULL;
16882 #endif
16883
16884     /* Whether mother_re be set or no, we need to copy the string.  We
16885        cannot refrain from copying it when the storage points directly to
16886        our mother regexp, because that's
16887                1: a buffer in a different thread
16888                2: something we no longer hold a reference on
16889                so we need to copy it locally.  */
16890     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16891     ret->mother_re   = NULL;
16892 }
16893 #endif /* PERL_IN_XSUB_RE */
16894
16895 /*
16896    regdupe_internal()
16897
16898    This is the internal complement to regdupe() which is used to copy
16899    the structure pointed to by the *pprivate pointer in the regexp.
16900    This is the core version of the extension overridable cloning hook.
16901    The regexp structure being duplicated will be copied by perl prior
16902    to this and will be provided as the regexp *r argument, however
16903    with the /old/ structures pprivate pointer value. Thus this routine
16904    may override any copying normally done by perl.
16905
16906    It returns a pointer to the new regexp_internal structure.
16907 */
16908
16909 void *
16910 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16911 {
16912     dVAR;
16913     struct regexp *const r = ReANY(rx);
16914     regexp_internal *reti;
16915     int len;
16916     RXi_GET_DECL(r,ri);
16917
16918     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16919
16920     len = ProgLen(ri);
16921
16922     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16923           char, regexp_internal);
16924     Copy(ri->program, reti->program, len+1, regnode);
16925
16926     reti->num_code_blocks = ri->num_code_blocks;
16927     if (ri->code_blocks) {
16928         int n;
16929         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16930                 struct reg_code_block);
16931         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16932                 struct reg_code_block);
16933         for (n = 0; n < ri->num_code_blocks; n++)
16934              reti->code_blocks[n].src_regex = (REGEXP*)
16935                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16936     }
16937     else
16938         reti->code_blocks = NULL;
16939
16940     reti->regstclass = NULL;
16941
16942     if (ri->data) {
16943         struct reg_data *d;
16944         const int count = ri->data->count;
16945         int i;
16946
16947         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16948                 char, struct reg_data);
16949         Newx(d->what, count, U8);
16950
16951         d->count = count;
16952         for (i = 0; i < count; i++) {
16953             d->what[i] = ri->data->what[i];
16954             switch (d->what[i]) {
16955                 /* see also regcomp.h and regfree_internal() */
16956             case 'a': /* actually an AV, but the dup function is identical.  */
16957             case 'r':
16958             case 's':
16959             case 'S':
16960             case 'u': /* actually an HV, but the dup function is identical.  */
16961                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16962                 break;
16963             case 'f':
16964                 /* This is cheating. */
16965                 Newx(d->data[i], 1, regnode_ssc);
16966                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16967                 reti->regstclass = (regnode*)d->data[i];
16968                 break;
16969             case 'T':
16970                 /* Trie stclasses are readonly and can thus be shared
16971                  * without duplication. We free the stclass in pregfree
16972                  * when the corresponding reg_ac_data struct is freed.
16973                  */
16974                 reti->regstclass= ri->regstclass;
16975                 /* FALLTHROUGH */
16976             case 't':
16977                 OP_REFCNT_LOCK;
16978                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16979                 OP_REFCNT_UNLOCK;
16980                 /* FALLTHROUGH */
16981             case 'l':
16982             case 'L':
16983                 d->data[i] = ri->data->data[i];
16984                 break;
16985             default:
16986                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16987                                                            ri->data->what[i]);
16988             }
16989         }
16990
16991         reti->data = d;
16992     }
16993     else
16994         reti->data = NULL;
16995
16996     reti->name_list_idx = ri->name_list_idx;
16997
16998 #ifdef RE_TRACK_PATTERN_OFFSETS
16999     if (ri->u.offsets) {
17000         Newx(reti->u.offsets, 2*len+1, U32);
17001         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17002     }
17003 #else
17004     SetProgLen(reti,len);
17005 #endif
17006
17007     return (void*)reti;
17008 }
17009
17010 #endif    /* USE_ITHREADS */
17011
17012 #ifndef PERL_IN_XSUB_RE
17013
17014 /*
17015  - regnext - dig the "next" pointer out of a node
17016  */
17017 regnode *
17018 Perl_regnext(pTHX_ regnode *p)
17019 {
17020     I32 offset;
17021
17022     if (!p)
17023         return(NULL);
17024
17025     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
17026         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17027                                                 (int)OP(p), (int)REGNODE_MAX);
17028     }
17029
17030     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17031     if (offset == 0)
17032         return(NULL);
17033
17034     return(p+offset);
17035 }
17036 #endif
17037
17038 STATIC void
17039 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17040 {
17041     va_list args;
17042     STRLEN l1 = strlen(pat1);
17043     STRLEN l2 = strlen(pat2);
17044     char buf[512];
17045     SV *msv;
17046     const char *message;
17047
17048     PERL_ARGS_ASSERT_RE_CROAK2;
17049
17050     if (l1 > 510)
17051         l1 = 510;
17052     if (l1 + l2 > 510)
17053         l2 = 510 - l1;
17054     Copy(pat1, buf, l1 , char);
17055     Copy(pat2, buf + l1, l2 , char);
17056     buf[l1 + l2] = '\n';
17057     buf[l1 + l2 + 1] = '\0';
17058     va_start(args, pat2);
17059     msv = vmess(buf, &args);
17060     va_end(args);
17061     message = SvPV_const(msv,l1);
17062     if (l1 > 512)
17063         l1 = 512;
17064     Copy(message, buf, l1 , char);
17065     /* l1-1 to avoid \n */
17066     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17067 }
17068
17069 #ifdef DEBUGGING
17070 /* Certain characters are output as a sequence with the first being a
17071  * backslash. */
17072 #define isBACKSLASHED_PUNCT(c)                                              \
17073                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17074
17075 STATIC void
17076 S_put_code_point(pTHX_ SV *sv, UV c)
17077 {
17078     PERL_ARGS_ASSERT_PUT_CODE_POINT;
17079
17080     if (c > 255) {
17081         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17082     }
17083     else if (isPRINT(c)) {
17084         const char string = (char) c;
17085         if (isBACKSLASHED_PUNCT(c))
17086             sv_catpvs(sv, "\\");
17087         sv_catpvn(sv, &string, 1);
17088     }
17089     else {
17090         const char * const mnemonic = cntrl_to_mnemonic((char) c);
17091         if (mnemonic) {
17092             Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17093         }
17094         else {
17095             Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17096         }
17097     }
17098 }
17099
17100 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17101
17102 STATIC void
17103 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17104 {
17105     /* Appends to 'sv' a displayable version of the range of code points from
17106      * 'start' to 'end'.  It assumes that only ASCII printables are displayable
17107      * as-is (though some of these will be escaped by put_code_point()). */
17108
17109     const unsigned int min_range_count = 3;
17110
17111     assert(start <= end);
17112
17113     PERL_ARGS_ASSERT_PUT_RANGE;
17114
17115     while (start <= end) {
17116         UV this_end;
17117         const char * format;
17118
17119         if (end - start < min_range_count) {
17120
17121             /* Individual chars in short ranges */
17122             for (; start <= end; start++) {
17123                 put_code_point(sv, start);
17124             }
17125             break;
17126         }
17127
17128         /* If permitted by the input options, and there is a possibility that
17129          * this range contains a printable literal, look to see if there is
17130          * one.  */
17131         if (allow_literals && start <= MAX_PRINT_A) {
17132
17133             /* If the range begin isn't an ASCII printable, effectively split
17134              * the range into two parts:
17135              *  1) the portion before the first such printable,
17136              *  2) the rest
17137              * and output them separately. */
17138             if (! isPRINT_A(start)) {
17139                 UV temp_end = start + 1;
17140
17141                 /* There is no point looking beyond the final possible
17142                  * printable, in MAX_PRINT_A */
17143                 UV max = MIN(end, MAX_PRINT_A);
17144
17145                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17146                     temp_end++;
17147                 }
17148
17149                 /* Here, temp_end points to one beyond the first printable if
17150                  * found, or to one beyond 'max' if not.  If none found, make
17151                  * sure that we use the entire range */
17152                 if (temp_end > MAX_PRINT_A) {
17153                     temp_end = end + 1;
17154                 }
17155
17156                 /* Output the first part of the split range, the part that
17157                  * doesn't have printables, with no looking for literals
17158                  * (otherwise we would infinitely recurse) */
17159                 put_range(sv, start, temp_end - 1, FALSE);
17160
17161                 /* The 2nd part of the range (if any) starts here. */
17162                 start = temp_end;
17163
17164                 /* We continue instead of dropping down because even if the 2nd
17165                  * part is non-empty, it could be so short that we want to
17166                  * output it specially, as tested for at the top of this loop.
17167                  * */
17168                 continue;
17169             }
17170
17171             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
17172              * output a sub-range of just the digits or letters, then process
17173              * the remaining portion as usual. */
17174             if (isALPHANUMERIC_A(start)) {
17175                 UV mask = (isDIGIT_A(start))
17176                            ? _CC_DIGIT
17177                              : isUPPER_A(start)
17178                                ? _CC_UPPER
17179                                : _CC_LOWER;
17180                 UV temp_end = start + 1;
17181
17182                 /* Find the end of the sub-range that includes just the
17183                  * characters in the same class as the first character in it */
17184                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17185                     temp_end++;
17186                 }
17187                 temp_end--;
17188
17189                 /* For short ranges, don't duplicate the code above to output
17190                  * them; just call recursively */
17191                 if (temp_end - start < min_range_count) {
17192                     put_range(sv, start, temp_end, FALSE);
17193                 }
17194                 else {  /* Output as a range */
17195                     put_code_point(sv, start);
17196                     sv_catpvs(sv, "-");
17197                     put_code_point(sv, temp_end);
17198                 }
17199                 start = temp_end + 1;
17200                 continue;
17201             }
17202
17203             /* We output any other printables as individual characters */
17204             if (isPUNCT_A(start) || isSPACE_A(start)) {
17205                 while (start <= end && (isPUNCT_A(start)
17206                                         || isSPACE_A(start)))
17207                 {
17208                     put_code_point(sv, start);
17209                     start++;
17210                 }
17211                 continue;
17212             }
17213         } /* End of looking for literals */
17214
17215         /* Here is not to output as a literal.  Some control characters have
17216          * mnemonic names.  Split off any of those at the beginning and end of
17217          * the range to print mnemonically.  It isn't possible for many of
17218          * these to be in a row, so this won't overwhelm with output */
17219         while (isMNEMONIC_CNTRL(start) && start <= end) {
17220             put_code_point(sv, start);
17221             start++;
17222         }
17223         if (start < end && isMNEMONIC_CNTRL(end)) {
17224
17225             /* Here, the final character in the range has a mnemonic name.
17226              * Work backwards from the end to find the final non-mnemonic */
17227             UV temp_end = end - 1;
17228             while (isMNEMONIC_CNTRL(temp_end)) {
17229                 temp_end--;
17230             }
17231
17232             /* And separately output the range that doesn't have mnemonics */
17233             put_range(sv, start, temp_end, FALSE);
17234
17235             /* Then output the mnemonic trailing controls */
17236             start = temp_end + 1;
17237             while (start <= end) {
17238                 put_code_point(sv, start);
17239                 start++;
17240             }
17241             break;
17242         }
17243
17244         /* As a final resort, output the range or subrange as hex. */
17245
17246         this_end = (end < NUM_ANYOF_CODE_POINTS)
17247                     ? end
17248                     : NUM_ANYOF_CODE_POINTS - 1;
17249         format = (this_end < 256)
17250                  ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17251                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17252         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17253         break;
17254     }
17255 }
17256
17257 STATIC bool
17258 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17259 {
17260     /* Appends to 'sv' a displayable version of the innards of the bracketed
17261      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
17262      * output anything, and bitmap_invlist, if not NULL, will point to an
17263      * inversion list of what is in the bit map */
17264
17265     int i;
17266     UV start, end;
17267     unsigned int punct_count = 0;
17268     SV* invlist = NULL;
17269     SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
17270     bool allow_literals = TRUE;
17271
17272     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17273
17274     invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17275
17276     /* Worst case is exactly every-other code point is in the list */
17277     *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17278
17279     /* Convert the bit map to an inversion list, keeping track of how many
17280      * ASCII puncts are set, including an extra amount for the backslashed
17281      * ones.  */
17282     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17283         if (BITMAP_TEST(bitmap, i)) {
17284             *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17285             if (isPUNCT_A(i)) {
17286                 punct_count++;
17287                 if isBACKSLASHED_PUNCT(i) {
17288                     punct_count++;
17289                 }
17290             }
17291         }
17292     }
17293
17294     /* Nothing to output */
17295     if (_invlist_len(*invlist_ptr) == 0) {
17296         SvREFCNT_dec(invlist);
17297         return FALSE;
17298     }
17299
17300     /* Generally, it is more readable if printable characters are output as
17301      * literals, but if a range (nearly) spans all of them, it's best to output
17302      * it as a single range.  This code will use a single range if all but 2
17303      * printables are in it */
17304     invlist_iterinit(*invlist_ptr);
17305     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17306
17307         /* If range starts beyond final printable, it doesn't have any in it */
17308         if (start > MAX_PRINT_A) {
17309             break;
17310         }
17311
17312         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
17313          * all but two, the range must start and end no later than 2 from
17314          * either end */
17315         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17316             if (end > MAX_PRINT_A) {
17317                 end = MAX_PRINT_A;
17318             }
17319             if (start < ' ') {
17320                 start = ' ';
17321             }
17322             if (end - start >= MAX_PRINT_A - ' ' - 2) {
17323                 allow_literals = FALSE;
17324             }
17325             break;
17326         }
17327     }
17328     invlist_iterfinish(*invlist_ptr);
17329
17330     /* The legibility of the output depends mostly on how many punctuation
17331      * characters are output.  There are 32 possible ASCII ones, and some have
17332      * an additional backslash, bringing it to currently 36, so if any more
17333      * than 18 are to be output, we can instead output it as its complement,
17334      * yielding fewer puncts, and making it more legible.  But give some weight
17335      * to the fact that outputting it as a complement is less legible than a
17336      * straight output, so don't complement unless we are somewhat over the 18
17337      * mark */
17338     if (allow_literals && punct_count > 22) {
17339         sv_catpvs(sv, "^");
17340
17341         /* Add everything remaining to the list, so when we invert it just
17342          * below, it will be excluded */
17343         _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17344         _invlist_invert(*invlist_ptr);
17345     }
17346
17347     /* Here we have figured things out.  Output each range */
17348     invlist_iterinit(*invlist_ptr);
17349     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17350         if (start >= NUM_ANYOF_CODE_POINTS) {
17351             break;
17352         }
17353         put_range(sv, start, end, allow_literals);
17354     }
17355     invlist_iterfinish(*invlist_ptr);
17356
17357     return TRUE;
17358 }
17359
17360 #define CLEAR_OPTSTART \
17361     if (optstart) STMT_START {                                               \
17362         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
17363                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17364         optstart=NULL;                                                       \
17365     } STMT_END
17366
17367 #define DUMPUNTIL(b,e)                                                       \
17368                     CLEAR_OPTSTART;                                          \
17369                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17370
17371 STATIC const regnode *
17372 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17373             const regnode *last, const regnode *plast,
17374             SV* sv, I32 indent, U32 depth)
17375 {
17376     U8 op = PSEUDO;     /* Arbitrary non-END op. */
17377     const regnode *next;
17378     const regnode *optstart= NULL;
17379
17380     RXi_GET_DECL(r,ri);
17381     GET_RE_DEBUG_FLAGS_DECL;
17382
17383     PERL_ARGS_ASSERT_DUMPUNTIL;
17384
17385 #ifdef DEBUG_DUMPUNTIL
17386     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17387         last ? last-start : 0,plast ? plast-start : 0);
17388 #endif
17389
17390     if (plast && plast < last)
17391         last= plast;
17392
17393     while (PL_regkind[op] != END && (!last || node < last)) {
17394         assert(node);
17395         /* While that wasn't END last time... */
17396         NODE_ALIGN(node);
17397         op = OP(node);
17398         if (op == CLOSE || op == WHILEM)
17399             indent--;
17400         next = regnext((regnode *)node);
17401
17402         /* Where, what. */
17403         if (OP(node) == OPTIMIZED) {
17404             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17405                 optstart = node;
17406             else
17407                 goto after_print;
17408         } else
17409             CLEAR_OPTSTART;
17410
17411         regprop(r, sv, node, NULL, NULL);
17412         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17413                       (int)(2*indent + 1), "", SvPVX_const(sv));
17414
17415         if (OP(node) != OPTIMIZED) {
17416             if (next == NULL)           /* Next ptr. */
17417                 PerlIO_printf(Perl_debug_log, " (0)");
17418             else if (PL_regkind[(U8)op] == BRANCH
17419                      && PL_regkind[OP(next)] != BRANCH )
17420                 PerlIO_printf(Perl_debug_log, " (FAIL)");
17421             else
17422                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17423             (void)PerlIO_putc(Perl_debug_log, '\n');
17424         }
17425
17426       after_print:
17427         if (PL_regkind[(U8)op] == BRANCHJ) {
17428             assert(next);
17429             {
17430                 const regnode *nnode = (OP(next) == LONGJMP
17431                                        ? regnext((regnode *)next)
17432                                        : next);
17433                 if (last && nnode > last)
17434                     nnode = last;
17435                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17436             }
17437         }
17438         else if (PL_regkind[(U8)op] == BRANCH) {
17439             assert(next);
17440             DUMPUNTIL(NEXTOPER(node), next);
17441         }
17442         else if ( PL_regkind[(U8)op]  == TRIE ) {
17443             const regnode *this_trie = node;
17444             const char op = OP(node);
17445             const U32 n = ARG(node);
17446             const reg_ac_data * const ac = op>=AHOCORASICK ?
17447                (reg_ac_data *)ri->data->data[n] :
17448                NULL;
17449             const reg_trie_data * const trie =
17450                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17451 #ifdef DEBUGGING
17452             AV *const trie_words
17453                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17454 #endif
17455             const regnode *nextbranch= NULL;
17456             I32 word_idx;
17457             sv_setpvs(sv, "");
17458             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17459                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17460
17461                 PerlIO_printf(Perl_debug_log, "%*s%s ",
17462                    (int)(2*(indent+3)), "",
17463                     elem_ptr
17464                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17465                                 SvCUR(*elem_ptr), 60,
17466                                 PL_colors[0], PL_colors[1],
17467                                 (SvUTF8(*elem_ptr)
17468                                  ? PERL_PV_ESCAPE_UNI
17469                                  : 0)
17470                                 | PERL_PV_PRETTY_ELLIPSES
17471                                 | PERL_PV_PRETTY_LTGT
17472                             )
17473                     : "???"
17474                 );
17475                 if (trie->jump) {
17476                     U16 dist= trie->jump[word_idx+1];
17477                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17478                                (UV)((dist ? this_trie + dist : next) - start));
17479                     if (dist) {
17480                         if (!nextbranch)
17481                             nextbranch= this_trie + trie->jump[0];
17482                         DUMPUNTIL(this_trie + dist, nextbranch);
17483                     }
17484                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17485                         nextbranch= regnext((regnode *)nextbranch);
17486                 } else {
17487                     PerlIO_printf(Perl_debug_log, "\n");
17488                 }
17489             }
17490             if (last && next > last)
17491                 node= last;
17492             else
17493                 node= next;
17494         }
17495         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
17496             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17497                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17498         }
17499         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17500             assert(next);
17501             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17502         }
17503         else if ( op == PLUS || op == STAR) {
17504             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17505         }
17506         else if (PL_regkind[(U8)op] == ANYOF) {
17507             /* arglen 1 + class block */
17508             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17509                           ? ANYOF_POSIXL_SKIP
17510                           : ANYOF_SKIP);
17511             node = NEXTOPER(node);
17512         }
17513         else if (PL_regkind[(U8)op] == EXACT) {
17514             /* Literal string, where present. */
17515             node += NODE_SZ_STR(node) - 1;
17516             node = NEXTOPER(node);
17517         }
17518         else {
17519             node = NEXTOPER(node);
17520             node += regarglen[(U8)op];
17521         }
17522         if (op == CURLYX || op == OPEN)
17523             indent++;
17524     }
17525     CLEAR_OPTSTART;
17526 #ifdef DEBUG_DUMPUNTIL
17527     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17528 #endif
17529     return node;
17530 }
17531
17532 #endif  /* DEBUGGING */
17533
17534 /*
17535  * Local variables:
17536  * c-indentation-style: bsd
17537  * c-basic-offset: 4
17538  * indent-tabs-mode: nil
17539  * End:
17540  *
17541  * ex: set ts=8 sts=4 sw=4 et:
17542  */