regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 #ifndef MIN
 106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 107 #endif
 108
 109 /* this is a chain of data about sub patterns we are processing that
 110    need to be handled separately/specially in study_chunk. Its so
 111    we can simulate recursion without losing state.  */
 112 struct scan_frame;
 113 typedef struct scan_frame {
 114     regnode *last_regnode;      /* last node to process in this frame */
 115     regnode *next_regnode;      /* next node to process when last is reached */
 116     U32 prev_recursed_depth;
 117     I32 stopparen;              /* what stopparen do we use */
 118     U32 is_top_frame;           /* what flags do we use? */
 119
 120     struct scan_frame *this_prev_frame; /* this previous frame */
 121     struct scan_frame *prev_frame;      /* previous frame */
 122     struct scan_frame *next_frame;      /* next frame */
 123 } scan_frame;
 124
 125 struct RExC_state_t {
 126     U32         flags;                  /* RXf_* are we folding, multilining? */
 127     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 128     char        *precomp;               /* uncompiled string. */
 129     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 130     regexp      *rx;                    /* perl core regexp structure */
 131     regexp_internal     *rxi;           /* internal data for regexp object
 132                                            pprivate field */
 133     char        *start;                 /* Start of input for compile */
 134     char        *end;                   /* End of input for compile */
 135     char        *parse;                 /* Input-scan pointer. */
 136     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 137     regnode     *emit_start;            /* Start of emitted-code area */
 138     regnode     *emit_bound;            /* First regnode outside of the
 139                                            allocated space */
 140     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 141                                            implies compiling, so don't emit */
 142     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 143                                            large enough for the largest
 144                                            non-EXACTish node, so can use it as
 145                                            scratch in pass1 */
 146     I32         naughty;                /* How bad is this pattern? */
 147     I32         sawback;                /* Did we see \1, ...? */
 148     U32         seen;
 149     SSize_t     size;                   /* Code size. */
 150     I32                npar;            /* Capture buffer count, (OPEN) plus
 151                                            one. ("par" 0 is the whole
 152                                            pattern)*/
 153     I32         nestroot;               /* root parens we are in - used by
 154                                            accept */
 155     I32         extralen;
 156     I32         seen_zerolen;
 157     regnode     **open_parens;          /* pointers to open parens */
 158     regnode     **close_parens;         /* pointers to close parens */
 159     regnode     *opend;                 /* END node in program */
 160     I32         utf8;           /* whether the pattern is utf8 or not */
 161     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 162                                 /* XXX use this for future optimisation of case
 163                                  * where pattern must be upgraded to utf8. */
 164     I32         uni_semantics;  /* If a d charset modifier should use unicode
 165                                    rules, even if the pattern is not in
 166                                    utf8 */
 167     HV          *paren_names;           /* Paren names */
 168
 169     regnode     **recurse;              /* Recurse regops */
 170     I32         recurse_count;          /* Number of recurse regops */
 171     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 172                                            through */
 173     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 174     I32         in_lookbehind;
 175     I32         contains_locale;
 176     I32         contains_i;
 177     I32         override_recoding;
 178     I32         in_multi_char_class;
 179     struct reg_code_block *code_blocks; /* positions of literal (?{})
 180                                             within pattern */
 181     int         num_code_blocks;        /* size of code_blocks[] */
 182     int         code_index;             /* next code_blocks[] slot */
 183     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 184     scan_frame *frame_head;
 185     scan_frame *frame_last;
 186     U32         frame_count;
 187 #ifdef ADD_TO_REGEXEC
 188     char        *starttry;              /* -Dr: where regtry was called. */
 189 #define RExC_starttry   (pRExC_state->starttry)
 190 #endif
 191     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 192 #ifdef DEBUGGING
 193     const char  *lastparse;
 194     I32         lastnum;
 195     AV          *paren_name_list;       /* idx -> name */
 196     U32         study_chunk_recursed_count;
 197     SV          *mysv1;
 198     SV          *mysv2;
 199 #define RExC_lastparse  (pRExC_state->lastparse)
 200 #define RExC_lastnum    (pRExC_state->lastnum)
 201 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 202 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 203 #define RExC_mysv       (pRExC_state->mysv1)
 204 #define RExC_mysv1      (pRExC_state->mysv1)
 205 #define RExC_mysv2      (pRExC_state->mysv2)
 206
 207 #endif
 208 };
 209
 210 #define RExC_flags      (pRExC_state->flags)
 211 #define RExC_pm_flags   (pRExC_state->pm_flags)
 212 #define RExC_precomp    (pRExC_state->precomp)
 213 #define RExC_rx_sv      (pRExC_state->rx_sv)
 214 #define RExC_rx         (pRExC_state->rx)
 215 #define RExC_rxi        (pRExC_state->rxi)
 216 #define RExC_start      (pRExC_state->start)
 217 #define RExC_end        (pRExC_state->end)
 218 #define RExC_parse      (pRExC_state->parse)
 219 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 220 #ifdef RE_TRACK_PATTERN_OFFSETS
 221 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 222                                                          others */
 223 #endif
 224 #define RExC_emit       (pRExC_state->emit)
 225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 226 #define RExC_emit_start (pRExC_state->emit_start)
 227 #define RExC_emit_bound (pRExC_state->emit_bound)
 228 #define RExC_naughty    (pRExC_state->naughty)
 229 #define RExC_sawback    (pRExC_state->sawback)
 230 #define RExC_seen       (pRExC_state->seen)
 231 #define RExC_size       (pRExC_state->size)
 232 #define RExC_maxlen        (pRExC_state->maxlen)
 233 #define RExC_npar       (pRExC_state->npar)
 234 #define RExC_nestroot   (pRExC_state->nestroot)
 235 #define RExC_extralen   (pRExC_state->extralen)
 236 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 237 #define RExC_utf8       (pRExC_state->utf8)
 238 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 239 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 240 #define RExC_open_parens        (pRExC_state->open_parens)
 241 #define RExC_close_parens       (pRExC_state->close_parens)
 242 #define RExC_opend      (pRExC_state->opend)
 243 #define RExC_paren_names        (pRExC_state->paren_names)
 244 #define RExC_recurse    (pRExC_state->recurse)
 245 #define RExC_recurse_count      (pRExC_state->recurse_count)
 246 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 247 #define RExC_study_chunk_recursed_bytes  \
 248                                    (pRExC_state->study_chunk_recursed_bytes)
 249 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 250 #define RExC_contains_locale    (pRExC_state->contains_locale)
 251 #define RExC_contains_i (pRExC_state->contains_i)
 252 #define RExC_override_recoding (pRExC_state->override_recoding)
 253 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 254 #define RExC_frame_head (pRExC_state->frame_head)
 255 #define RExC_frame_last (pRExC_state->frame_last)
 256 #define RExC_frame_count (pRExC_state->frame_count)
 257
 258
 259 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 260 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 261         ((*s) == '{' && regcurly(s)))
 262
 263 /*
 264  * Flags to be passed up and down.
 265  */
 266 #define WORST           0       /* Worst case. */
 267 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 268
 269 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 270  * character.  (There needs to be a case: in the switch statement in regexec.c
 271  * for any node marked SIMPLE.)  Note that this is not the same thing as
 272  * REGNODE_SIMPLE */
 273 #define SIMPLE          0x02
 274 #define SPSTART         0x04    /* Starts with * or + */
 275 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 276 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 277 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 278
 279 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 280
 281 /* whether trie related optimizations are enabled */
 282 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 283 #define TRIE_STUDY_OPT
 284 #define FULL_TRIE_STUDY
 285 #define TRIE_STCLASS
 286 #endif
 287
 288
 289
 290 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 291 #define PBITVAL(paren) (1 << ((paren) & 7))
 292 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 293 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 294 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 295
 296 #define REQUIRE_UTF8    STMT_START {                                       \
 297                                      if (!UTF) {                           \
 298                                          *flagp = RESTART_UTF8;            \
 299                                          return NULL;                      \
 300                                      }                                     \
 301                         } STMT_END
 302
 303 /* This converts the named class defined in regcomp.h to its equivalent class
 304  * number defined in handy.h. */
 305 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 306 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 307
 308 #define _invlist_union_complement_2nd(a, b, output) \
 309                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 310 #define _invlist_intersection_complement_2nd(a, b, output) \
 311                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 312
 313 /* About scan_data_t.
 314
 315   During optimisation we recurse through the regexp program performing
 316   various inplace (keyhole style) optimisations. In addition study_chunk
 317   and scan_commit populate this data structure with information about
 318   what strings MUST appear in the pattern. We look for the longest
 319   string that must appear at a fixed location, and we look for the
 320   longest string that may appear at a floating location. So for instance
 321   in the pattern:
 322
 323     /FOO[xX]A.*B[xX]BAR/
 324
 325   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 326   strings (because they follow a .* construct). study_chunk will identify
 327   both FOO and BAR as being the longest fixed and floating strings respectively.
 328
 329   The strings can be composites, for instance
 330
 331      /(f)(o)(o)/
 332
 333   will result in a composite fixed substring 'foo'.
 334
 335   For each string some basic information is maintained:
 336
 337   - offset or min_offset
 338     This is the position the string must appear at, or not before.
 339     It also implicitly (when combined with minlenp) tells us how many
 340     characters must match before the string we are searching for.
 341     Likewise when combined with minlenp and the length of the string it
 342     tells us how many characters must appear after the string we have
 343     found.
 344
 345   - max_offset
 346     Only used for floating strings. This is the rightmost point that
 347     the string can appear at. If set to SSize_t_MAX it indicates that the
 348     string can occur infinitely far to the right.
 349
 350   - minlenp
 351     A pointer to the minimum number of characters of the pattern that the
 352     string was found inside. This is important as in the case of positive
 353     lookahead or positive lookbehind we can have multiple patterns
 354     involved. Consider
 355
 356     /(?=FOO).*F/
 357
 358     The minimum length of the pattern overall is 3, the minimum length
 359     of the lookahead part is 3, but the minimum length of the part that
 360     will actually match is 1. So 'FOO's minimum length is 3, but the
 361     minimum length for the F is 1. This is important as the minimum length
 362     is used to determine offsets in front of and behind the string being
 363     looked for.  Since strings can be composites this is the length of the
 364     pattern at the time it was committed with a scan_commit. Note that
 365     the length is calculated by study_chunk, so that the minimum lengths
 366     are not known until the full pattern has been compiled, thus the
 367     pointer to the value.
 368
 369   - lookbehind
 370
 371     In the case of lookbehind the string being searched for can be
 372     offset past the start point of the final matching string.
 373     If this value was just blithely removed from the min_offset it would
 374     invalidate some of the calculations for how many chars must match
 375     before or after (as they are derived from min_offset and minlen and
 376     the length of the string being searched for).
 377     When the final pattern is compiled and the data is moved from the
 378     scan_data_t structure into the regexp structure the information
 379     about lookbehind is factored in, with the information that would
 380     have been lost precalculated in the end_shift field for the
 381     associated string.
 382
 383   The fields pos_min and pos_delta are used to store the minimum offset
 384   and the delta to the maximum offset at the current point in the pattern.
 385
 386 */
 387
 388 typedef struct scan_data_t {
 389     /*I32 len_min;      unused */
 390     /*I32 len_delta;    unused */
 391     SSize_t pos_min;
 392     SSize_t pos_delta;
 393     SV *last_found;
 394     SSize_t last_end;       /* min value, <0 unless valid. */
 395     SSize_t last_start_min;
 396     SSize_t last_start_max;
 397     SV **longest;           /* Either &l_fixed, or &l_float. */
 398     SV *longest_fixed;      /* longest fixed string found in pattern */
 399     SSize_t offset_fixed;   /* offset where it starts */
 400     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 401     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 402     SV *longest_float;      /* longest floating string found in pattern */
 403     SSize_t offset_float_min; /* earliest point in string it can appear */
 404     SSize_t offset_float_max; /* latest point in string it can appear */
 405     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 406     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 407     I32 flags;
 408     I32 whilem_c;
 409     SSize_t *last_closep;
 410     regnode_ssc *start_class;
 411 } scan_data_t;
 412
 413 /*
 414  * Forward declarations for pregcomp()'s friends.
 415  */
 416
 417 static const scan_data_t zero_scan_data =
 418   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 419
 420 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 421 #define SF_BEFORE_SEOL          0x0001
 422 #define SF_BEFORE_MEOL          0x0002
 423 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 424 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 425
 426 #define SF_FIX_SHIFT_EOL        (+2)
 427 #define SF_FL_SHIFT_EOL         (+4)
 428
 429 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 430 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 431
 432 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 433 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 434 #define SF_IS_INF               0x0040
 435 #define SF_HAS_PAR              0x0080
 436 #define SF_IN_PAR               0x0100
 437 #define SF_HAS_EVAL             0x0200
 438 #define SCF_DO_SUBSTR           0x0400
 439 #define SCF_DO_STCLASS_AND      0x0800
 440 #define SCF_DO_STCLASS_OR       0x1000
 441 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 442 #define SCF_WHILEM_VISITED_POS  0x2000
 443
 444 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 445 #define SCF_SEEN_ACCEPT         0x8000
 446 #define SCF_TRIE_DOING_RESTUDY 0x10000
 447 #define SCF_IN_DEFINE          0x20000
 448
 449
 450
 451
 452 #define UTF cBOOL(RExC_utf8)
 453
 454 /* The enums for all these are ordered so things work out correctly */
 455 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 456 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 457                                                      == REGEX_DEPENDS_CHARSET)
 458 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 459 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 460                                                      >= REGEX_UNICODE_CHARSET)
 461 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 462                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 463 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 464                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 465 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 466                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 467
 468 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 469
 470 /* For programs that want to be strictly Unicode compatible by dying if any
 471  * attempt is made to match a non-Unicode code point against a Unicode
 472  * property.  */
 473 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 474
 475 #define OOB_NAMEDCLASS          -1
 476
 477 /* There is no code point that is out-of-bounds, so this is problematic.  But
 478  * its only current use is to initialize a variable that is always set before
 479  * looked at. */
 480 #define OOB_UNICODE             0xDEADBEEF
 481
 482 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 483 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 484
 485
 486 /* length of regex to show in messages that don't mark a position within */
 487 #define RegexLengthToShowInErrorMessages 127
 488
 489 /*
 490  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 491  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 492  * op/pragma/warn/regcomp.
 493  */
 494 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 495 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 496
 497 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 498                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 499
 500 #define REPORT_LOCATION_ARGS(offset)            \
 501                 UTF8fARG(UTF, offset, RExC_precomp), \
 502                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 503
 504 /*
 505  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 506  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 507  * "...".
 508  */
 509 #define _FAIL(code) STMT_START {                                        \
 510     const char *ellipses = "";                                          \
 511     IV len = RExC_end - RExC_precomp;                                   \
 512                                                                         \
 513     if (!SIZE_ONLY)                                                     \
 514         SAVEFREESV(RExC_rx_sv);                                         \
 515     if (len > RegexLengthToShowInErrorMessages) {                       \
 516         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 517         len = RegexLengthToShowInErrorMessages - 10;                    \
 518         ellipses = "...";                                               \
 519     }                                                                   \
 520     code;                                                               \
 521 } STMT_END
 522
 523 #define FAIL(msg) _FAIL(                            \
 524     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 525             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 526
 527 #define FAIL2(msg,arg) _FAIL(                       \
 528     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 529             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 530
 531 /*
 532  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 533  */
 534 #define Simple_vFAIL(m) STMT_START {                                    \
 535     const IV offset =                                                   \
 536         (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
 537     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 538             m, REPORT_LOCATION_ARGS(offset));   \
 539 } STMT_END
 540
 541 /*
 542  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 543  */
 544 #define vFAIL(m) STMT_START {                           \
 545     if (!SIZE_ONLY)                                     \
 546         SAVEFREESV(RExC_rx_sv);                         \
 547     Simple_vFAIL(m);                                    \
 548 } STMT_END
 549
 550 /*
 551  * Like Simple_vFAIL(), but accepts two arguments.
 552  */
 553 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 554     const IV offset = RExC_parse - RExC_precomp;                        \
 555     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 556                       REPORT_LOCATION_ARGS(offset));    \
 557 } STMT_END
 558
 559 /*
 560  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 561  */
 562 #define vFAIL2(m,a1) STMT_START {                       \
 563     if (!SIZE_ONLY)                                     \
 564         SAVEFREESV(RExC_rx_sv);                         \
 565     Simple_vFAIL2(m, a1);                               \
 566 } STMT_END
 567
 568
 569 /*
 570  * Like Simple_vFAIL(), but accepts three arguments.
 571  */
 572 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 573     const IV offset = RExC_parse - RExC_precomp;                \
 574     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 575             REPORT_LOCATION_ARGS(offset));      \
 576 } STMT_END
 577
 578 /*
 579  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 580  */
 581 #define vFAIL3(m,a1,a2) STMT_START {                    \
 582     if (!SIZE_ONLY)                                     \
 583         SAVEFREESV(RExC_rx_sv);                         \
 584     Simple_vFAIL3(m, a1, a2);                           \
 585 } STMT_END
 586
 587 /*
 588  * Like Simple_vFAIL(), but accepts four arguments.
 589  */
 590 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 591     const IV offset = RExC_parse - RExC_precomp;                \
 592     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 593             REPORT_LOCATION_ARGS(offset));      \
 594 } STMT_END
 595
 596 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 597     if (!SIZE_ONLY)                                     \
 598         SAVEFREESV(RExC_rx_sv);                         \
 599     Simple_vFAIL4(m, a1, a2, a3);                       \
 600 } STMT_END
 601
 602 /* A specialized version of vFAIL2 that works with UTF8f */
 603 #define vFAIL2utf8f(m, a1) STMT_START { \
 604     const IV offset = RExC_parse - RExC_precomp;   \
 605     if (!SIZE_ONLY)                                \
 606         SAVEFREESV(RExC_rx_sv);                    \
 607     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 608             REPORT_LOCATION_ARGS(offset));         \
 609 } STMT_END
 610
 611 /* These have asserts in them because of [perl #122671] Many warnings in
 612  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 613  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 614  * would get output again.  So they should be output in pass2, and these
 615  * asserts make sure new warnings follow that paradigm. */
 616
 617 /* m is not necessarily a "literal string", in this macro */
 618 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 619     const IV offset = loc - RExC_precomp;                               \
 620     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 621             m, REPORT_LOCATION_ARGS(offset));       \
 622 } STMT_END
 623
 624 #define ckWARNreg(loc,m) STMT_START {                                   \
 625     const IV offset = loc - RExC_precomp;                               \
 626     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 627             REPORT_LOCATION_ARGS(offset));              \
 628 } STMT_END
 629
 630 #define vWARN_dep(loc, m) STMT_START {                                  \
 631     const IV offset = loc - RExC_precomp;                               \
 632     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,    \
 633             REPORT_LOCATION_ARGS(offset));              \
 634 } STMT_END
 635
 636 #define ckWARNdep(loc,m) STMT_START {                                   \
 637     const IV offset = loc - RExC_precomp;                               \
 638     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                  \
 639             m REPORT_LOCATION,                                          \
 640             REPORT_LOCATION_ARGS(offset));              \
 641 } STMT_END
 642
 643 #define ckWARNregdep(loc,m) STMT_START {                                \
 644     const IV offset = loc - RExC_precomp;                               \
 645     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
 646             m REPORT_LOCATION,                                          \
 647             REPORT_LOCATION_ARGS(offset));              \
 648 } STMT_END
 649
 650 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 651     const IV offset = loc - RExC_precomp;                               \
 652     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                      \
 653             m REPORT_LOCATION,                                          \
 654             a1, REPORT_LOCATION_ARGS(offset));  \
 655 } STMT_END
 656
 657 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 658     const IV offset = loc - RExC_precomp;                               \
 659     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 660             a1, REPORT_LOCATION_ARGS(offset));  \
 661 } STMT_END
 662
 663 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 664     const IV offset = loc - RExC_precomp;                               \
 665     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 666             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 667 } STMT_END
 668
 669 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 670     const IV offset = loc - RExC_precomp;                               \
 671     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 672             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 673 } STMT_END
 674
 675 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 676     const IV offset = loc - RExC_precomp;                               \
 677     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 678             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 679 } STMT_END
 680
 681 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 682     const IV offset = loc - RExC_precomp;                               \
 683     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 684             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 685 } STMT_END
 686
 687 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 688     const IV offset = loc - RExC_precomp;                               \
 689     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 690             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 691 } STMT_END
 692
 693
 694 /* Allow for side effects in s */
 695 #define REGC(c,s) STMT_START {                  \
 696     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 697 } STMT_END
 698
 699 /* Macros for recording node offsets.   20001227 mjd@plover.com
 700  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 701  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 702  * Element 0 holds the number n.
 703  * Position is 1 indexed.
 704  */
 705 #ifndef RE_TRACK_PATTERN_OFFSETS
 706 #define Set_Node_Offset_To_R(node,byte)
 707 #define Set_Node_Offset(node,byte)
 708 #define Set_Cur_Node_Offset
 709 #define Set_Node_Length_To_R(node,len)
 710 #define Set_Node_Length(node,len)
 711 #define Set_Node_Cur_Length(node,start)
 712 #define Node_Offset(n)
 713 #define Node_Length(n)
 714 #define Set_Node_Offset_Length(node,offset,len)
 715 #define ProgLen(ri) ri->u.proglen
 716 #define SetProgLen(ri,x) ri->u.proglen = x
 717 #else
 718 #define ProgLen(ri) ri->u.offsets[0]
 719 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 720 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 721     if (! SIZE_ONLY) {                                                  \
 722         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 723                     __LINE__, (int)(node), (int)(byte)));               \
 724         if((node) < 0) {                                                \
 725             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 726                                          (int)(node));                  \
 727         } else {                                                        \
 728             RExC_offsets[2*(node)-1] = (byte);                          \
 729         }                                                               \
 730     }                                                                   \
 731 } STMT_END
 732
 733 #define Set_Node_Offset(node,byte) \
 734     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 735 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 736
 737 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 738     if (! SIZE_ONLY) {                                                  \
 739         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 740                 __LINE__, (int)(node), (int)(len)));                    \
 741         if((node) < 0) {                                                \
 742             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 743                                          (int)(node));                  \
 744         } else {                                                        \
 745             RExC_offsets[2*(node)] = (len);                             \
 746         }                                                               \
 747     }                                                                   \
 748 } STMT_END
 749
 750 #define Set_Node_Length(node,len) \
 751     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 752 #define Set_Node_Cur_Length(node, start)                \
 753     Set_Node_Length(node, RExC_parse - start)
 754
 755 /* Get offsets and lengths */
 756 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 757 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 758
 759 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 760     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 761     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 762 } STMT_END
 763 #endif
 764
 765 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 766 #define EXPERIMENTAL_INPLACESCAN
 767 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 768
 769 #define DEBUG_RExC_seen() \
 770         DEBUG_OPTIMISE_MORE_r({                                             \
 771             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 772                                                                             \
 773             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 774                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 775                                                                             \
 776             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 777                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 778                                                                             \
 779             if (RExC_seen & REG_GPOS_SEEN)                                  \
 780                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 781                                                                             \
 782             if (RExC_seen & REG_CANY_SEEN)                                  \
 783                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 784                                                                             \
 785             if (RExC_seen & REG_RECURSE_SEEN)                               \
 786                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 787                                                                             \
 788             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 789                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 790                                                                             \
 791             if (RExC_seen & REG_VERBARG_SEEN)                               \
 792                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 793                                                                             \
 794             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 795                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 796                                                                             \
 797             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 798                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 799                                                                             \
 800             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 801                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 802                                                                             \
 803             if (RExC_seen & REG_GOSTART_SEEN)                               \
 804                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 805                                                                             \
 806             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 807                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 808                                                                             \
 809             PerlIO_printf(Perl_debug_log,"\n");                             \
 810         });
 811
 812 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 813   if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
 814
 815 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 816     if ( ( flags ) ) {                                                      \
 817         PerlIO_printf(Perl_debug_log, "%s", open_str);                      \
 818         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 819         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 820         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 821         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 822         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 823         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 824         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 825         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 826         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 827         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 828         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 829         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 830         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 831         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 832         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 833         PerlIO_printf(Perl_debug_log, "%s", close_str);                     \
 834     }
 835
 836
 837 #define DEBUG_STUDYDATA(str,data,depth)                              \
 838 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 839     PerlIO_printf(Perl_debug_log,                                    \
 840         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 841         " Flags: 0x%"UVXf,                                           \
 842         (int)(depth)*2, "",                                          \
 843         (IV)((data)->pos_min),                                       \
 844         (IV)((data)->pos_delta),                                     \
 845         (UV)((data)->flags)                                          \
 846     );                                                               \
 847     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
 848     PerlIO_printf(Perl_debug_log,                                    \
 849         " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
 850         (IV)((data)->whilem_c),                                      \
 851         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 852         is_inf ? "INF " : ""                                         \
 853     );                                                               \
 854     if ((data)->last_found)                                          \
 855         PerlIO_printf(Perl_debug_log,                                \
 856             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 857             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 858             SvPVX_const((data)->last_found),                         \
 859             (IV)((data)->last_end),                                  \
 860             (IV)((data)->last_start_min),                            \
 861             (IV)((data)->last_start_max),                            \
 862             ((data)->longest &&                                      \
 863              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 864             SvPVX_const((data)->longest_fixed),                      \
 865             (IV)((data)->offset_fixed),                              \
 866             ((data)->longest &&                                      \
 867              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 868             SvPVX_const((data)->longest_float),                      \
 869             (IV)((data)->offset_float_min),                          \
 870             (IV)((data)->offset_float_max)                           \
 871         );                                                           \
 872     PerlIO_printf(Perl_debug_log,"\n");                              \
 873 });
 874
 875 #ifdef DEBUGGING
 876
 877 /* is c a control character for which we have a mnemonic? */
 878 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 879
 880 STATIC const char *
 881 S_cntrl_to_mnemonic(const U8 c)
 882 {
 883     /* Returns the mnemonic string that represents character 'c', if one
 884      * exists; NULL otherwise.  The only ones that exist for the purposes of
 885      * this routine are a few control characters */
 886
 887     switch (c) {
 888         case '\a':       return "\\a";
 889         case '\b':       return "\\b";
 890         case ESC_NATIVE: return "\\e";
 891         case '\f':       return "\\f";
 892         case '\n':       return "\\n";
 893         case '\r':       return "\\r";
 894         case '\t':       return "\\t";
 895     }
 896
 897     return NULL;
 898 }
 899
 900 #endif
 901
 902 /* Mark that we cannot extend a found fixed substring at this point.
 903    Update the longest found anchored substring and the longest found
 904    floating substrings if needed. */
 905
 906 STATIC void
 907 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 908                     SSize_t *minlenp, int is_inf)
 909 {
 910     const STRLEN l = CHR_SVLEN(data->last_found);
 911     const STRLEN old_l = CHR_SVLEN(*data->longest);
 912     GET_RE_DEBUG_FLAGS_DECL;
 913
 914     PERL_ARGS_ASSERT_SCAN_COMMIT;
 915
 916     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 917         SvSetMagicSV(*data->longest, data->last_found);
 918         if (*data->longest == data->longest_fixed) {
 919             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 920             if (data->flags & SF_BEFORE_EOL)
 921                 data->flags
 922                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 923             else
 924                 data->flags &= ~SF_FIX_BEFORE_EOL;
 925             data->minlen_fixed=minlenp;
 926             data->lookbehind_fixed=0;
 927         }
 928         else { /* *data->longest == data->longest_float */
 929             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 930             data->offset_float_max = (l
 931                                       ? data->last_start_max
 932                                       : (data->pos_delta == SSize_t_MAX
 933                                          ? SSize_t_MAX
 934                                          : data->pos_min + data->pos_delta));
 935             if (is_inf
 936                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 937                 data->offset_float_max = SSize_t_MAX;
 938             if (data->flags & SF_BEFORE_EOL)
 939                 data->flags
 940                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 941             else
 942                 data->flags &= ~SF_FL_BEFORE_EOL;
 943             data->minlen_float=minlenp;
 944             data->lookbehind_float=0;
 945         }
 946     }
 947     SvCUR_set(data->last_found, 0);
 948     {
 949         SV * const sv = data->last_found;
 950         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 951             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 952             if (mg)
 953                 mg->mg_len = 0;
 954         }
 955     }
 956     data->last_end = -1;
 957     data->flags &= ~SF_BEFORE_EOL;
 958     DEBUG_STUDYDATA("commit: ",data,0);
 959 }
 960
 961 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 962  * list that describes which code points it matches */
 963
 964 STATIC void
 965 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 966 {
 967     /* Set the SSC 'ssc' to match an empty string or any code point */
 968
 969     PERL_ARGS_ASSERT_SSC_ANYTHING;
 970
 971     assert(is_ANYOF_SYNTHETIC(ssc));
 972
 973     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 974     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 975     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 976 }
 977
 978 STATIC int
 979 S_ssc_is_anything(const regnode_ssc *ssc)
 980 {
 981     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 982      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 983      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 984      * in any way, so there's no point in using it */
 985
 986     UV start, end;
 987     bool ret;
 988
 989     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 990
 991     assert(is_ANYOF_SYNTHETIC(ssc));
 992
 993     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
 994         return FALSE;
 995     }
 996
 997     /* See if the list consists solely of the range 0 - Infinity */
 998     invlist_iterinit(ssc->invlist);
 999     ret = invlist_iternext(ssc->invlist, &start, &end)
1000           && start == 0
1001           && end == UV_MAX;
1002
1003     invlist_iterfinish(ssc->invlist);
1004
1005     if (ret) {
1006         return TRUE;
1007     }
1008
1009     /* If e.g., both \w and \W are set, matches everything */
1010     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1011         int i;
1012         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1013             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1014                 return TRUE;
1015             }
1016         }
1017     }
1018
1019     return FALSE;
1020 }
1021
1022 STATIC void
1023 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1024 {
1025     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1026      * string, any code point, or any posix class under locale */
1027
1028     PERL_ARGS_ASSERT_SSC_INIT;
1029
1030     Zero(ssc, 1, regnode_ssc);
1031     set_ANYOF_SYNTHETIC(ssc);
1032     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1033     ssc_anything(ssc);
1034
1035     /* If any portion of the regex is to operate under locale rules,
1036      * initialization includes it.  The reason this isn't done for all regexes
1037      * is that the optimizer was written under the assumption that locale was
1038      * all-or-nothing.  Given the complexity and lack of documentation in the
1039      * optimizer, and that there are inadequate test cases for locale, many
1040      * parts of it may not work properly, it is safest to avoid locale unless
1041      * necessary. */
1042     if (RExC_contains_locale) {
1043         ANYOF_POSIXL_SETALL(ssc);
1044     }
1045     else {
1046         ANYOF_POSIXL_ZERO(ssc);
1047     }
1048 }
1049
1050 STATIC int
1051 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1052                         const regnode_ssc *ssc)
1053 {
1054     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1055      * to the list of code points matched, and locale posix classes; hence does
1056      * not check its flags) */
1057
1058     UV start, end;
1059     bool ret;
1060
1061     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1062
1063     assert(is_ANYOF_SYNTHETIC(ssc));
1064
1065     invlist_iterinit(ssc->invlist);
1066     ret = invlist_iternext(ssc->invlist, &start, &end)
1067           && start == 0
1068           && end == UV_MAX;
1069
1070     invlist_iterfinish(ssc->invlist);
1071
1072     if (! ret) {
1073         return FALSE;
1074     }
1075
1076     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1077         return FALSE;
1078     }
1079
1080     return TRUE;
1081 }
1082
1083 STATIC SV*
1084 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1085                                const regnode_charclass* const node)
1086 {
1087     /* Returns a mortal inversion list defining which code points are matched
1088      * by 'node', which is of type ANYOF.  Handles complementing the result if
1089      * appropriate.  If some code points aren't knowable at this time, the
1090      * returned list must, and will, contain every code point that is a
1091      * possibility. */
1092
1093     SV* invlist = sv_2mortal(_new_invlist(0));
1094     SV* only_utf8_locale_invlist = NULL;
1095     unsigned int i;
1096     const U32 n = ARG(node);
1097     bool new_node_has_latin1 = FALSE;
1098
1099     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1100
1101     /* Look at the data structure created by S_set_ANYOF_arg() */
1102     if (n != ANYOF_ONLY_HAS_BITMAP) {
1103         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1104         AV * const av = MUTABLE_AV(SvRV(rv));
1105         SV **const ary = AvARRAY(av);
1106         assert(RExC_rxi->data->what[n] == 's');
1107
1108         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1109             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1110         }
1111         else if (ary[0] && ary[0] != &PL_sv_undef) {
1112
1113             /* Here, no compile-time swash, and there are things that won't be
1114              * known until runtime -- we have to assume it could be anything */
1115             return _add_range_to_invlist(invlist, 0, UV_MAX);
1116         }
1117         else if (ary[3] && ary[3] != &PL_sv_undef) {
1118
1119             /* Here no compile-time swash, and no run-time only data.  Use the
1120              * node's inversion list */
1121             invlist = sv_2mortal(invlist_clone(ary[3]));
1122         }
1123
1124         /* Get the code points valid only under UTF-8 locales */
1125         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1126             && ary[2] && ary[2] != &PL_sv_undef)
1127         {
1128             only_utf8_locale_invlist = ary[2];
1129         }
1130     }
1131
1132     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1133      * code points, and an inversion list for the others, but if there are code
1134      * points that should match only conditionally on the target string being
1135      * UTF-8, those are placed in the inversion list, and not the bitmap.
1136      * Since there are circumstances under which they could match, they are
1137      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1138      * to exclude them here, so that when we invert below, the end result
1139      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1140      * have to do this here before we add the unconditionally matched code
1141      * points */
1142     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1143         _invlist_intersection_complement_2nd(invlist,
1144                                              PL_UpperLatin1,
1145                                              &invlist);
1146     }
1147
1148     /* Add in the points from the bit map */
1149     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1150         if (ANYOF_BITMAP_TEST(node, i)) {
1151             invlist = add_cp_to_invlist(invlist, i);
1152             new_node_has_latin1 = TRUE;
1153         }
1154     }
1155
1156     /* If this can match all upper Latin1 code points, have to add them
1157      * as well */
1158     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1159         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1160     }
1161
1162     /* Similarly for these */
1163     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1164         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1165     }
1166
1167     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168         _invlist_invert(invlist);
1169     }
1170     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1171
1172         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1173          * locale.  We can skip this if there are no 0-255 at all. */
1174         _invlist_union(invlist, PL_Latin1, &invlist);
1175     }
1176
1177     /* Similarly add the UTF-8 locale possible matches.  These have to be
1178      * deferred until after the non-UTF-8 locale ones are taken care of just
1179      * above, or it leads to wrong results under ANYOF_INVERT */
1180     if (only_utf8_locale_invlist) {
1181         _invlist_union_maybe_complement_2nd(invlist,
1182                                             only_utf8_locale_invlist,
1183                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1184                                             &invlist);
1185     }
1186
1187     return invlist;
1188 }
1189
1190 /* These two functions currently do the exact same thing */
1191 #define ssc_init_zero           ssc_init
1192
1193 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1194 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1195
1196 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1197  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1198  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1199
1200 STATIC void
1201 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1202                 const regnode_charclass *and_with)
1203 {
1204     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1205      * another SSC or a regular ANYOF class.  Can create false positives. */
1206
1207     SV* anded_cp_list;
1208     U8  anded_flags;
1209
1210     PERL_ARGS_ASSERT_SSC_AND;
1211
1212     assert(is_ANYOF_SYNTHETIC(ssc));
1213
1214     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1215      * the code point inversion list and just the relevant flags */
1216     if (is_ANYOF_SYNTHETIC(and_with)) {
1217         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1218         anded_flags = ANYOF_FLAGS(and_with);
1219
1220         /* XXX This is a kludge around what appears to be deficiencies in the
1221          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1222          * there are paths through the optimizer where it doesn't get weeded
1223          * out when it should.  And if we don't make some extra provision for
1224          * it like the code just below, it doesn't get added when it should.
1225          * This solution is to add it only when AND'ing, which is here, and
1226          * only when what is being AND'ed is the pristine, original node
1227          * matching anything.  Thus it is like adding it to ssc_anything() but
1228          * only when the result is to be AND'ed.  Probably the same solution
1229          * could be adopted for the same problem we have with /l matching,
1230          * which is solved differently in S_ssc_init(), and that would lead to
1231          * fewer false positives than that solution has.  But if this solution
1232          * creates bugs, the consequences are only that a warning isn't raised
1233          * that should be; while the consequences for having /l bugs is
1234          * incorrect matches */
1235         if (ssc_is_anything((regnode_ssc *)and_with)) {
1236             anded_flags |= ANYOF_WARN_SUPER;
1237         }
1238     }
1239     else {
1240         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1241         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1242     }
1243
1244     ANYOF_FLAGS(ssc) &= anded_flags;
1245
1246     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1247      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1248      * 'and_with' may be inverted.  When not inverted, we have the situation of
1249      * computing:
1250      *  (C1 | P1) & (C2 | P2)
1251      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1252      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1253      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1254      *                    <=  ((C1 & C2) | P1 | P2)
1255      * Alternatively, the last few steps could be:
1256      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1257      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1258      *                    <=  (C1 | C2 | (P1 & P2))
1259      * We favor the second approach if either P1 or P2 is non-empty.  This is
1260      * because these components are a barrier to doing optimizations, as what
1261      * they match cannot be known until the moment of matching as they are
1262      * dependent on the current locale, 'AND"ing them likely will reduce or
1263      * eliminate them.
1264      * But we can do better if we know that C1,P1 are in their initial state (a
1265      * frequent occurrence), each matching everything:
1266      *  (<everything>) & (C2 | P2) =  C2 | P2
1267      * Similarly, if C2,P2 are in their initial state (again a frequent
1268      * occurrence), the result is a no-op
1269      *  (C1 | P1) & (<everything>) =  C1 | P1
1270      *
1271      * Inverted, we have
1272      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1273      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1274      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1275      * */
1276
1277     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1278         && ! is_ANYOF_SYNTHETIC(and_with))
1279     {
1280         unsigned int i;
1281
1282         ssc_intersection(ssc,
1283                          anded_cp_list,
1284                          FALSE /* Has already been inverted */
1285                          );
1286
1287         /* If either P1 or P2 is empty, the intersection will be also; can skip
1288          * the loop */
1289         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1290             ANYOF_POSIXL_ZERO(ssc);
1291         }
1292         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1293
1294             /* Note that the Posix class component P from 'and_with' actually
1295              * looks like:
1296              *      P = Pa | Pb | ... | Pn
1297              * where each component is one posix class, such as in [\w\s].
1298              * Thus
1299              *      ~P = ~(Pa | Pb | ... | Pn)
1300              *         = ~Pa & ~Pb & ... & ~Pn
1301              *        <= ~Pa | ~Pb | ... | ~Pn
1302              * The last is something we can easily calculate, but unfortunately
1303              * is likely to have many false positives.  We could do better
1304              * in some (but certainly not all) instances if two classes in
1305              * P have known relationships.  For example
1306              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1307              * So
1308              *      :lower: & :print: = :lower:
1309              * And similarly for classes that must be disjoint.  For example,
1310              * since \s and \w can have no elements in common based on rules in
1311              * the POSIX standard,
1312              *      \w & ^\S = nothing
1313              * Unfortunately, some vendor locales do not meet the Posix
1314              * standard, in particular almost everything by Microsoft.
1315              * The loop below just changes e.g., \w into \W and vice versa */
1316
1317             regnode_charclass_posixl temp;
1318             int add = 1;    /* To calculate the index of the complement */
1319
1320             ANYOF_POSIXL_ZERO(&temp);
1321             for (i = 0; i < ANYOF_MAX; i++) {
1322                 assert(i % 2 != 0
1323                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1324                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1325
1326                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1327                     ANYOF_POSIXL_SET(&temp, i + add);
1328                 }
1329                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1330             }
1331             ANYOF_POSIXL_AND(&temp, ssc);
1332
1333         } /* else ssc already has no posixes */
1334     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1335          in its initial state */
1336     else if (! is_ANYOF_SYNTHETIC(and_with)
1337              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1338     {
1339         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1340          * copy it over 'ssc' */
1341         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1342             if (is_ANYOF_SYNTHETIC(and_with)) {
1343                 StructCopy(and_with, ssc, regnode_ssc);
1344             }
1345             else {
1346                 ssc->invlist = anded_cp_list;
1347                 ANYOF_POSIXL_ZERO(ssc);
1348                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1349                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1350                 }
1351             }
1352         }
1353         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1354                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1355         {
1356             /* One or the other of P1, P2 is non-empty. */
1357             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1358                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1359             }
1360             ssc_union(ssc, anded_cp_list, FALSE);
1361         }
1362         else { /* P1 = P2 = empty */
1363             ssc_intersection(ssc, anded_cp_list, FALSE);
1364         }
1365     }
1366 }
1367
1368 STATIC void
1369 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1370                const regnode_charclass *or_with)
1371 {
1372     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1373      * another SSC or a regular ANYOF class.  Can create false positives if
1374      * 'or_with' is to be inverted. */
1375
1376     SV* ored_cp_list;
1377     U8 ored_flags;
1378
1379     PERL_ARGS_ASSERT_SSC_OR;
1380
1381     assert(is_ANYOF_SYNTHETIC(ssc));
1382
1383     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1384      * the code point inversion list and just the relevant flags */
1385     if (is_ANYOF_SYNTHETIC(or_with)) {
1386         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1387         ored_flags = ANYOF_FLAGS(or_with);
1388     }
1389     else {
1390         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1391         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1392     }
1393
1394     ANYOF_FLAGS(ssc) |= ored_flags;
1395
1396     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1397      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1398      * 'or_with' may be inverted.  When not inverted, we have the simple
1399      * situation of computing:
1400      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1401      * If P1|P2 yields a situation with both a class and its complement are
1402      * set, like having both \w and \W, this matches all code points, and we
1403      * can delete these from the P component of the ssc going forward.  XXX We
1404      * might be able to delete all the P components, but I (khw) am not certain
1405      * about this, and it is better to be safe.
1406      *
1407      * Inverted, we have
1408      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1409      *                         <=  (C1 | P1) | ~C2
1410      *                         <=  (C1 | ~C2) | P1
1411      * (which results in actually simpler code than the non-inverted case)
1412      * */
1413
1414     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1415         && ! is_ANYOF_SYNTHETIC(or_with))
1416     {
1417         /* We ignore P2, leaving P1 going forward */
1418     }   /* else  Not inverted */
1419     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1420         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1421         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1422             unsigned int i;
1423             for (i = 0; i < ANYOF_MAX; i += 2) {
1424                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1425                 {
1426                     ssc_match_all_cp(ssc);
1427                     ANYOF_POSIXL_CLEAR(ssc, i);
1428                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1429                 }
1430             }
1431         }
1432     }
1433
1434     ssc_union(ssc,
1435               ored_cp_list,
1436               FALSE /* Already has been inverted */
1437               );
1438 }
1439
1440 PERL_STATIC_INLINE void
1441 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1442 {
1443     PERL_ARGS_ASSERT_SSC_UNION;
1444
1445     assert(is_ANYOF_SYNTHETIC(ssc));
1446
1447     _invlist_union_maybe_complement_2nd(ssc->invlist,
1448                                         invlist,
1449                                         invert2nd,
1450                                         &ssc->invlist);
1451 }
1452
1453 PERL_STATIC_INLINE void
1454 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1455                          SV* const invlist,
1456                          const bool invert2nd)
1457 {
1458     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1459
1460     assert(is_ANYOF_SYNTHETIC(ssc));
1461
1462     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1463                                                invlist,
1464                                                invert2nd,
1465                                                &ssc->invlist);
1466 }
1467
1468 PERL_STATIC_INLINE void
1469 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1470 {
1471     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1472
1473     assert(is_ANYOF_SYNTHETIC(ssc));
1474
1475     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1476 }
1477
1478 PERL_STATIC_INLINE void
1479 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1480 {
1481     /* AND just the single code point 'cp' into the SSC 'ssc' */
1482
1483     SV* cp_list = _new_invlist(2);
1484
1485     PERL_ARGS_ASSERT_SSC_CP_AND;
1486
1487     assert(is_ANYOF_SYNTHETIC(ssc));
1488
1489     cp_list = add_cp_to_invlist(cp_list, cp);
1490     ssc_intersection(ssc, cp_list,
1491                      FALSE /* Not inverted */
1492                      );
1493     SvREFCNT_dec_NN(cp_list);
1494 }
1495
1496 PERL_STATIC_INLINE void
1497 S_ssc_clear_locale(regnode_ssc *ssc)
1498 {
1499     /* Set the SSC 'ssc' to not match any locale things */
1500     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1501
1502     assert(is_ANYOF_SYNTHETIC(ssc));
1503
1504     ANYOF_POSIXL_ZERO(ssc);
1505     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1506 }
1507
1508 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1509
1510 STATIC bool
1511 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1512 {
1513     /* The synthetic start class is used to hopefully quickly winnow down
1514      * places where a pattern could start a match in the target string.  If it
1515      * doesn't really narrow things down that much, there isn't much point to
1516      * having the overhead of using it.  This function uses some very crude
1517      * heuristics to decide if to use the ssc or not.
1518      *
1519      * It returns TRUE if 'ssc' rules out more than half what it considers to
1520      * be the "likely" possible matches, but of course it doesn't know what the
1521      * actual things being matched are going to be; these are only guesses
1522      *
1523      * For /l matches, it assumes that the only likely matches are going to be
1524      *      in the 0-255 range, uniformly distributed, so half of that is 127
1525      * For /a and /d matches, it assumes that the likely matches will be just
1526      *      the ASCII range, so half of that is 63
1527      * For /u and there isn't anything matching above the Latin1 range, it
1528      *      assumes that that is the only range likely to be matched, and uses
1529      *      half that as the cut-off: 127.  If anything matches above Latin1,
1530      *      it assumes that all of Unicode could match (uniformly), except for
1531      *      non-Unicode code points and things in the General Category "Other"
1532      *      (unassigned, private use, surrogates, controls and formats).  This
1533      *      is a much large number. */
1534
1535     const U32 max_match = (LOC)
1536                           ? 127
1537                           : (! UNI_SEMANTICS)
1538                             ? 63
1539                             : (invlist_highest(ssc->invlist) < 256)
1540                               ? 127
1541                               : ((NON_OTHER_COUNT + 1) / 2) - 1;
1542     U32 count = 0;      /* Running total of number of code points matched by
1543                            'ssc' */
1544     UV start, end;      /* Start and end points of current range in inversion
1545                            list */
1546
1547     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1548
1549     invlist_iterinit(ssc->invlist);
1550     while (invlist_iternext(ssc->invlist, &start, &end)) {
1551
1552         /* /u is the only thing that we expect to match above 255; so if not /u
1553          * and even if there are matches above 255, ignore them.  This catches
1554          * things like \d under /d which does match the digits above 255, but
1555          * since the pattern is /d, it is not likely to be expecting them */
1556         if (! UNI_SEMANTICS) {
1557             if (start > 255) {
1558                 break;
1559             }
1560             end = MIN(end, 255);
1561         }
1562         count += end - start + 1;
1563         if (count > max_match) {
1564             invlist_iterfinish(ssc->invlist);
1565             return FALSE;
1566         }
1567     }
1568
1569     return TRUE;
1570 }
1571
1572
1573 STATIC void
1574 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1575 {
1576     /* The inversion list in the SSC is marked mortal; now we need a more
1577      * permanent copy, which is stored the same way that is done in a regular
1578      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1579      * map */
1580
1581     SV* invlist = invlist_clone(ssc->invlist);
1582
1583     PERL_ARGS_ASSERT_SSC_FINALIZE;
1584
1585     assert(is_ANYOF_SYNTHETIC(ssc));
1586
1587     /* The code in this file assumes that all but these flags aren't relevant
1588      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1589      * by the time we reach here */
1590     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1591
1592     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1593
1594     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1595                                 NULL, NULL, NULL, FALSE);
1596
1597     /* Make sure is clone-safe */
1598     ssc->invlist = NULL;
1599
1600     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1601         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1602     }
1603
1604     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1605 }
1606
1607 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1608 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1609 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1610 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1611                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1612                                : 0 )
1613
1614
1615 #ifdef DEBUGGING
1616 /*
1617    dump_trie(trie,widecharmap,revcharmap)
1618    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1619    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1620
1621    These routines dump out a trie in a somewhat readable format.
1622    The _interim_ variants are used for debugging the interim
1623    tables that are used to generate the final compressed
1624    representation which is what dump_trie expects.
1625
1626    Part of the reason for their existence is to provide a form
1627    of documentation as to how the different representations function.
1628
1629 */
1630
1631 /*
1632   Dumps the final compressed table form of the trie to Perl_debug_log.
1633   Used for debugging make_trie().
1634 */
1635
1636 STATIC void
1637 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1638             AV *revcharmap, U32 depth)
1639 {
1640     U32 state;
1641     SV *sv=sv_newmortal();
1642     int colwidth= widecharmap ? 6 : 4;
1643     U16 word;
1644     GET_RE_DEBUG_FLAGS_DECL;
1645
1646     PERL_ARGS_ASSERT_DUMP_TRIE;
1647
1648     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1649         (int)depth * 2 + 2,"",
1650         "Match","Base","Ofs" );
1651
1652     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1653         SV ** const tmp = av_fetch( revcharmap, state, 0);
1654         if ( tmp ) {
1655             PerlIO_printf( Perl_debug_log, "%*s",
1656                 colwidth,
1657                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1658                             PL_colors[0], PL_colors[1],
1659                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1660                             PERL_PV_ESCAPE_FIRSTCHAR
1661                 )
1662             );
1663         }
1664     }
1665     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1666         (int)depth * 2 + 2,"");
1667
1668     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1669         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1670     PerlIO_printf( Perl_debug_log, "\n");
1671
1672     for( state = 1 ; state < trie->statecount ; state++ ) {
1673         const U32 base = trie->states[ state ].trans.base;
1674
1675         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1676                                        (int)depth * 2 + 2,"", (UV)state);
1677
1678         if ( trie->states[ state ].wordnum ) {
1679             PerlIO_printf( Perl_debug_log, " W%4X",
1680                                            trie->states[ state ].wordnum );
1681         } else {
1682             PerlIO_printf( Perl_debug_log, "%6s", "" );
1683         }
1684
1685         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1686
1687         if ( base ) {
1688             U32 ofs = 0;
1689
1690             while( ( base + ofs  < trie->uniquecharcount ) ||
1691                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1692                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1693                                                                     != state))
1694                     ofs++;
1695
1696             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1697
1698             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1699                 if ( ( base + ofs >= trie->uniquecharcount )
1700                         && ( base + ofs - trie->uniquecharcount
1701                                                         < trie->lasttrans )
1702                         && trie->trans[ base + ofs
1703                                     - trie->uniquecharcount ].check == state )
1704                 {
1705                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1706                     colwidth,
1707                     (UV)trie->trans[ base + ofs
1708                                              - trie->uniquecharcount ].next );
1709                 } else {
1710                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1711                 }
1712             }
1713
1714             PerlIO_printf( Perl_debug_log, "]");
1715
1716         }
1717         PerlIO_printf( Perl_debug_log, "\n" );
1718     }
1719     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1720                                 (int)depth*2, "");
1721     for (word=1; word <= trie->wordcount; word++) {
1722         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1723             (int)word, (int)(trie->wordinfo[word].prev),
1724             (int)(trie->wordinfo[word].len));
1725     }
1726     PerlIO_printf(Perl_debug_log, "\n" );
1727 }
1728 /*
1729   Dumps a fully constructed but uncompressed trie in list form.
1730   List tries normally only are used for construction when the number of
1731   possible chars (trie->uniquecharcount) is very high.
1732   Used for debugging make_trie().
1733 */
1734 STATIC void
1735 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1736                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1737                          U32 depth)
1738 {
1739     U32 state;
1740     SV *sv=sv_newmortal();
1741     int colwidth= widecharmap ? 6 : 4;
1742     GET_RE_DEBUG_FLAGS_DECL;
1743
1744     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1745
1746     /* print out the table precompression.  */
1747     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1748         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1749         "------:-----+-----------------\n" );
1750
1751     for( state=1 ; state < next_alloc ; state ++ ) {
1752         U16 charid;
1753
1754         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1755             (int)depth * 2 + 2,"", (UV)state  );
1756         if ( ! trie->states[ state ].wordnum ) {
1757             PerlIO_printf( Perl_debug_log, "%5s| ","");
1758         } else {
1759             PerlIO_printf( Perl_debug_log, "W%4x| ",
1760                 trie->states[ state ].wordnum
1761             );
1762         }
1763         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1764             SV ** const tmp = av_fetch( revcharmap,
1765                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1766             if ( tmp ) {
1767                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1768                     colwidth,
1769                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1770                               colwidth,
1771                               PL_colors[0], PL_colors[1],
1772                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1773                               | PERL_PV_ESCAPE_FIRSTCHAR
1774                     ) ,
1775                     TRIE_LIST_ITEM(state,charid).forid,
1776                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1777                 );
1778                 if (!(charid % 10))
1779                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1780                         (int)((depth * 2) + 14), "");
1781             }
1782         }
1783         PerlIO_printf( Perl_debug_log, "\n");
1784     }
1785 }
1786
1787 /*
1788   Dumps a fully constructed but uncompressed trie in table form.
1789   This is the normal DFA style state transition table, with a few
1790   twists to facilitate compression later.
1791   Used for debugging make_trie().
1792 */
1793 STATIC void
1794 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1795                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1796                           U32 depth)
1797 {
1798     U32 state;
1799     U16 charid;
1800     SV *sv=sv_newmortal();
1801     int colwidth= widecharmap ? 6 : 4;
1802     GET_RE_DEBUG_FLAGS_DECL;
1803
1804     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1805
1806     /*
1807        print out the table precompression so that we can do a visual check
1808        that they are identical.
1809      */
1810
1811     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1812
1813     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1814         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1815         if ( tmp ) {
1816             PerlIO_printf( Perl_debug_log, "%*s",
1817                 colwidth,
1818                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1819                             PL_colors[0], PL_colors[1],
1820                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1821                             PERL_PV_ESCAPE_FIRSTCHAR
1822                 )
1823             );
1824         }
1825     }
1826
1827     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1828
1829     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1830         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1831     }
1832
1833     PerlIO_printf( Perl_debug_log, "\n" );
1834
1835     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1836
1837         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1838             (int)depth * 2 + 2,"",
1839             (UV)TRIE_NODENUM( state ) );
1840
1841         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1842             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1843             if (v)
1844                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1845             else
1846                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1847         }
1848         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1849             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1850                                             (UV)trie->trans[ state ].check );
1851         } else {
1852             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1853                                             (UV)trie->trans[ state ].check,
1854             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1855         }
1856     }
1857 }
1858
1859 #endif
1860
1861
1862 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1863   startbranch: the first branch in the whole branch sequence
1864   first      : start branch of sequence of branch-exact nodes.
1865                May be the same as startbranch
1866   last       : Thing following the last branch.
1867                May be the same as tail.
1868   tail       : item following the branch sequence
1869   count      : words in the sequence
1870   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1871   depth      : indent depth
1872
1873 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1874
1875 A trie is an N'ary tree where the branches are determined by digital
1876 decomposition of the key. IE, at the root node you look up the 1st character and
1877 follow that branch repeat until you find the end of the branches. Nodes can be
1878 marked as "accepting" meaning they represent a complete word. Eg:
1879
1880   /he|she|his|hers/
1881
1882 would convert into the following structure. Numbers represent states, letters
1883 following numbers represent valid transitions on the letter from that state, if
1884 the number is in square brackets it represents an accepting state, otherwise it
1885 will be in parenthesis.
1886
1887       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1888       |    |
1889       |   (2)
1890       |    |
1891      (1)   +-i->(6)-+-s->[7]
1892       |
1893       +-s->(3)-+-h->(4)-+-e->[5]
1894
1895       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1896
1897 This shows that when matching against the string 'hers' we will begin at state 1
1898 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1899 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1900 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1901 single traverse. We store a mapping from accepting to state to which word was
1902 matched, and then when we have multiple possibilities we try to complete the
1903 rest of the regex in the order in which they occured in the alternation.
1904
1905 The only prior NFA like behaviour that would be changed by the TRIE support is
1906 the silent ignoring of duplicate alternations which are of the form:
1907
1908  / (DUPE|DUPE) X? (?{ ... }) Y /x
1909
1910 Thus EVAL blocks following a trie may be called a different number of times with
1911 and without the optimisation. With the optimisations dupes will be silently
1912 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1913 the following demonstrates:
1914
1915  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1916
1917 which prints out 'word' three times, but
1918
1919  'words'=~/(word|word|word)(?{ print $1 })S/
1920
1921 which doesnt print it out at all. This is due to other optimisations kicking in.
1922
1923 Example of what happens on a structural level:
1924
1925 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1926
1927    1: CURLYM[1] {1,32767}(18)
1928    5:   BRANCH(8)
1929    6:     EXACT <ac>(16)
1930    8:   BRANCH(11)
1931    9:     EXACT <ad>(16)
1932   11:   BRANCH(14)
1933   12:     EXACT <ab>(16)
1934   16:   SUCCEED(0)
1935   17:   NOTHING(18)
1936   18: END(0)
1937
1938 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1939 and should turn into:
1940
1941    1: CURLYM[1] {1,32767}(18)
1942    5:   TRIE(16)
1943         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1944           <ac>
1945           <ad>
1946           <ab>
1947   16:   SUCCEED(0)
1948   17:   NOTHING(18)
1949   18: END(0)
1950
1951 Cases where tail != last would be like /(?foo|bar)baz/:
1952
1953    1: BRANCH(4)
1954    2:   EXACT <foo>(8)
1955    4: BRANCH(7)
1956    5:   EXACT <bar>(8)
1957    7: TAIL(8)
1958    8: EXACT <baz>(10)
1959   10: END(0)
1960
1961 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1962 and would end up looking like:
1963
1964     1: TRIE(8)
1965       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1966         <foo>
1967         <bar>
1968    7: TAIL(8)
1969    8: EXACT <baz>(10)
1970   10: END(0)
1971
1972     d = uvchr_to_utf8_flags(d, uv, 0);
1973
1974 is the recommended Unicode-aware way of saying
1975
1976     *(d++) = uv;
1977 */
1978
1979 #define TRIE_STORE_REVCHAR(val)                                            \
1980     STMT_START {                                                           \
1981         if (UTF) {                                                         \
1982             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1983             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1984             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1985             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1986             SvPOK_on(zlopp);                                               \
1987             SvUTF8_on(zlopp);                                              \
1988             av_push(revcharmap, zlopp);                                    \
1989         } else {                                                           \
1990             char ooooff = (char)val;                                           \
1991             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1992         }                                                                  \
1993         } STMT_END
1994
1995 /* This gets the next character from the input, folding it if not already
1996  * folded. */
1997 #define TRIE_READ_CHAR STMT_START {                                           \
1998     wordlen++;                                                                \
1999     if ( UTF ) {                                                              \
2000         /* if it is UTF then it is either already folded, or does not need    \
2001          * folding */                                                         \
2002         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2003     }                                                                         \
2004     else if (folder == PL_fold_latin1) {                                      \
2005         /* This folder implies Unicode rules, which in the range expressible  \
2006          *  by not UTF is the lower case, with the two exceptions, one of     \
2007          *  which should have been taken care of before calling this */       \
2008         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2009         uvc = toLOWER_L1(*uc);                                                \
2010         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2011         len = 1;                                                              \
2012     } else {                                                                  \
2013         /* raw data, will be folded later if needed */                        \
2014         uvc = (U32)*uc;                                                       \
2015         len = 1;                                                              \
2016     }                                                                         \
2017 } STMT_END
2018
2019
2020
2021 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2022     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2023         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2024         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2025     }                                                           \
2026     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2027     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2028     TRIE_LIST_CUR( state )++;                                   \
2029 } STMT_END
2030
2031 #define TRIE_LIST_NEW(state) STMT_START {                       \
2032     Newxz( trie->states[ state ].trans.list,               \
2033         4, reg_trie_trans_le );                                 \
2034      TRIE_LIST_CUR( state ) = 1;                                \
2035      TRIE_LIST_LEN( state ) = 4;                                \
2036 } STMT_END
2037
2038 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2039     U16 dupe= trie->states[ state ].wordnum;                    \
2040     regnode * const noper_next = regnext( noper );              \
2041                                                                 \
2042     DEBUG_r({                                                   \
2043         /* store the word for dumping */                        \
2044         SV* tmp;                                                \
2045         if (OP(noper) != NOTHING)                               \
2046             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2047         else                                                    \
2048             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2049         av_push( trie_words, tmp );                             \
2050     });                                                         \
2051                                                                 \
2052     curword++;                                                  \
2053     trie->wordinfo[curword].prev   = 0;                         \
2054     trie->wordinfo[curword].len    = wordlen;                   \
2055     trie->wordinfo[curword].accept = state;                     \
2056                                                                 \
2057     if ( noper_next < tail ) {                                  \
2058         if (!trie->jump)                                        \
2059             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2060                                                  sizeof(U16) ); \
2061         trie->jump[curword] = (U16)(noper_next - convert);      \
2062         if (!jumper)                                            \
2063             jumper = noper_next;                                \
2064         if (!nextbranch)                                        \
2065             nextbranch= regnext(cur);                           \
2066     }                                                           \
2067                                                                 \
2068     if ( dupe ) {                                               \
2069         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2070         /* chain, so that when the bits of chain are later    */\
2071         /* linked together, the dups appear in the chain      */\
2072         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2073         trie->wordinfo[dupe].prev = curword;                    \
2074     } else {                                                    \
2075         /* we haven't inserted this word yet.                */ \
2076         trie->states[ state ].wordnum = curword;                \
2077     }                                                           \
2078 } STMT_END
2079
2080
2081 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2082      ( ( base + charid >=  ucharcount                                   \
2083          && base + charid < ubound                                      \
2084          && state == trie->trans[ base - ucharcount + charid ].check    \
2085          && trie->trans[ base - ucharcount + charid ].next )            \
2086            ? trie->trans[ base - ucharcount + charid ].next             \
2087            : ( state==1 ? special : 0 )                                 \
2088       )
2089
2090 #define MADE_TRIE       1
2091 #define MADE_JUMP_TRIE  2
2092 #define MADE_EXACT_TRIE 4
2093
2094 STATIC I32
2095 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2096                   regnode *first, regnode *last, regnode *tail,
2097                   U32 word_count, U32 flags, U32 depth)
2098 {
2099     /* first pass, loop through and scan words */
2100     reg_trie_data *trie;
2101     HV *widecharmap = NULL;
2102     AV *revcharmap = newAV();
2103     regnode *cur;
2104     STRLEN len = 0;
2105     UV uvc = 0;
2106     U16 curword = 0;
2107     U32 next_alloc = 0;
2108     regnode *jumper = NULL;
2109     regnode *nextbranch = NULL;
2110     regnode *convert = NULL;
2111     U32 *prev_states; /* temp array mapping each state to previous one */
2112     /* we just use folder as a flag in utf8 */
2113     const U8 * folder = NULL;
2114
2115 #ifdef DEBUGGING
2116     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2117     AV *trie_words = NULL;
2118     /* along with revcharmap, this only used during construction but both are
2119      * useful during debugging so we store them in the struct when debugging.
2120      */
2121 #else
2122     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2123     STRLEN trie_charcount=0;
2124 #endif
2125     SV *re_trie_maxbuff;
2126     GET_RE_DEBUG_FLAGS_DECL;
2127
2128     PERL_ARGS_ASSERT_MAKE_TRIE;
2129 #ifndef DEBUGGING
2130     PERL_UNUSED_ARG(depth);
2131 #endif
2132
2133     switch (flags) {
2134         case EXACT: break;
2135         case EXACTFA:
2136         case EXACTFU_SS:
2137         case EXACTFU: folder = PL_fold_latin1; break;
2138         case EXACTF:  folder = PL_fold; break;
2139         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2140     }
2141
2142     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2143     trie->refcount = 1;
2144     trie->startstate = 1;
2145     trie->wordcount = word_count;
2146     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2147     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2148     if (flags == EXACT)
2149         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2150     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2151                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2152
2153     DEBUG_r({
2154         trie_words = newAV();
2155     });
2156
2157     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2158     assert(re_trie_maxbuff);
2159     if (!SvIOK(re_trie_maxbuff)) {
2160         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2161     }
2162     DEBUG_TRIE_COMPILE_r({
2163         PerlIO_printf( Perl_debug_log,
2164           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2165           (int)depth * 2 + 2, "",
2166           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2167           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2168     });
2169
2170    /* Find the node we are going to overwrite */
2171     if ( first == startbranch && OP( last ) != BRANCH ) {
2172         /* whole branch chain */
2173         convert = first;
2174     } else {
2175         /* branch sub-chain */
2176         convert = NEXTOPER( first );
2177     }
2178
2179     /*  -- First loop and Setup --
2180
2181        We first traverse the branches and scan each word to determine if it
2182        contains widechars, and how many unique chars there are, this is
2183        important as we have to build a table with at least as many columns as we
2184        have unique chars.
2185
2186        We use an array of integers to represent the character codes 0..255
2187        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2188        the native representation of the character value as the key and IV's for
2189        the coded index.
2190
2191        *TODO* If we keep track of how many times each character is used we can
2192        remap the columns so that the table compression later on is more
2193        efficient in terms of memory by ensuring the most common value is in the
2194        middle and the least common are on the outside.  IMO this would be better
2195        than a most to least common mapping as theres a decent chance the most
2196        common letter will share a node with the least common, meaning the node
2197        will not be compressible. With a middle is most common approach the worst
2198        case is when we have the least common nodes twice.
2199
2200      */
2201
2202     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2203         regnode *noper = NEXTOPER( cur );
2204         const U8 *uc = (U8*)STRING( noper );
2205         const U8 *e  = uc + STR_LEN( noper );
2206         int foldlen = 0;
2207         U32 wordlen      = 0;         /* required init */
2208         STRLEN minchars = 0;
2209         STRLEN maxchars = 0;
2210         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2211                                                bitmap?*/
2212
2213         if (OP(noper) == NOTHING) {
2214             regnode *noper_next= regnext(noper);
2215             if (noper_next != tail && OP(noper_next) == flags) {
2216                 noper = noper_next;
2217                 uc= (U8*)STRING(noper);
2218                 e= uc + STR_LEN(noper);
2219                 trie->minlen= STR_LEN(noper);
2220             } else {
2221                 trie->minlen= 0;
2222                 continue;
2223             }
2224         }
2225
2226         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2227             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2228                                           regardless of encoding */
2229             if (OP( noper ) == EXACTFU_SS) {
2230                 /* false positives are ok, so just set this */
2231                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2232             }
2233         }
2234         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2235                                            branch */
2236             TRIE_CHARCOUNT(trie)++;
2237             TRIE_READ_CHAR;
2238
2239             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2240              * is in effect.  Under /i, this character can match itself, or
2241              * anything that folds to it.  If not under /i, it can match just
2242              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2243              * all fold to k, and all are single characters.   But some folds
2244              * expand to more than one character, so for example LATIN SMALL
2245              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2246              * the string beginning at 'uc' is 'ffi', it could be matched by
2247              * three characters, or just by the one ligature character. (It
2248              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2249              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2250              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2251              * match.)  The trie needs to know the minimum and maximum number
2252              * of characters that could match so that it can use size alone to
2253              * quickly reject many match attempts.  The max is simple: it is
2254              * the number of folded characters in this branch (since a fold is
2255              * never shorter than what folds to it. */
2256
2257             maxchars++;
2258
2259             /* And the min is equal to the max if not under /i (indicated by
2260              * 'folder' being NULL), or there are no multi-character folds.  If
2261              * there is a multi-character fold, the min is incremented just
2262              * once, for the character that folds to the sequence.  Each
2263              * character in the sequence needs to be added to the list below of
2264              * characters in the trie, but we count only the first towards the
2265              * min number of characters needed.  This is done through the
2266              * variable 'foldlen', which is returned by the macros that look
2267              * for these sequences as the number of bytes the sequence
2268              * occupies.  Each time through the loop, we decrement 'foldlen' by
2269              * how many bytes the current char occupies.  Only when it reaches
2270              * 0 do we increment 'minchars' or look for another multi-character
2271              * sequence. */
2272             if (folder == NULL) {
2273                 minchars++;
2274             }
2275             else if (foldlen > 0) {
2276                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2277             }
2278             else {
2279                 minchars++;
2280
2281                 /* See if *uc is the beginning of a multi-character fold.  If
2282                  * so, we decrement the length remaining to look at, to account
2283                  * for the current character this iteration.  (We can use 'uc'
2284                  * instead of the fold returned by TRIE_READ_CHAR because for
2285                  * non-UTF, the latin1_safe macro is smart enough to account
2286                  * for all the unfolded characters, and because for UTF, the
2287                  * string will already have been folded earlier in the
2288                  * compilation process */
2289                 if (UTF) {
2290                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2291                         foldlen -= UTF8SKIP(uc);
2292                     }
2293                 }
2294                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2295                     foldlen--;
2296                 }
2297             }
2298
2299             /* The current character (and any potential folds) should be added
2300              * to the possible matching characters for this position in this
2301              * branch */
2302             if ( uvc < 256 ) {
2303                 if ( folder ) {
2304                     U8 folded= folder[ (U8) uvc ];
2305                     if ( !trie->charmap[ folded ] ) {
2306                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2307                         TRIE_STORE_REVCHAR( folded );
2308                     }
2309                 }
2310                 if ( !trie->charmap[ uvc ] ) {
2311                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2312                     TRIE_STORE_REVCHAR( uvc );
2313                 }
2314                 if ( set_bit ) {
2315                     /* store the codepoint in the bitmap, and its folded
2316                      * equivalent. */
2317                     TRIE_BITMAP_SET(trie, uvc);
2318
2319                     /* store the folded codepoint */
2320                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2321
2322                     if ( !UTF ) {
2323                         /* store first byte of utf8 representation of
2324                            variant codepoints */
2325                         if (! UVCHR_IS_INVARIANT(uvc)) {
2326                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2327                         }
2328                     }
2329                     set_bit = 0; /* We've done our bit :-) */
2330                 }
2331             } else {
2332
2333                 /* XXX We could come up with the list of code points that fold
2334                  * to this using PL_utf8_foldclosures, except not for
2335                  * multi-char folds, as there may be multiple combinations
2336                  * there that could work, which needs to wait until runtime to
2337                  * resolve (The comment about LIGATURE FFI above is such an
2338                  * example */
2339
2340                 SV** svpp;
2341                 if ( !widecharmap )
2342                     widecharmap = newHV();
2343
2344                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2345
2346                 if ( !svpp )
2347                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2348
2349                 if ( !SvTRUE( *svpp ) ) {
2350                     sv_setiv( *svpp, ++trie->uniquecharcount );
2351                     TRIE_STORE_REVCHAR(uvc);
2352                 }
2353             }
2354         } /* end loop through characters in this branch of the trie */
2355
2356         /* We take the min and max for this branch and combine to find the min
2357          * and max for all branches processed so far */
2358         if( cur == first ) {
2359             trie->minlen = minchars;
2360             trie->maxlen = maxchars;
2361         } else if (minchars < trie->minlen) {
2362             trie->minlen = minchars;
2363         } else if (maxchars > trie->maxlen) {
2364             trie->maxlen = maxchars;
2365         }
2366     } /* end first pass */
2367     DEBUG_TRIE_COMPILE_r(
2368         PerlIO_printf( Perl_debug_log,
2369                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2370                 (int)depth * 2 + 2,"",
2371                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2372                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2373                 (int)trie->minlen, (int)trie->maxlen )
2374     );
2375
2376     /*
2377         We now know what we are dealing with in terms of unique chars and
2378         string sizes so we can calculate how much memory a naive
2379         representation using a flat table  will take. If it's over a reasonable
2380         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2381         conservative but potentially much slower representation using an array
2382         of lists.
2383
2384         At the end we convert both representations into the same compressed
2385         form that will be used in regexec.c for matching with. The latter
2386         is a form that cannot be used to construct with but has memory
2387         properties similar to the list form and access properties similar
2388         to the table form making it both suitable for fast searches and
2389         small enough that its feasable to store for the duration of a program.
2390
2391         See the comment in the code where the compressed table is produced
2392         inplace from the flat tabe representation for an explanation of how
2393         the compression works.
2394
2395     */
2396
2397
2398     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2399     prev_states[1] = 0;
2400
2401     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2402                                                     > SvIV(re_trie_maxbuff) )
2403     {
2404         /*
2405             Second Pass -- Array Of Lists Representation
2406
2407             Each state will be represented by a list of charid:state records
2408             (reg_trie_trans_le) the first such element holds the CUR and LEN
2409             points of the allocated array. (See defines above).
2410
2411             We build the initial structure using the lists, and then convert
2412             it into the compressed table form which allows faster lookups
2413             (but cant be modified once converted).
2414         */
2415
2416         STRLEN transcount = 1;
2417
2418         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2419             "%*sCompiling trie using list compiler\n",
2420             (int)depth * 2 + 2, ""));
2421
2422         trie->states = (reg_trie_state *)
2423             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2424                                   sizeof(reg_trie_state) );
2425         TRIE_LIST_NEW(1);
2426         next_alloc = 2;
2427
2428         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2429
2430             regnode *noper   = NEXTOPER( cur );
2431             U8 *uc           = (U8*)STRING( noper );
2432             const U8 *e      = uc + STR_LEN( noper );
2433             U32 state        = 1;         /* required init */
2434             U16 charid       = 0;         /* sanity init */
2435             U32 wordlen      = 0;         /* required init */
2436
2437             if (OP(noper) == NOTHING) {
2438                 regnode *noper_next= regnext(noper);
2439                 if (noper_next != tail && OP(noper_next) == flags) {
2440                     noper = noper_next;
2441                     uc= (U8*)STRING(noper);
2442                     e= uc + STR_LEN(noper);
2443                 }
2444             }
2445
2446             if (OP(noper) != NOTHING) {
2447                 for ( ; uc < e ; uc += len ) {
2448
2449                     TRIE_READ_CHAR;
2450
2451                     if ( uvc < 256 ) {
2452                         charid = trie->charmap[ uvc ];
2453                     } else {
2454                         SV** const svpp = hv_fetch( widecharmap,
2455                                                     (char*)&uvc,
2456                                                     sizeof( UV ),
2457                                                     0);
2458                         if ( !svpp ) {
2459                             charid = 0;
2460                         } else {
2461                             charid=(U16)SvIV( *svpp );
2462                         }
2463                     }
2464                     /* charid is now 0 if we dont know the char read, or
2465                      * nonzero if we do */
2466                     if ( charid ) {
2467
2468                         U16 check;
2469                         U32 newstate = 0;
2470
2471                         charid--;
2472                         if ( !trie->states[ state ].trans.list ) {
2473                             TRIE_LIST_NEW( state );
2474                         }
2475                         for ( check = 1;
2476                               check <= TRIE_LIST_USED( state );
2477                               check++ )
2478                         {
2479                             if ( TRIE_LIST_ITEM( state, check ).forid
2480                                                                     == charid )
2481                             {
2482                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2483                                 break;
2484                             }
2485                         }
2486                         if ( ! newstate ) {
2487                             newstate = next_alloc++;
2488                             prev_states[newstate] = state;
2489                             TRIE_LIST_PUSH( state, charid, newstate );
2490                             transcount++;
2491                         }
2492                         state = newstate;
2493                     } else {
2494                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2495                     }
2496                 }
2497             }
2498             TRIE_HANDLE_WORD(state);
2499
2500         } /* end second pass */
2501
2502         /* next alloc is the NEXT state to be allocated */
2503         trie->statecount = next_alloc;
2504         trie->states = (reg_trie_state *)
2505             PerlMemShared_realloc( trie->states,
2506                                    next_alloc
2507                                    * sizeof(reg_trie_state) );
2508
2509         /* and now dump it out before we compress it */
2510         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2511                                                          revcharmap, next_alloc,
2512                                                          depth+1)
2513         );
2514
2515         trie->trans = (reg_trie_trans *)
2516             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2517         {
2518             U32 state;
2519             U32 tp = 0;
2520             U32 zp = 0;
2521
2522
2523             for( state=1 ; state < next_alloc ; state ++ ) {
2524                 U32 base=0;
2525
2526                 /*
2527                 DEBUG_TRIE_COMPILE_MORE_r(
2528                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2529                 );
2530                 */
2531
2532                 if (trie->states[state].trans.list) {
2533                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2534                     U16 maxid=minid;
2535                     U16 idx;
2536
2537                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2538                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2539                         if ( forid < minid ) {
2540                             minid=forid;
2541                         } else if ( forid > maxid ) {
2542                             maxid=forid;
2543                         }
2544                     }
2545                     if ( transcount < tp + maxid - minid + 1) {
2546                         transcount *= 2;
2547                         trie->trans = (reg_trie_trans *)
2548                             PerlMemShared_realloc( trie->trans,
2549                                                      transcount
2550                                                      * sizeof(reg_trie_trans) );
2551                         Zero( trie->trans + (transcount / 2),
2552                               transcount / 2,
2553                               reg_trie_trans );
2554                     }
2555                     base = trie->uniquecharcount + tp - minid;
2556                     if ( maxid == minid ) {
2557                         U32 set = 0;
2558                         for ( ; zp < tp ; zp++ ) {
2559                             if ( ! trie->trans[ zp ].next ) {
2560                                 base = trie->uniquecharcount + zp - minid;
2561                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2562                                                                    1).newstate;
2563                                 trie->trans[ zp ].check = state;
2564                                 set = 1;
2565                                 break;
2566                             }
2567                         }
2568                         if ( !set ) {
2569                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2570                                                                    1).newstate;
2571                             trie->trans[ tp ].check = state;
2572                             tp++;
2573                             zp = tp;
2574                         }
2575                     } else {
2576                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2577                             const U32 tid = base
2578                                            - trie->uniquecharcount
2579                                            + TRIE_LIST_ITEM( state, idx ).forid;
2580                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2581                                                                 idx ).newstate;
2582                             trie->trans[ tid ].check = state;
2583                         }
2584                         tp += ( maxid - minid + 1 );
2585                     }
2586                     Safefree(trie->states[ state ].trans.list);
2587                 }
2588                 /*
2589                 DEBUG_TRIE_COMPILE_MORE_r(
2590                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2591                 );
2592                 */
2593                 trie->states[ state ].trans.base=base;
2594             }
2595             trie->lasttrans = tp + 1;
2596         }
2597     } else {
2598         /*
2599            Second Pass -- Flat Table Representation.
2600
2601            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2602            each.  We know that we will need Charcount+1 trans at most to store
2603            the data (one row per char at worst case) So we preallocate both
2604            structures assuming worst case.
2605
2606            We then construct the trie using only the .next slots of the entry
2607            structs.
2608
2609            We use the .check field of the first entry of the node temporarily
2610            to make compression both faster and easier by keeping track of how
2611            many non zero fields are in the node.
2612
2613            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2614            transition.
2615
2616            There are two terms at use here: state as a TRIE_NODEIDX() which is
2617            a number representing the first entry of the node, and state as a
2618            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2619            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2620            if there are 2 entrys per node. eg:
2621
2622              A B       A B
2623           1. 2 4    1. 3 7
2624           2. 0 3    3. 0 5
2625           3. 0 0    5. 0 0
2626           4. 0 0    7. 0 0
2627
2628            The table is internally in the right hand, idx form. However as we
2629            also have to deal with the states array which is indexed by nodenum
2630            we have to use TRIE_NODENUM() to convert.
2631
2632         */
2633         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2634             "%*sCompiling trie using table compiler\n",
2635             (int)depth * 2 + 2, ""));
2636
2637         trie->trans = (reg_trie_trans *)
2638             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2639                                   * trie->uniquecharcount + 1,
2640                                   sizeof(reg_trie_trans) );
2641         trie->states = (reg_trie_state *)
2642             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2643                                   sizeof(reg_trie_state) );
2644         next_alloc = trie->uniquecharcount + 1;
2645
2646
2647         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2648
2649             regnode *noper   = NEXTOPER( cur );
2650             const U8 *uc     = (U8*)STRING( noper );
2651             const U8 *e      = uc + STR_LEN( noper );
2652
2653             U32 state        = 1;         /* required init */
2654
2655             U16 charid       = 0;         /* sanity init */
2656             U32 accept_state = 0;         /* sanity init */
2657
2658             U32 wordlen      = 0;         /* required init */
2659
2660             if (OP(noper) == NOTHING) {
2661                 regnode *noper_next= regnext(noper);
2662                 if (noper_next != tail && OP(noper_next) == flags) {
2663                     noper = noper_next;
2664                     uc= (U8*)STRING(noper);
2665                     e= uc + STR_LEN(noper);
2666                 }
2667             }
2668
2669             if ( OP(noper) != NOTHING ) {
2670                 for ( ; uc < e ; uc += len ) {
2671
2672                     TRIE_READ_CHAR;
2673
2674                     if ( uvc < 256 ) {
2675                         charid = trie->charmap[ uvc ];
2676                     } else {
2677                         SV* const * const svpp = hv_fetch( widecharmap,
2678                                                            (char*)&uvc,
2679                                                            sizeof( UV ),
2680                                                            0);
2681                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2682                     }
2683                     if ( charid ) {
2684                         charid--;
2685                         if ( !trie->trans[ state + charid ].next ) {
2686                             trie->trans[ state + charid ].next = next_alloc;
2687                             trie->trans[ state ].check++;
2688                             prev_states[TRIE_NODENUM(next_alloc)]
2689                                     = TRIE_NODENUM(state);
2690                             next_alloc += trie->uniquecharcount;
2691                         }
2692                         state = trie->trans[ state + charid ].next;
2693                     } else {
2694                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2695                     }
2696                     /* charid is now 0 if we dont know the char read, or
2697                      * nonzero if we do */
2698                 }
2699             }
2700             accept_state = TRIE_NODENUM( state );
2701             TRIE_HANDLE_WORD(accept_state);
2702
2703         } /* end second pass */
2704
2705         /* and now dump it out before we compress it */
2706         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2707                                                           revcharmap,
2708                                                           next_alloc, depth+1));
2709
2710         {
2711         /*
2712            * Inplace compress the table.*
2713
2714            For sparse data sets the table constructed by the trie algorithm will
2715            be mostly 0/FAIL transitions or to put it another way mostly empty.
2716            (Note that leaf nodes will not contain any transitions.)
2717
2718            This algorithm compresses the tables by eliminating most such
2719            transitions, at the cost of a modest bit of extra work during lookup:
2720
2721            - Each states[] entry contains a .base field which indicates the
2722            index in the state[] array wheres its transition data is stored.
2723
2724            - If .base is 0 there are no valid transitions from that node.
2725
2726            - If .base is nonzero then charid is added to it to find an entry in
2727            the trans array.
2728
2729            -If trans[states[state].base+charid].check!=state then the
2730            transition is taken to be a 0/Fail transition. Thus if there are fail
2731            transitions at the front of the node then the .base offset will point
2732            somewhere inside the previous nodes data (or maybe even into a node
2733            even earlier), but the .check field determines if the transition is
2734            valid.
2735
2736            XXX - wrong maybe?
2737            The following process inplace converts the table to the compressed
2738            table: We first do not compress the root node 1,and mark all its
2739            .check pointers as 1 and set its .base pointer as 1 as well. This
2740            allows us to do a DFA construction from the compressed table later,
2741            and ensures that any .base pointers we calculate later are greater
2742            than 0.
2743
2744            - We set 'pos' to indicate the first entry of the second node.
2745
2746            - We then iterate over the columns of the node, finding the first and
2747            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2748            and set the .check pointers accordingly, and advance pos
2749            appropriately and repreat for the next node. Note that when we copy
2750            the next pointers we have to convert them from the original
2751            NODEIDX form to NODENUM form as the former is not valid post
2752            compression.
2753
2754            - If a node has no transitions used we mark its base as 0 and do not
2755            advance the pos pointer.
2756
2757            - If a node only has one transition we use a second pointer into the
2758            structure to fill in allocated fail transitions from other states.
2759            This pointer is independent of the main pointer and scans forward
2760            looking for null transitions that are allocated to a state. When it
2761            finds one it writes the single transition into the "hole".  If the
2762            pointer doesnt find one the single transition is appended as normal.
2763
2764            - Once compressed we can Renew/realloc the structures to release the
2765            excess space.
2766
2767            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2768            specifically Fig 3.47 and the associated pseudocode.
2769
2770            demq
2771         */
2772         const U32 laststate = TRIE_NODENUM( next_alloc );
2773         U32 state, charid;
2774         U32 pos = 0, zp=0;
2775         trie->statecount = laststate;
2776
2777         for ( state = 1 ; state < laststate ; state++ ) {
2778             U8 flag = 0;
2779             const U32 stateidx = TRIE_NODEIDX( state );
2780             const U32 o_used = trie->trans[ stateidx ].check;
2781             U32 used = trie->trans[ stateidx ].check;
2782             trie->trans[ stateidx ].check = 0;
2783
2784             for ( charid = 0;
2785                   used && charid < trie->uniquecharcount;
2786                   charid++ )
2787             {
2788                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2789                     if ( trie->trans[ stateidx + charid ].next ) {
2790                         if (o_used == 1) {
2791                             for ( ; zp < pos ; zp++ ) {
2792                                 if ( ! trie->trans[ zp ].next ) {
2793                                     break;
2794                                 }
2795                             }
2796                             trie->states[ state ].trans.base
2797                                                     = zp
2798                                                       + trie->uniquecharcount
2799                                                       - charid ;
2800                             trie->trans[ zp ].next
2801                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2802                                                              + charid ].next );
2803                             trie->trans[ zp ].check = state;
2804                             if ( ++zp > pos ) pos = zp;
2805                             break;
2806                         }
2807                         used--;
2808                     }
2809                     if ( !flag ) {
2810                         flag = 1;
2811                         trie->states[ state ].trans.base
2812                                        = pos + trie->uniquecharcount - charid ;
2813                     }
2814                     trie->trans[ pos ].next
2815                         = SAFE_TRIE_NODENUM(
2816                                        trie->trans[ stateidx + charid ].next );
2817                     trie->trans[ pos ].check = state;
2818                     pos++;
2819                 }
2820             }
2821         }
2822         trie->lasttrans = pos + 1;
2823         trie->states = (reg_trie_state *)
2824             PerlMemShared_realloc( trie->states, laststate
2825                                    * sizeof(reg_trie_state) );
2826         DEBUG_TRIE_COMPILE_MORE_r(
2827             PerlIO_printf( Perl_debug_log,
2828                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2829                 (int)depth * 2 + 2,"",
2830                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2831                        + 1 ),
2832                 (IV)next_alloc,
2833                 (IV)pos,
2834                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2835             );
2836
2837         } /* end table compress */
2838     }
2839     DEBUG_TRIE_COMPILE_MORE_r(
2840             PerlIO_printf(Perl_debug_log,
2841                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2842                 (int)depth * 2 + 2, "",
2843                 (UV)trie->statecount,
2844                 (UV)trie->lasttrans)
2845     );
2846     /* resize the trans array to remove unused space */
2847     trie->trans = (reg_trie_trans *)
2848         PerlMemShared_realloc( trie->trans, trie->lasttrans
2849                                * sizeof(reg_trie_trans) );
2850
2851     {   /* Modify the program and insert the new TRIE node */
2852         U8 nodetype =(U8)(flags & 0xFF);
2853         char *str=NULL;
2854
2855 #ifdef DEBUGGING
2856         regnode *optimize = NULL;
2857 #ifdef RE_TRACK_PATTERN_OFFSETS
2858
2859         U32 mjd_offset = 0;
2860         U32 mjd_nodelen = 0;
2861 #endif /* RE_TRACK_PATTERN_OFFSETS */
2862 #endif /* DEBUGGING */
2863         /*
2864            This means we convert either the first branch or the first Exact,
2865            depending on whether the thing following (in 'last') is a branch
2866            or not and whther first is the startbranch (ie is it a sub part of
2867            the alternation or is it the whole thing.)
2868            Assuming its a sub part we convert the EXACT otherwise we convert
2869            the whole branch sequence, including the first.
2870          */
2871         /* Find the node we are going to overwrite */
2872         if ( first != startbranch || OP( last ) == BRANCH ) {
2873             /* branch sub-chain */
2874             NEXT_OFF( first ) = (U16)(last - first);
2875 #ifdef RE_TRACK_PATTERN_OFFSETS
2876             DEBUG_r({
2877                 mjd_offset= Node_Offset((convert));
2878                 mjd_nodelen= Node_Length((convert));
2879             });
2880 #endif
2881             /* whole branch chain */
2882         }
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2884         else {
2885             DEBUG_r({
2886                 const  regnode *nop = NEXTOPER( convert );
2887                 mjd_offset= Node_Offset((nop));
2888                 mjd_nodelen= Node_Length((nop));
2889             });
2890         }
2891         DEBUG_OPTIMISE_r(
2892             PerlIO_printf(Perl_debug_log,
2893                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2894                 (int)depth * 2 + 2, "",
2895                 (UV)mjd_offset, (UV)mjd_nodelen)
2896         );
2897 #endif
2898         /* But first we check to see if there is a common prefix we can
2899            split out as an EXACT and put in front of the TRIE node.  */
2900         trie->startstate= 1;
2901         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2902             U32 state;
2903             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2904                 U32 ofs = 0;
2905                 I32 idx = -1;
2906                 U32 count = 0;
2907                 const U32 base = trie->states[ state ].trans.base;
2908
2909                 if ( trie->states[state].wordnum )
2910                         count = 1;
2911
2912                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2913                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2914                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2915                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2916                     {
2917                         if ( ++count > 1 ) {
2918                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2919                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2920                             if ( state == 1 ) break;
2921                             if ( count == 2 ) {
2922                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2923                                 DEBUG_OPTIMISE_r(
2924                                     PerlIO_printf(Perl_debug_log,
2925                                         "%*sNew Start State=%"UVuf" Class: [",
2926                                         (int)depth * 2 + 2, "",
2927                                         (UV)state));
2928                                 if (idx >= 0) {
2929                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2930                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2931
2932                                     TRIE_BITMAP_SET(trie,*ch);
2933                                     if ( folder )
2934                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2935                                     DEBUG_OPTIMISE_r(
2936                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2937                                     );
2938                                 }
2939                             }
2940                             TRIE_BITMAP_SET(trie,*ch);
2941                             if ( folder )
2942                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2943                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2944                         }
2945                         idx = ofs;
2946                     }
2947                 }
2948                 if ( count == 1 ) {
2949                     SV **tmp = av_fetch( revcharmap, idx, 0);
2950                     STRLEN len;
2951                     char *ch = SvPV( *tmp, len );
2952                     DEBUG_OPTIMISE_r({
2953                         SV *sv=sv_newmortal();
2954                         PerlIO_printf( Perl_debug_log,
2955                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2956                             (int)depth * 2 + 2, "",
2957                             (UV)state, (UV)idx,
2958                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2959                                 PL_colors[0], PL_colors[1],
2960                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2961                                 PERL_PV_ESCAPE_FIRSTCHAR
2962                             )
2963                         );
2964                     });
2965                     if ( state==1 ) {
2966                         OP( convert ) = nodetype;
2967                         str=STRING(convert);
2968                         STR_LEN(convert)=0;
2969                     }
2970                     STR_LEN(convert) += len;
2971                     while (len--)
2972                         *str++ = *ch++;
2973                 } else {
2974 #ifdef DEBUGGING
2975                     if (state>1)
2976                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2977 #endif
2978                     break;
2979                 }
2980             }
2981             trie->prefixlen = (state-1);
2982             if (str) {
2983                 regnode *n = convert+NODE_SZ_STR(convert);
2984                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2985                 trie->startstate = state;
2986                 trie->minlen -= (state - 1);
2987                 trie->maxlen -= (state - 1);
2988 #ifdef DEBUGGING
2989                /* At least the UNICOS C compiler choked on this
2990                 * being argument to DEBUG_r(), so let's just have
2991                 * it right here. */
2992                if (
2993 #ifdef PERL_EXT_RE_BUILD
2994                    1
2995 #else
2996                    DEBUG_r_TEST
2997 #endif
2998                    ) {
2999                    regnode *fix = convert;
3000                    U32 word = trie->wordcount;
3001                    mjd_nodelen++;
3002                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3003                    while( ++fix < n ) {
3004                        Set_Node_Offset_Length(fix, 0, 0);
3005                    }
3006                    while (word--) {
3007                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3008                        if (tmp) {
3009                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3010                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3011                            else
3012                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3013                        }
3014                    }
3015                }
3016 #endif
3017                 if (trie->maxlen) {
3018                     convert = n;
3019                 } else {
3020                     NEXT_OFF(convert) = (U16)(tail - convert);
3021                     DEBUG_r(optimize= n);
3022                 }
3023             }
3024         }
3025         if (!jumper)
3026             jumper = last;
3027         if ( trie->maxlen ) {
3028             NEXT_OFF( convert ) = (U16)(tail - convert);
3029             ARG_SET( convert, data_slot );
3030             /* Store the offset to the first unabsorbed branch in
3031                jump[0], which is otherwise unused by the jump logic.
3032                We use this when dumping a trie and during optimisation. */
3033             if (trie->jump)
3034                 trie->jump[0] = (U16)(nextbranch - convert);
3035
3036             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3037              *   and there is a bitmap
3038              *   and the first "jump target" node we found leaves enough room
3039              * then convert the TRIE node into a TRIEC node, with the bitmap
3040              * embedded inline in the opcode - this is hypothetically faster.
3041              */
3042             if ( !trie->states[trie->startstate].wordnum
3043                  && trie->bitmap
3044                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3045             {
3046                 OP( convert ) = TRIEC;
3047                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3048                 PerlMemShared_free(trie->bitmap);
3049                 trie->bitmap= NULL;
3050             } else
3051                 OP( convert ) = TRIE;
3052
3053             /* store the type in the flags */
3054             convert->flags = nodetype;
3055             DEBUG_r({
3056             optimize = convert
3057                       + NODE_STEP_REGNODE
3058                       + regarglen[ OP( convert ) ];
3059             });
3060             /* XXX We really should free up the resource in trie now,
3061                    as we won't use them - (which resources?) dmq */
3062         }
3063         /* needed for dumping*/
3064         DEBUG_r(if (optimize) {
3065             regnode *opt = convert;
3066
3067             while ( ++opt < optimize) {
3068                 Set_Node_Offset_Length(opt,0,0);
3069             }
3070             /*
3071                 Try to clean up some of the debris left after the
3072                 optimisation.
3073              */
3074             while( optimize < jumper ) {
3075                 mjd_nodelen += Node_Length((optimize));
3076                 OP( optimize ) = OPTIMIZED;
3077                 Set_Node_Offset_Length(optimize,0,0);
3078                 optimize++;
3079             }
3080             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3081         });
3082     } /* end node insert */
3083
3084     /*  Finish populating the prev field of the wordinfo array.  Walk back
3085      *  from each accept state until we find another accept state, and if
3086      *  so, point the first word's .prev field at the second word. If the
3087      *  second already has a .prev field set, stop now. This will be the
3088      *  case either if we've already processed that word's accept state,
3089      *  or that state had multiple words, and the overspill words were
3090      *  already linked up earlier.
3091      */
3092     {
3093         U16 word;
3094         U32 state;
3095         U16 prev;
3096
3097         for (word=1; word <= trie->wordcount; word++) {
3098             prev = 0;
3099             if (trie->wordinfo[word].prev)
3100                 continue;
3101             state = trie->wordinfo[word].accept;
3102             while (state) {
3103                 state = prev_states[state];
3104                 if (!state)
3105                     break;
3106                 prev = trie->states[state].wordnum;
3107                 if (prev)
3108                     break;
3109             }
3110             trie->wordinfo[word].prev = prev;
3111         }
3112         Safefree(prev_states);
3113     }
3114
3115
3116     /* and now dump out the compressed format */
3117     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3118
3119     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3120 #ifdef DEBUGGING
3121     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3122     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3123 #else
3124     SvREFCNT_dec_NN(revcharmap);
3125 #endif
3126     return trie->jump
3127            ? MADE_JUMP_TRIE
3128            : trie->startstate>1
3129              ? MADE_EXACT_TRIE
3130              : MADE_TRIE;
3131 }
3132
3133 STATIC regnode *
3134 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3135 {
3136 /* The Trie is constructed and compressed now so we can build a fail array if
3137  * it's needed
3138
3139    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3140    3.32 in the
3141    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3142    Ullman 1985/88
3143    ISBN 0-201-10088-6
3144
3145    We find the fail state for each state in the trie, this state is the longest
3146    proper suffix of the current state's 'word' that is also a proper prefix of
3147    another word in our trie. State 1 represents the word '' and is thus the
3148    default fail state. This allows the DFA not to have to restart after its
3149    tried and failed a word at a given point, it simply continues as though it
3150    had been matching the other word in the first place.
3151    Consider
3152       'abcdgu'=~/abcdefg|cdgu/
3153    When we get to 'd' we are still matching the first word, we would encounter
3154    'g' which would fail, which would bring us to the state representing 'd' in
3155    the second word where we would try 'g' and succeed, proceeding to match
3156    'cdgu'.
3157  */
3158  /* add a fail transition */
3159     const U32 trie_offset = ARG(source);
3160     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3161     U32 *q;
3162     const U32 ucharcount = trie->uniquecharcount;
3163     const U32 numstates = trie->statecount;
3164     const U32 ubound = trie->lasttrans + ucharcount;
3165     U32 q_read = 0;
3166     U32 q_write = 0;
3167     U32 charid;
3168     U32 base = trie->states[ 1 ].trans.base;
3169     U32 *fail;
3170     reg_ac_data *aho;
3171     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3172     regnode *stclass;
3173     GET_RE_DEBUG_FLAGS_DECL;
3174
3175     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3176     PERL_UNUSED_CONTEXT;
3177 #ifndef DEBUGGING
3178     PERL_UNUSED_ARG(depth);
3179 #endif
3180
3181     if ( OP(source) == TRIE ) {
3182         struct regnode_1 *op = (struct regnode_1 *)
3183             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3184         StructCopy(source,op,struct regnode_1);
3185         stclass = (regnode *)op;
3186     } else {
3187         struct regnode_charclass *op = (struct regnode_charclass *)
3188             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3189         StructCopy(source,op,struct regnode_charclass);
3190         stclass = (regnode *)op;
3191     }
3192     OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3193
3194     ARG_SET( stclass, data_slot );
3195     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3196     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3197     aho->trie=trie_offset;
3198     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3199     Copy( trie->states, aho->states, numstates, reg_trie_state );
3200     Newxz( q, numstates, U32);
3201     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3202     aho->refcount = 1;
3203     fail = aho->fail;
3204     /* initialize fail[0..1] to be 1 so that we always have
3205        a valid final fail state */
3206     fail[ 0 ] = fail[ 1 ] = 1;
3207
3208     for ( charid = 0; charid < ucharcount ; charid++ ) {
3209         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3210         if ( newstate ) {
3211             q[ q_write ] = newstate;
3212             /* set to point at the root */
3213             fail[ q[ q_write++ ] ]=1;
3214         }
3215     }
3216     while ( q_read < q_write) {
3217         const U32 cur = q[ q_read++ % numstates ];
3218         base = trie->states[ cur ].trans.base;
3219
3220         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3221             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3222             if (ch_state) {
3223                 U32 fail_state = cur;
3224                 U32 fail_base;
3225                 do {
3226                     fail_state = fail[ fail_state ];
3227                     fail_base = aho->states[ fail_state ].trans.base;
3228                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3229
3230                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3231                 fail[ ch_state ] = fail_state;
3232                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3233                 {
3234                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3235                 }
3236                 q[ q_write++ % numstates] = ch_state;
3237             }
3238         }
3239     }
3240     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3241        when we fail in state 1, this allows us to use the
3242        charclass scan to find a valid start char. This is based on the principle
3243        that theres a good chance the string being searched contains lots of stuff
3244        that cant be a start char.
3245      */
3246     fail[ 0 ] = fail[ 1 ] = 0;
3247     DEBUG_TRIE_COMPILE_r({
3248         PerlIO_printf(Perl_debug_log,
3249                       "%*sStclass Failtable (%"UVuf" states): 0",
3250                       (int)(depth * 2), "", (UV)numstates
3251         );
3252         for( q_read=1; q_read<numstates; q_read++ ) {
3253             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3254         }
3255         PerlIO_printf(Perl_debug_log, "\n");
3256     });
3257     Safefree(q);
3258     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3259     return stclass;
3260 }
3261
3262
3263 #define DEBUG_PEEP(str,scan,depth) \
3264     DEBUG_OPTIMISE_r({if (scan){ \
3265        regnode *Next = regnext(scan); \
3266        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3267        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3268            (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3269            Next ? (REG_NODE_NUM(Next)) : 0 ); \
3270        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3271        PerlIO_printf(Perl_debug_log, "\n"); \
3272    }});
3273
3274 /* The below joins as many adjacent EXACTish nodes as possible into a single
3275  * one.  The regop may be changed if the node(s) contain certain sequences that
3276  * require special handling.  The joining is only done if:
3277  * 1) there is room in the current conglomerated node to entirely contain the
3278  *    next one.
3279  * 2) they are the exact same node type
3280  *
3281  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3282  * these get optimized out
3283  *
3284  * If a node is to match under /i (folded), the number of characters it matches
3285  * can be different than its character length if it contains a multi-character
3286  * fold.  *min_subtract is set to the total delta number of characters of the
3287  * input nodes.
3288  *
3289  * And *unfolded_multi_char is set to indicate whether or not the node contains
3290  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3291  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3292  * SMALL LETTER SHARP S, as only if the target string being matched against
3293  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3294  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3295  * whose components are all above the Latin1 range are not run-time locale
3296  * dependent, and have already been folded by the time this function is
3297  * called.)
3298  *
3299  * This is as good a place as any to discuss the design of handling these
3300  * multi-character fold sequences.  It's been wrong in Perl for a very long
3301  * time.  There are three code points in Unicode whose multi-character folds
3302  * were long ago discovered to mess things up.  The previous designs for
3303  * dealing with these involved assigning a special node for them.  This
3304  * approach doesn't always work, as evidenced by this example:
3305  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3306  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3307  * would match just the \xDF, it won't be able to handle the case where a
3308  * successful match would have to cross the node's boundary.  The new approach
3309  * that hopefully generally solves the problem generates an EXACTFU_SS node
3310  * that is "sss" in this case.
3311  *
3312  * It turns out that there are problems with all multi-character folds, and not
3313  * just these three.  Now the code is general, for all such cases.  The
3314  * approach taken is:
3315  * 1)   This routine examines each EXACTFish node that could contain multi-
3316  *      character folded sequences.  Since a single character can fold into
3317  *      such a sequence, the minimum match length for this node is less than
3318  *      the number of characters in the node.  This routine returns in
3319  *      *min_subtract how many characters to subtract from the the actual
3320  *      length of the string to get a real minimum match length; it is 0 if
3321  *      there are no multi-char foldeds.  This delta is used by the caller to
3322  *      adjust the min length of the match, and the delta between min and max,
3323  *      so that the optimizer doesn't reject these possibilities based on size
3324  *      constraints.
3325  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3326  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3327  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3328  *      there is a possible fold length change.  That means that a regular
3329  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3330  *      with length changes, and so can be processed faster.  regexec.c takes
3331  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3332  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3333  *      known until runtime).  This saves effort in regex matching.  However,
3334  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3335  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3336  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3337  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3338  *      possibilities for the non-UTF8 patterns are quite simple, except for
3339  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3340  *      members of a fold-pair, and arrays are set up for all of them so that
3341  *      the other member of the pair can be found quickly.  Code elsewhere in
3342  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3343  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3344  *      described in the next item.
3345  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3346  *      validity of the fold won't be known until runtime, and so must remain
3347  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3348  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3349  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3350  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3351  *      The reason this is a problem is that the optimizer part of regexec.c
3352  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3353  *      that a character in the pattern corresponds to at most a single
3354  *      character in the target string.  (And I do mean character, and not byte
3355  *      here, unlike other parts of the documentation that have never been
3356  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3357  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3358  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3359  *      nodes, violate the assumption, and they are the only instances where it
3360  *      is violated.  I'm reluctant to try to change the assumption, as the
3361  *      code involved is impenetrable to me (khw), so instead the code here
3362  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3363  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3364  *      boolean indicating whether or not the node contains such a fold.  When
3365  *      it is true, the caller sets a flag that later causes the optimizer in
3366  *      this file to not set values for the floating and fixed string lengths,
3367  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3368  *      assumption.  Thus, there is no optimization based on string lengths for
3369  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3370  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3371  *      assumption is wrong only in these cases is that all other non-UTF-8
3372  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3373  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3374  *      EXACTF nodes because we don't know at compile time if it actually
3375  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3376  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3377  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3378  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3379  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3380  *      string would require the pattern to be forced into UTF-8, the overhead
3381  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3382  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3383  *      locale.)
3384  *
3385  *      Similarly, the code that generates tries doesn't currently handle
3386  *      not-already-folded multi-char folds, and it looks like a pain to change
3387  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3388  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3389  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3390  *      using /iaa matching will be doing so almost entirely with ASCII
3391  *      strings, so this should rarely be encountered in practice */
3392
3393 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3394     if (PL_regkind[OP(scan)] == EXACT) \
3395         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3396
3397 STATIC U32
3398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3399                    UV *min_subtract, bool *unfolded_multi_char,
3400                    U32 flags,regnode *val, U32 depth)
3401 {
3402     /* Merge several consecutive EXACTish nodes into one. */
3403     regnode *n = regnext(scan);
3404     U32 stringok = 1;
3405     regnode *next = scan + NODE_SZ_STR(scan);
3406     U32 merged = 0;
3407     U32 stopnow = 0;
3408 #ifdef DEBUGGING
3409     regnode *stop = scan;
3410     GET_RE_DEBUG_FLAGS_DECL;
3411 #else
3412     PERL_UNUSED_ARG(depth);
3413 #endif
3414
3415     PERL_ARGS_ASSERT_JOIN_EXACT;
3416 #ifndef EXPERIMENTAL_INPLACESCAN
3417     PERL_UNUSED_ARG(flags);
3418     PERL_UNUSED_ARG(val);
3419 #endif
3420     DEBUG_PEEP("join",scan,depth);
3421
3422     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3423      * EXACT ones that are mergeable to the current one. */
3424     while (n
3425            && (PL_regkind[OP(n)] == NOTHING
3426                || (stringok && OP(n) == OP(scan)))
3427            && NEXT_OFF(n)
3428            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3429     {
3430
3431         if (OP(n) == TAIL || n > next)
3432             stringok = 0;
3433         if (PL_regkind[OP(n)] == NOTHING) {
3434             DEBUG_PEEP("skip:",n,depth);
3435             NEXT_OFF(scan) += NEXT_OFF(n);
3436             next = n + NODE_STEP_REGNODE;
3437 #ifdef DEBUGGING
3438             if (stringok)
3439                 stop = n;
3440 #endif
3441             n = regnext(n);
3442         }
3443         else if (stringok) {
3444             const unsigned int oldl = STR_LEN(scan);
3445             regnode * const nnext = regnext(n);
3446
3447             /* XXX I (khw) kind of doubt that this works on platforms (should
3448              * Perl ever run on one) where U8_MAX is above 255 because of lots
3449              * of other assumptions */
3450             /* Don't join if the sum can't fit into a single node */
3451             if (oldl + STR_LEN(n) > U8_MAX)
3452                 break;
3453
3454             DEBUG_PEEP("merg",n,depth);
3455             merged++;
3456
3457             NEXT_OFF(scan) += NEXT_OFF(n);
3458             STR_LEN(scan) += STR_LEN(n);
3459             next = n + NODE_SZ_STR(n);
3460             /* Now we can overwrite *n : */
3461             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3462 #ifdef DEBUGGING
3463             stop = next - 1;
3464 #endif
3465             n = nnext;
3466             if (stopnow) break;
3467         }
3468
3469 #ifdef EXPERIMENTAL_INPLACESCAN
3470         if (flags && !NEXT_OFF(n)) {
3471             DEBUG_PEEP("atch", val, depth);
3472             if (reg_off_by_arg[OP(n)]) {
3473                 ARG_SET(n, val - n);
3474             }
3475             else {
3476                 NEXT_OFF(n) = val - n;
3477             }
3478             stopnow = 1;
3479         }
3480 #endif
3481     }
3482
3483     *min_subtract = 0;
3484     *unfolded_multi_char = FALSE;
3485
3486     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3487      * can now analyze for sequences of problematic code points.  (Prior to
3488      * this final joining, sequences could have been split over boundaries, and
3489      * hence missed).  The sequences only happen in folding, hence for any
3490      * non-EXACT EXACTish node */
3491     if (OP(scan) != EXACT) {
3492         U8* s0 = (U8*) STRING(scan);
3493         U8* s = s0;
3494         U8* s_end = s0 + STR_LEN(scan);
3495
3496         int total_count_delta = 0;  /* Total delta number of characters that
3497                                        multi-char folds expand to */
3498
3499         /* One pass is made over the node's string looking for all the
3500          * possibilities.  To avoid some tests in the loop, there are two main
3501          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3502          * non-UTF-8 */
3503         if (UTF) {
3504             U8* folded = NULL;
3505
3506             if (OP(scan) == EXACTFL) {
3507                 U8 *d;
3508
3509                 /* An EXACTFL node would already have been changed to another
3510                  * node type unless there is at least one character in it that
3511                  * is problematic; likely a character whose fold definition
3512                  * won't be known until runtime, and so has yet to be folded.
3513                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3514                  * to handle the UTF-8 case, we need to create a temporary
3515                  * folded copy using UTF-8 locale rules in order to analyze it.
3516                  * This is because our macros that look to see if a sequence is
3517                  * a multi-char fold assume everything is folded (otherwise the
3518                  * tests in those macros would be too complicated and slow).
3519                  * Note that here, the non-problematic folds will have already
3520                  * been done, so we can just copy such characters.  We actually
3521                  * don't completely fold the EXACTFL string.  We skip the
3522                  * unfolded multi-char folds, as that would just create work
3523                  * below to figure out the size they already are */
3524
3525                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3526                 d = folded;
3527                 while (s < s_end) {
3528                     STRLEN s_len = UTF8SKIP(s);
3529                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3530                         Copy(s, d, s_len, U8);
3531                         d += s_len;
3532                     }
3533                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3534                         *unfolded_multi_char = TRUE;
3535                         Copy(s, d, s_len, U8);
3536                         d += s_len;
3537                     }
3538                     else if (isASCII(*s)) {
3539                         *(d++) = toFOLD(*s);
3540                     }
3541                     else {
3542                         STRLEN len;
3543                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3544                         d += len;
3545                     }
3546                     s += s_len;
3547                 }
3548
3549                 /* Point the remainder of the routine to look at our temporary
3550                  * folded copy */
3551                 s = folded;
3552                 s_end = d;
3553             } /* End of creating folded copy of EXACTFL string */
3554
3555             /* Examine the string for a multi-character fold sequence.  UTF-8
3556              * patterns have all characters pre-folded by the time this code is
3557              * executed */
3558             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3559                                      length sequence we are looking for is 2 */
3560             {
3561                 int count = 0;  /* How many characters in a multi-char fold */
3562                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3563                 if (! len) {    /* Not a multi-char fold: get next char */
3564                     s += UTF8SKIP(s);
3565                     continue;
3566                 }
3567
3568                 /* Nodes with 'ss' require special handling, except for
3569                  * EXACTFA-ish for which there is no multi-char fold to this */
3570                 if (len == 2 && *s == 's' && *(s+1) == 's'
3571                     && OP(scan) != EXACTFA
3572                     && OP(scan) != EXACTFA_NO_TRIE)
3573                 {
3574                     count = 2;
3575                     if (OP(scan) != EXACTFL) {
3576                         OP(scan) = EXACTFU_SS;
3577                     }
3578                     s += 2;
3579                 }
3580                 else { /* Here is a generic multi-char fold. */
3581                     U8* multi_end  = s + len;
3582
3583                     /* Count how many characters are in it.  In the case of
3584                      * /aa, no folds which contain ASCII code points are
3585                      * allowed, so check for those, and skip if found. */
3586                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3587                         count = utf8_length(s, multi_end);
3588                         s = multi_end;
3589                     }
3590                     else {
3591                         while (s < multi_end) {
3592                             if (isASCII(*s)) {
3593                                 s++;
3594                                 goto next_iteration;
3595                             }
3596                             else {
3597                                 s += UTF8SKIP(s);
3598                             }
3599                             count++;
3600                         }
3601                     }
3602                 }
3603
3604                 /* The delta is how long the sequence is minus 1 (1 is how long
3605                  * the character that folds to the sequence is) */
3606                 total_count_delta += count - 1;
3607               next_iteration: ;
3608             }
3609
3610             /* We created a temporary folded copy of the string in EXACTFL
3611              * nodes.  Therefore we need to be sure it doesn't go below zero,
3612              * as the real string could be shorter */
3613             if (OP(scan) == EXACTFL) {
3614                 int total_chars = utf8_length((U8*) STRING(scan),
3615                                            (U8*) STRING(scan) + STR_LEN(scan));
3616                 if (total_count_delta > total_chars) {
3617                     total_count_delta = total_chars;
3618                 }
3619             }
3620
3621             *min_subtract += total_count_delta;
3622             Safefree(folded);
3623         }
3624         else if (OP(scan) == EXACTFA) {
3625
3626             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3627              * fold to the ASCII range (and there are no existing ones in the
3628              * upper latin1 range).  But, as outlined in the comments preceding
3629              * this function, we need to flag any occurrences of the sharp s.
3630              * This character forbids trie formation (because of added
3631              * complexity) */
3632             while (s < s_end) {
3633                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3634                     OP(scan) = EXACTFA_NO_TRIE;
3635                     *unfolded_multi_char = TRUE;
3636                     break;
3637                 }
3638                 s++;
3639                 continue;
3640             }
3641         }
3642         else {
3643
3644             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3645              * folds that are all Latin1.  As explained in the comments
3646              * preceding this function, we look also for the sharp s in EXACTF
3647              * and EXACTFL nodes; it can be in the final position.  Otherwise
3648              * we can stop looking 1 byte earlier because have to find at least
3649              * two characters for a multi-fold */
3650             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3651                               ? s_end
3652                               : s_end -1;
3653
3654             while (s < upper) {
3655                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3656                 if (! len) {    /* Not a multi-char fold. */
3657                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3658                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3659                     {
3660                         *unfolded_multi_char = TRUE;
3661                     }
3662                     s++;
3663                     continue;
3664                 }
3665
3666                 if (len == 2
3667                     && isALPHA_FOLD_EQ(*s, 's')
3668                     && isALPHA_FOLD_EQ(*(s+1), 's'))
3669                 {
3670
3671                     /* EXACTF nodes need to know that the minimum length
3672                      * changed so that a sharp s in the string can match this
3673                      * ss in the pattern, but they remain EXACTF nodes, as they
3674                      * won't match this unless the target string is is UTF-8,
3675                      * which we don't know until runtime.  EXACTFL nodes can't
3676                      * transform into EXACTFU nodes */
3677                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3678                         OP(scan) = EXACTFU_SS;
3679                     }
3680                 }
3681
3682                 *min_subtract += len - 1;
3683                 s += len;
3684             }
3685         }
3686     }
3687
3688 #ifdef DEBUGGING
3689     /* Allow dumping but overwriting the collection of skipped
3690      * ops and/or strings with fake optimized ops */
3691     n = scan + NODE_SZ_STR(scan);
3692     while (n <= stop) {
3693         OP(n) = OPTIMIZED;
3694         FLAGS(n) = 0;
3695         NEXT_OFF(n) = 0;
3696         n++;
3697     }
3698 #endif
3699     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3700     return stopnow;
3701 }
3702
3703 /* REx optimizer.  Converts nodes into quicker variants "in place".
3704    Finds fixed substrings.  */
3705
3706 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3707    to the position after last scanned or to NULL. */
3708
3709 #define INIT_AND_WITHP \
3710     assert(!and_withp); \
3711     Newx(and_withp,1, regnode_ssc); \
3712     SAVEFREEPV(and_withp)
3713
3714
3715 static void
3716 S_unwind_scan_frames(pTHX_ const void *p)
3717 {
3718     scan_frame *f= (scan_frame *)p;
3719     do {
3720         scan_frame *n= f->next_frame;
3721         Safefree(f);
3722         f= n;
3723     } while (f);
3724 }
3725
3726
3727 STATIC SSize_t
3728 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3729                         SSize_t *minlenp, SSize_t *deltap,
3730                         regnode *last,
3731                         scan_data_t *data,
3732                         I32 stopparen,
3733                         U32 recursed_depth,
3734                         regnode_ssc *and_withp,
3735                         U32 flags, U32 depth)
3736                         /* scanp: Start here (read-write). */
3737                         /* deltap: Write maxlen-minlen here. */
3738                         /* last: Stop before this one. */
3739                         /* data: string data about the pattern */
3740                         /* stopparen: treat close N as END */
3741                         /* recursed: which subroutines have we recursed into */
3742                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3743 {
3744     /* There must be at least this number of characters to match */
3745     SSize_t min = 0;
3746     I32 pars = 0, code;
3747     regnode *scan = *scanp, *next;
3748     SSize_t delta = 0;
3749     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3750     int is_inf_internal = 0;            /* The studied chunk is infinite */
3751     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3752     scan_data_t data_fake;
3753     SV *re_trie_maxbuff = NULL;
3754     regnode *first_non_open = scan;
3755     SSize_t stopmin = SSize_t_MAX;
3756     scan_frame *frame = NULL;
3757     GET_RE_DEBUG_FLAGS_DECL;
3758
3759     PERL_ARGS_ASSERT_STUDY_CHUNK;
3760
3761
3762     if ( depth == 0 ) {
3763         while (first_non_open && OP(first_non_open) == OPEN)
3764             first_non_open=regnext(first_non_open);
3765     }
3766
3767
3768   fake_study_recurse:
3769     DEBUG_r(
3770         RExC_study_chunk_recursed_count++;
3771     );
3772     DEBUG_OPTIMISE_MORE_r(
3773     {
3774         PerlIO_printf(Perl_debug_log,
3775             "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3776             (int)(depth*2), "", (long)stopparen,
3777             (unsigned long)RExC_study_chunk_recursed_count,
3778             (unsigned long)depth, (unsigned long)recursed_depth,
3779             scan,
3780             last);
3781         if (recursed_depth) {
3782             U32 i;
3783             U32 j;
3784             for ( j = 0 ; j < recursed_depth ; j++ ) {
3785                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3786                     if (
3787                         PAREN_TEST(RExC_study_chunk_recursed +
3788                                    ( j * RExC_study_chunk_recursed_bytes), i )
3789                         && (
3790                             !j ||
3791                             !PAREN_TEST(RExC_study_chunk_recursed +
3792                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3793                         )
3794                     ) {
3795                         PerlIO_printf(Perl_debug_log," %d",(int)i);
3796                         break;
3797                     }
3798                 }
3799                 if ( j + 1 < recursed_depth ) {
3800                     PerlIO_printf(Perl_debug_log, ",");
3801                 }
3802             }
3803         }
3804         PerlIO_printf(Perl_debug_log,"\n");
3805     }
3806     );
3807     while ( scan && OP(scan) != END && scan < last ){
3808         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3809                                    node length to get a real minimum (because
3810                                    the folded version may be shorter) */
3811         bool unfolded_multi_char = FALSE;
3812         /* Peephole optimizer: */
3813         DEBUG_STUDYDATA("Peep:", data, depth);
3814         DEBUG_PEEP("Peep", scan, depth);
3815
3816
3817         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3818          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3819          * by a different invocation of reg() -- Yves
3820          */
3821         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3822
3823         /* Follow the next-chain of the current node and optimize
3824            away all the NOTHINGs from it.  */
3825         if (OP(scan) != CURLYX) {
3826             const int max = (reg_off_by_arg[OP(scan)]
3827                        ? I32_MAX
3828                        /* I32 may be smaller than U16 on CRAYs! */
3829                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3830             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3831             int noff;
3832             regnode *n = scan;
3833
3834             /* Skip NOTHING and LONGJMP. */
3835             while ((n = regnext(n))
3836                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3837                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3838                    && off + noff < max)
3839                 off += noff;
3840             if (reg_off_by_arg[OP(scan)])
3841                 ARG(scan) = off;
3842             else
3843                 NEXT_OFF(scan) = off;
3844         }
3845
3846         /* The principal pseudo-switch.  Cannot be a switch, since we
3847            look into several different things.  */
3848         if ( OP(scan) == DEFINEP ) {
3849             SSize_t minlen = 0;
3850             SSize_t deltanext = 0;
3851             SSize_t fake_last_close = 0;
3852             I32 f = SCF_IN_DEFINE;
3853
3854             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3855             scan = regnext(scan);
3856             assert( OP(scan) == IFTHEN );
3857             DEBUG_PEEP("expect IFTHEN", scan, depth);
3858
3859             data_fake.last_closep= &fake_last_close;
3860             minlen = *minlenp;
3861             next = regnext(scan);
3862             scan = NEXTOPER(NEXTOPER(scan));
3863             DEBUG_PEEP("scan", scan, depth);
3864             DEBUG_PEEP("next", next, depth);
3865
3866             /* we suppose the run is continuous, last=next...
3867              * NOTE we dont use the return here! */
3868             (void)study_chunk(pRExC_state, &scan, &minlen,
3869                               &deltanext, next, &data_fake, stopparen,
3870                               recursed_depth, NULL, f, depth+1);
3871
3872             scan = next;
3873         } else
3874         if (
3875             OP(scan) == BRANCH  ||
3876             OP(scan) == BRANCHJ ||
3877             OP(scan) == IFTHEN
3878         ) {
3879             next = regnext(scan);
3880             code = OP(scan);
3881
3882             /* The op(next)==code check below is to see if we
3883              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3884              * IFTHEN is special as it might not appear in pairs.
3885              * Not sure whether BRANCH-BRANCHJ is possible, regardless
3886              * we dont handle it cleanly. */
3887             if (OP(next) == code || code == IFTHEN) {
3888                 /* NOTE - There is similar code to this block below for
3889                  * handling TRIE nodes on a re-study.  If you change stuff here
3890                  * check there too. */
3891                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3892                 regnode_ssc accum;
3893                 regnode * const startbranch=scan;
3894
3895                 if (flags & SCF_DO_SUBSTR) {
3896                     /* Cannot merge strings after this. */
3897                     scan_commit(pRExC_state, data, minlenp, is_inf);
3898                 }
3899
3900                 if (flags & SCF_DO_STCLASS)
3901                     ssc_init_zero(pRExC_state, &accum);
3902
3903                 while (OP(scan) == code) {
3904                     SSize_t deltanext, minnext, fake;
3905                     I32 f = 0;
3906                     regnode_ssc this_class;
3907
3908                     DEBUG_PEEP("Branch", scan, depth);
3909
3910                     num++;
3911                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3912                     if (data) {
3913                         data_fake.whilem_c = data->whilem_c;
3914                         data_fake.last_closep = data->last_closep;
3915                     }
3916                     else
3917                         data_fake.last_closep = &fake;
3918
3919                     data_fake.pos_delta = delta;
3920                     next = regnext(scan);
3921
3922                     scan = NEXTOPER(scan); /* everything */
3923                     if (code != BRANCH)    /* everything but BRANCH */
3924                         scan = NEXTOPER(scan);
3925
3926                     if (flags & SCF_DO_STCLASS) {
3927                         ssc_init(pRExC_state, &this_class);
3928                         data_fake.start_class = &this_class;
3929                         f = SCF_DO_STCLASS_AND;
3930                     }
3931                     if (flags & SCF_WHILEM_VISITED_POS)
3932                         f |= SCF_WHILEM_VISITED_POS;
3933
3934                     /* we suppose the run is continuous, last=next...*/
3935                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3936                                       &deltanext, next, &data_fake, stopparen,
3937                                       recursed_depth, NULL, f,depth+1);
3938
3939                     if (min1 > minnext)
3940                         min1 = minnext;
3941                     if (deltanext == SSize_t_MAX) {
3942                         is_inf = is_inf_internal = 1;
3943                         max1 = SSize_t_MAX;
3944                     } else if (max1 < minnext + deltanext)
3945                         max1 = minnext + deltanext;
3946                     scan = next;
3947                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3948                         pars++;
3949                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3950                         if ( stopmin > minnext)
3951                             stopmin = min + min1;
3952                         flags &= ~SCF_DO_SUBSTR;
3953                         if (data)
3954                             data->flags |= SCF_SEEN_ACCEPT;
3955                     }
3956                     if (data) {
3957                         if (data_fake.flags & SF_HAS_EVAL)
3958                             data->flags |= SF_HAS_EVAL;
3959                         data->whilem_c = data_fake.whilem_c;
3960                     }
3961                     if (flags & SCF_DO_STCLASS)
3962                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3963                 }
3964                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3965                     min1 = 0;
3966                 if (flags & SCF_DO_SUBSTR) {
3967                     data->pos_min += min1;
3968                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3969                         data->pos_delta = SSize_t_MAX;
3970                     else
3971                         data->pos_delta += max1 - min1;
3972                     if (max1 != min1 || is_inf)
3973                         data->longest = &(data->longest_float);
3974                 }
3975                 min += min1;
3976                 if (delta == SSize_t_MAX
3977                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3978                     delta = SSize_t_MAX;
3979                 else
3980                     delta += max1 - min1;
3981                 if (flags & SCF_DO_STCLASS_OR) {
3982                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3983                     if (min1) {
3984                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3985                         flags &= ~SCF_DO_STCLASS;
3986                     }
3987                 }
3988                 else if (flags & SCF_DO_STCLASS_AND) {
3989                     if (min1) {
3990                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3991                         flags &= ~SCF_DO_STCLASS;
3992                     }
3993                     else {
3994                         /* Switch to OR mode: cache the old value of
3995                          * data->start_class */
3996                         INIT_AND_WITHP;
3997                         StructCopy(data->start_class, and_withp, regnode_ssc);
3998                         flags &= ~SCF_DO_STCLASS_AND;
3999                         StructCopy(&accum, data->start_class, regnode_ssc);
4000                         flags |= SCF_DO_STCLASS_OR;
4001                     }
4002                 }
4003
4004                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4005                         OP( startbranch ) == BRANCH )
4006                 {
4007                 /* demq.
4008
4009                    Assuming this was/is a branch we are dealing with: 'scan'
4010                    now points at the item that follows the branch sequence,
4011                    whatever it is. We now start at the beginning of the
4012                    sequence and look for subsequences of
4013
4014                    BRANCH->EXACT=>x1
4015                    BRANCH->EXACT=>x2
4016                    tail
4017
4018                    which would be constructed from a pattern like
4019                    /A|LIST|OF|WORDS/
4020
4021                    If we can find such a subsequence we need to turn the first
4022                    element into a trie and then add the subsequent branch exact
4023                    strings to the trie.
4024
4025                    We have two cases
4026
4027                      1. patterns where the whole set of branches can be
4028                         converted.
4029
4030                      2. patterns where only a subset can be converted.
4031
4032                    In case 1 we can replace the whole set with a single regop
4033                    for the trie. In case 2 we need to keep the start and end
4034                    branches so
4035
4036                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4037                      becomes BRANCH TRIE; BRANCH X;
4038
4039                   There is an additional case, that being where there is a
4040                   common prefix, which gets split out into an EXACT like node
4041                   preceding the TRIE node.
4042
4043                   If x(1..n)==tail then we can do a simple trie, if not we make
4044                   a "jump" trie, such that when we match the appropriate word
4045                   we "jump" to the appropriate tail node. Essentially we turn
4046                   a nested if into a case structure of sorts.
4047
4048                 */
4049
4050                     int made=0;
4051                     if (!re_trie_maxbuff) {
4052                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4053                         if (!SvIOK(re_trie_maxbuff))
4054                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4055                     }
4056                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4057                         regnode *cur;
4058                         regnode *first = (regnode *)NULL;
4059                         regnode *last = (regnode *)NULL;
4060                         regnode *tail = scan;
4061                         U8 trietype = 0;
4062                         U32 count=0;
4063
4064                         /* var tail is used because there may be a TAIL
4065                            regop in the way. Ie, the exacts will point to the
4066                            thing following the TAIL, but the last branch will
4067                            point at the TAIL. So we advance tail. If we
4068                            have nested (?:) we may have to move through several
4069                            tails.
4070                          */
4071
4072                         while ( OP( tail ) == TAIL ) {
4073                             /* this is the TAIL generated by (?:) */
4074                             tail = regnext( tail );
4075                         }
4076
4077
4078                         DEBUG_TRIE_COMPILE_r({
4079                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4080                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4081                               (int)depth * 2 + 2, "",
4082                               "Looking for TRIE'able sequences. Tail node is: ",
4083                               SvPV_nolen_const( RExC_mysv )
4084                             );
4085                         });
4086
4087                         /*
4088
4089                             Step through the branches
4090                                 cur represents each branch,
4091                                 noper is the first thing to be matched as part
4092                                       of that branch
4093                                 noper_next is the regnext() of that node.
4094
4095                             We normally handle a case like this
4096                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4097                             support building with NOJUMPTRIE, which restricts
4098                             the trie logic to structures like /FOO|BAR/.
4099
4100                             If noper is a trieable nodetype then the branch is
4101                             a possible optimization target. If we are building
4102                             under NOJUMPTRIE then we require that noper_next is
4103                             the same as scan (our current position in the regex
4104                             program).
4105
4106                             Once we have two or more consecutive such branches
4107                             we can create a trie of the EXACT's contents and
4108                             stitch it in place into the program.
4109
4110                             If the sequence represents all of the branches in
4111                             the alternation we replace the entire thing with a
4112                             single TRIE node.
4113
4114                             Otherwise when it is a subsequence we need to
4115                             stitch it in place and replace only the relevant
4116                             branches. This means the first branch has to remain
4117                             as it is used by the alternation logic, and its
4118                             next pointer, and needs to be repointed at the item
4119                             on the branch chain following the last branch we
4120                             have optimized away.
4121
4122                             This could be either a BRANCH, in which case the
4123                             subsequence is internal, or it could be the item
4124                             following the branch sequence in which case the
4125                             subsequence is at the end (which does not
4126                             necessarily mean the first node is the start of the
4127                             alternation).
4128
4129                             TRIE_TYPE(X) is a define which maps the optype to a
4130                             trietype.
4131
4132                                 optype          |  trietype
4133                                 ----------------+-----------
4134                                 NOTHING         | NOTHING
4135                                 EXACT           | EXACT
4136                                 EXACTFU         | EXACTFU
4137                                 EXACTFU_SS      | EXACTFU
4138                                 EXACTFA         | EXACTFA
4139
4140
4141                         */
4142 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
4143                        ( EXACT == (X) )   ? EXACT :        \
4144                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
4145                        ( EXACTFA == (X) ) ? EXACTFA :        \
4146                        0 )
4147
4148                         /* dont use tail as the end marker for this traverse */
4149                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4150                             regnode * const noper = NEXTOPER( cur );
4151                             U8 noper_type = OP( noper );
4152                             U8 noper_trietype = TRIE_TYPE( noper_type );
4153 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4154                             regnode * const noper_next = regnext( noper );
4155                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4156                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4157 #endif
4158
4159                             DEBUG_TRIE_COMPILE_r({
4160                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4161                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4162                                    (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4163
4164                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4165                                 PerlIO_printf( Perl_debug_log, " -> %s",
4166                                     SvPV_nolen_const(RExC_mysv));
4167
4168                                 if ( noper_next ) {
4169                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4170                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4171                                     SvPV_nolen_const(RExC_mysv));
4172                                 }
4173                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4174                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4175                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4176                                 );
4177                             });
4178
4179                             /* Is noper a trieable nodetype that can be merged
4180                              * with the current trie (if there is one)? */
4181                             if ( noper_trietype
4182                                   &&
4183                                   (
4184                                         ( noper_trietype == NOTHING)
4185                                         || ( trietype == NOTHING )
4186                                         || ( trietype == noper_trietype )
4187                                   )
4188 #ifdef NOJUMPTRIE
4189                                   && noper_next == tail
4190 #endif
4191                                   && count < U16_MAX)
4192                             {
4193                                 /* Handle mergable triable node Either we are
4194                                  * the first node in a new trieable sequence,
4195                                  * in which case we do some bookkeeping,
4196                                  * otherwise we update the end pointer. */
4197                                 if ( !first ) {
4198                                     first = cur;
4199                                     if ( noper_trietype == NOTHING ) {
4200 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4201                                         regnode * const noper_next = regnext( noper );
4202                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4203                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4204 #endif
4205
4206                                         if ( noper_next_trietype ) {
4207                                             trietype = noper_next_trietype;
4208                                         } else if (noper_next_type)  {
4209                                             /* a NOTHING regop is 1 regop wide.
4210                                              * We need at least two for a trie
4211                                              * so we can't merge this in */
4212                                             first = NULL;
4213                                         }
4214                                     } else {
4215                                         trietype = noper_trietype;
4216                                     }
4217                                 } else {
4218                                     if ( trietype == NOTHING )
4219                                         trietype = noper_trietype;
4220                                     last = cur;
4221                                 }
4222                                 if (first)
4223                                     count++;
4224                             } /* end handle mergable triable node */
4225                             else {
4226                                 /* handle unmergable node -
4227                                  * noper may either be a triable node which can
4228                                  * not be tried together with the current trie,
4229                                  * or a non triable node */
4230                                 if ( last ) {
4231                                     /* If last is set and trietype is not
4232                                      * NOTHING then we have found at least two
4233                                      * triable branch sequences in a row of a
4234                                      * similar trietype so we can turn them
4235                                      * into a trie. If/when we allow NOTHING to
4236                                      * start a trie sequence this condition
4237                                      * will be required, and it isn't expensive
4238                                      * so we leave it in for now. */
4239                                     if ( trietype && trietype != NOTHING )
4240                                         make_trie( pRExC_state,
4241                                                 startbranch, first, cur, tail,
4242                                                 count, trietype, depth+1 );
4243                                     last = NULL; /* note: we clear/update
4244                                                     first, trietype etc below,
4245                                                     so we dont do it here */
4246                                 }
4247                                 if ( noper_trietype
4248 #ifdef NOJUMPTRIE
4249                                      && noper_next == tail
4250 #endif
4251                                 ){
4252                                     /* noper is triable, so we can start a new
4253                                      * trie sequence */
4254                                     count = 1;
4255                                     first = cur;
4256                                     trietype = noper_trietype;
4257                                 } else if (first) {
4258                                     /* if we already saw a first but the
4259                                      * current node is not triable then we have
4260                                      * to reset the first information. */
4261                                     count = 0;
4262                                     first = NULL;
4263                                     trietype = 0;
4264                                 }
4265                             } /* end handle unmergable node */
4266                         } /* loop over branches */
4267                         DEBUG_TRIE_COMPILE_r({
4268                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4269                             PerlIO_printf( Perl_debug_log,
4270                               "%*s- %s (%d) <SCAN FINISHED>\n",
4271                               (int)depth * 2 + 2,
4272                               "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4273
4274                         });
4275                         if ( last && trietype ) {
4276                             if ( trietype != NOTHING ) {
4277                                 /* the last branch of the sequence was part of
4278                                  * a trie, so we have to construct it here
4279                                  * outside of the loop */
4280                                 made= make_trie( pRExC_state, startbranch,
4281                                                  first, scan, tail, count,
4282                                                  trietype, depth+1 );
4283 #ifdef TRIE_STUDY_OPT
4284                                 if ( ((made == MADE_EXACT_TRIE &&
4285                                      startbranch == first)
4286                                      || ( first_non_open == first )) &&
4287                                      depth==0 ) {
4288                                     flags |= SCF_TRIE_RESTUDY;
4289                                     if ( startbranch == first
4290                                          && scan == tail )
4291                                     {
4292                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4293                                     }
4294                                 }
4295 #endif
4296                             } else {
4297                                 /* at this point we know whatever we have is a
4298                                  * NOTHING sequence/branch AND if 'startbranch'
4299                                  * is 'first' then we can turn the whole thing
4300                                  * into a NOTHING
4301                                  */
4302                                 if ( startbranch == first ) {
4303                                     regnode *opt;
4304                                     /* the entire thing is a NOTHING sequence,
4305                                      * something like this: (?:|) So we can
4306                                      * turn it into a plain NOTHING op. */
4307                                     DEBUG_TRIE_COMPILE_r({
4308                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4309                                         PerlIO_printf( Perl_debug_log,
4310                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4311                                           "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4312
4313                                     });
4314                                     OP(startbranch)= NOTHING;
4315                                     NEXT_OFF(startbranch)= tail - startbranch;
4316                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4317                                         OP(opt)= OPTIMIZED;
4318                                 }
4319                             }
4320                         } /* end if ( last) */
4321                     } /* TRIE_MAXBUF is non zero */
4322
4323                 } /* do trie */
4324
4325             }
4326             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4327                 scan = NEXTOPER(NEXTOPER(scan));
4328             } else                      /* single branch is optimized. */
4329                 scan = NEXTOPER(scan);
4330             continue;
4331         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4332             I32 paren = 0;
4333             regnode *start = NULL;
4334             regnode *end = NULL;
4335             U32 my_recursed_depth= recursed_depth;
4336
4337
4338             if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4339                 /* Do setup, note this code has side effects beyond
4340                  * the rest of this block. Specifically setting
4341                  * RExC_recurse[] must happen at least once during
4342                  * study_chunk(). */
4343                 if (OP(scan) == GOSUB) {
4344                     paren = ARG(scan);
4345                     RExC_recurse[ARG2L(scan)] = scan;
4346                     start = RExC_open_parens[paren-1];
4347                     end   = RExC_close_parens[paren-1];
4348                 } else {
4349                     start = RExC_rxi->program + 1;
4350                     end   = RExC_opend;
4351                 }
4352                 /* NOTE we MUST always execute the above code, even
4353                  * if we do nothing with a GOSUB/GOSTART */
4354                 if (
4355                     ( flags & SCF_IN_DEFINE )
4356                     ||
4357                     (
4358                         (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4359                         &&
4360                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4361                     )
4362                 ) {
4363                     /* no need to do anything here if we are in a define. */
4364                     /* or we are after some kind of infinite construct
4365                      * so we can skip recursing into this item.
4366                      * Since it is infinite we will not change the maxlen
4367                      * or delta, and if we miss something that might raise
4368                      * the minlen it will merely pessimise a little.
4369                      *
4370                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4371                      * might result in a minlen of 1 and not of 4,
4372                      * but this doesn't make us mismatch, just try a bit
4373                      * harder than we should.
4374                      * */
4375                     scan= regnext(scan);
4376                     continue;
4377                 }
4378
4379                 if (
4380                     !recursed_depth
4381                     ||
4382                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4383                 ) {
4384                     /* it is quite possible that there are more efficient ways
4385                      * to do this. We maintain a bitmap per level of recursion
4386                      * of which patterns we have entered so we can detect if a
4387                      * pattern creates a possible infinite loop. When we
4388                      * recurse down a level we copy the previous levels bitmap
4389                      * down. When we are at recursion level 0 we zero the top
4390                      * level bitmap. It would be nice to implement a different
4391                      * more efficient way of doing this. In particular the top
4392                      * level bitmap may be unnecessary.
4393                      */
4394                     if (!recursed_depth) {
4395                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4396                     } else {
4397                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4398                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4399                              RExC_study_chunk_recursed_bytes, U8);
4400                     }
4401                     /* we havent recursed into this paren yet, so recurse into it */
4402                     DEBUG_STUDYDATA("set:", data,depth);
4403                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4404                     my_recursed_depth= recursed_depth + 1;
4405                 } else {
4406                     DEBUG_STUDYDATA("inf:", data,depth);
4407                     /* some form of infinite recursion, assume infinite length
4408                      * */
4409                     if (flags & SCF_DO_SUBSTR) {
4410                         scan_commit(pRExC_state, data, minlenp, is_inf);
4411                         data->longest = &(data->longest_float);
4412                     }
4413                     is_inf = is_inf_internal = 1;
4414                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4415                         ssc_anything(data->start_class);
4416                     flags &= ~SCF_DO_STCLASS;
4417
4418                     start= NULL; /* reset start so we dont recurse later on. */
4419                 }
4420             } else {
4421                 paren = stopparen;
4422                 start = scan + 2;
4423                 end = regnext(scan);
4424             }
4425             if (start) {
4426                 scan_frame *newframe;
4427                 assert(end);
4428                 if (!RExC_frame_last) {
4429                     Newxz(newframe, 1, scan_frame);
4430                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4431                     RExC_frame_head= newframe;
4432                     RExC_frame_count++;
4433                 } else if (!RExC_frame_last->next_frame) {
4434                     Newxz(newframe,1,scan_frame);
4435                     RExC_frame_last->next_frame= newframe;
4436                     newframe->prev_frame= RExC_frame_last;
4437                     RExC_frame_count++;
4438                 } else {
4439                     newframe= RExC_frame_last->next_frame;
4440                 }
4441                 RExC_frame_last= newframe;
4442
4443                 newframe->next_regnode = regnext(scan);
4444                 newframe->last_regnode = last;
4445                 newframe->stopparen = stopparen;
4446                 newframe->prev_recursed_depth = recursed_depth;
4447                 newframe->this_prev_frame= frame;
4448
4449                 DEBUG_STUDYDATA("frame-new:",data,depth);
4450                 DEBUG_PEEP("fnew", scan, depth);
4451
4452                 frame = newframe;
4453                 scan =  start;
4454                 stopparen = paren;
4455                 last = end;
4456                 depth = depth + 1;
4457                 recursed_depth= my_recursed_depth;
4458
4459                 continue;
4460             }
4461         }
4462         else if (OP(scan) == EXACT) {
4463             SSize_t l = STR_LEN(scan);
4464             UV uc;
4465             if (UTF) {
4466                 const U8 * const s = (U8*)STRING(scan);
4467                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4468                 l = utf8_length(s, s + l);
4469             } else {
4470                 uc = *((U8*)STRING(scan));
4471             }
4472             min += l;
4473             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4474                 /* The code below prefers earlier match for fixed
4475                    offset, later match for variable offset.  */
4476                 if (data->last_end == -1) { /* Update the start info. */
4477                     data->last_start_min = data->pos_min;
4478                     data->last_start_max = is_inf
4479                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4480                 }
4481                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4482                 if (UTF)
4483                     SvUTF8_on(data->last_found);
4484                 {
4485                     SV * const sv = data->last_found;
4486                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4487                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4488                     if (mg && mg->mg_len >= 0)
4489                         mg->mg_len += utf8_length((U8*)STRING(scan),
4490                                               (U8*)STRING(scan)+STR_LEN(scan));
4491                 }
4492                 data->last_end = data->pos_min + l;
4493                 data->pos_min += l; /* As in the first entry. */
4494                 data->flags &= ~SF_BEFORE_EOL;
4495             }
4496
4497             /* ANDing the code point leaves at most it, and not in locale, and
4498              * can't match null string */
4499             if (flags & SCF_DO_STCLASS_AND) {
4500                 ssc_cp_and(data->start_class, uc);
4501                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4502                 ssc_clear_locale(data->start_class);
4503             }
4504             else if (flags & SCF_DO_STCLASS_OR) {
4505                 ssc_add_cp(data->start_class, uc);
4506                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4507
4508                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4509                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4510             }
4511             flags &= ~SCF_DO_STCLASS;
4512         }
4513         else if (PL_regkind[OP(scan)] == EXACT) {
4514             /* But OP != EXACT!, so is EXACTFish */
4515             SSize_t l = STR_LEN(scan);
4516             UV uc = *((U8*)STRING(scan));
4517             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4518                                                      separate code points */
4519             const U8 * s = (U8*)STRING(scan);
4520
4521             /* Search for fixed substrings supports EXACT only. */
4522             if (flags & SCF_DO_SUBSTR) {
4523                 assert(data);
4524                 scan_commit(pRExC_state, data, minlenp, is_inf);
4525             }
4526             if (UTF) {
4527                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4528                 l = utf8_length(s, s + l);
4529             }
4530             if (unfolded_multi_char) {
4531                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4532             }
4533             min += l - min_subtract;
4534             assert (min >= 0);
4535             delta += min_subtract;
4536             if (flags & SCF_DO_SUBSTR) {
4537                 data->pos_min += l - min_subtract;
4538                 if (data->pos_min < 0) {
4539                     data->pos_min = 0;
4540                 }
4541                 data->pos_delta += min_subtract;
4542                 if (min_subtract) {
4543                     data->longest = &(data->longest_float);
4544                 }
4545             }
4546
4547             if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4548                 ssc_clear_locale(data->start_class);
4549             }
4550
4551             if (! UTF) {
4552
4553                 /* We punt and assume can match anything if the node begins
4554                  * with a multi-character fold.  Things are complicated.  For
4555                  * example, /ffi/i could match any of:
4556                  *  "\N{LATIN SMALL LIGATURE FFI}"
4557                  *  "\N{LATIN SMALL LIGATURE FF}I"
4558                  *  "F\N{LATIN SMALL LIGATURE FI}"
4559                  *  plus several other things; and making sure we have all the
4560                  *  possibilities is hard. */
4561                 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4562                     EXACTF_invlist =
4563                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4564                 }
4565                 else {
4566
4567                     /* Any Latin1 range character can potentially match any
4568                      * other depending on the locale */
4569                     if (OP(scan) == EXACTFL) {
4570                         _invlist_union(EXACTF_invlist, PL_Latin1,
4571                                                               &EXACTF_invlist);
4572                     }
4573                     else {
4574                         /* But otherwise, it matches at least itself.  We can
4575                          * quickly tell if it has a distinct fold, and if so,
4576                          * it matches that as well */
4577                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4578                         if (IS_IN_SOME_FOLD_L1(uc)) {
4579                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4580                                                            PL_fold_latin1[uc]);
4581                         }
4582                     }
4583
4584                     /* Some characters match above-Latin1 ones under /i.  This
4585                      * is true of EXACTFL ones when the locale is UTF-8 */
4586                     if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4587                         && (! isASCII(uc) || (OP(scan) != EXACTFA
4588                                             && OP(scan) != EXACTFA_NO_TRIE)))
4589                     {
4590                         add_above_Latin1_folds(pRExC_state,
4591                                                (U8) uc,
4592                                                &EXACTF_invlist);
4593                     }
4594                 }
4595             }
4596             else {  /* Pattern is UTF-8 */
4597                 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4598                 STRLEN foldlen = UTF8SKIP(s);
4599                 const U8* e = s + STR_LEN(scan);
4600                 SV** listp;
4601
4602                 /* The only code points that aren't folded in a UTF EXACTFish
4603                  * node are are the problematic ones in EXACTFL nodes */
4604                 if (OP(scan) == EXACTFL
4605                     && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4606                 {
4607                     /* We need to check for the possibility that this EXACTFL
4608                      * node begins with a multi-char fold.  Therefore we fold
4609                      * the first few characters of it so that we can make that
4610                      * check */
4611                     U8 *d = folded;
4612                     int i;
4613
4614                     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4615                         if (isASCII(*s)) {
4616                             *(d++) = (U8) toFOLD(*s);
4617                             s++;
4618                         }
4619                         else {
4620                             STRLEN len;
4621                             to_utf8_fold(s, d, &len);
4622                             d += len;
4623                             s += UTF8SKIP(s);
4624                         }
4625                     }
4626
4627                     /* And set up so the code below that looks in this folded
4628                      * buffer instead of the node's string */
4629                     e = d;
4630                     foldlen = UTF8SKIP(folded);
4631                     s = folded;
4632                 }
4633
4634                 /* When we reach here 's' points to the fold of the first
4635                  * character(s) of the node; and 'e' points to far enough along
4636                  * the folded string to be just past any possible multi-char
4637                  * fold. 'foldlen' is the length in bytes of the first
4638                  * character in 's'
4639                  *
4640                  * Unlike the non-UTF-8 case, the macro for determining if a
4641                  * string is a multi-char fold requires all the characters to
4642                  * already be folded.  This is because of all the complications
4643                  * if not.  Note that they are folded anyway, except in EXACTFL
4644                  * nodes.  Like the non-UTF case above, we punt if the node
4645                  * begins with a multi-char fold  */
4646
4647                 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4648                     EXACTF_invlist =
4649                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4650                 }
4651                 else {  /* Single char fold */
4652
4653                     /* It matches all the things that fold to it, which are
4654                      * found in PL_utf8_foldclosures (including itself) */
4655                     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4656                     if (! PL_utf8_foldclosures) {
4657                         _load_PL_utf8_foldclosures();
4658                     }
4659                     if ((listp = hv_fetch(PL_utf8_foldclosures,
4660                                         (char *) s, foldlen, FALSE)))
4661                     {
4662                         AV* list = (AV*) *listp;
4663                         IV k;
4664                         for (k = 0; k <= av_tindex(list); k++) {
4665                             SV** c_p = av_fetch(list, k, FALSE);
4666                             UV c;
4667                             assert(c_p);
4668
4669                             c = SvUV(*c_p);
4670
4671                             /* /aa doesn't allow folds between ASCII and non- */
4672                             if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4673                                 && isASCII(c) != isASCII(uc))
4674                             {
4675                                 continue;
4676                             }
4677
4678                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4679                         }
4680                     }
4681                 }
4682             }
4683             if (flags & SCF_DO_STCLASS_AND) {
4684                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4685                 ANYOF_POSIXL_ZERO(data->start_class);
4686                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4687             }
4688             else if (flags & SCF_DO_STCLASS_OR) {
4689                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4690                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4691
4692                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4693                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4694             }
4695             flags &= ~SCF_DO_STCLASS;
4696             SvREFCNT_dec(EXACTF_invlist);
4697         }
4698         else if (REGNODE_VARIES(OP(scan))) {
4699             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4700             I32 fl = 0, f = flags;
4701             regnode * const oscan = scan;
4702             regnode_ssc this_class;
4703             regnode_ssc *oclass = NULL;
4704             I32 next_is_eval = 0;
4705
4706             switch (PL_regkind[OP(scan)]) {
4707             case WHILEM:                /* End of (?:...)* . */
4708                 scan = NEXTOPER(scan);
4709                 goto finish;
4710             case PLUS:
4711                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4712                     next = NEXTOPER(scan);
4713                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4714                         mincount = 1;
4715                         maxcount = REG_INFTY;
4716                         next = regnext(scan);
4717                         scan = NEXTOPER(scan);
4718                         goto do_curly;
4719                     }
4720                 }
4721                 if (flags & SCF_DO_SUBSTR)
4722                     data->pos_min++;
4723                 min++;
4724                 /* FALLTHROUGH */
4725             case STAR:
4726                 if (flags & SCF_DO_STCLASS) {
4727                     mincount = 0;
4728                     maxcount = REG_INFTY;
4729                     next = regnext(scan);
4730                     scan = NEXTOPER(scan);
4731                     goto do_curly;
4732                 }
4733                 if (flags & SCF_DO_SUBSTR) {
4734                     scan_commit(pRExC_state, data, minlenp, is_inf);
4735                     /* Cannot extend fixed substrings */
4736                     data->longest = &(data->longest_float);
4737                 }
4738                 is_inf = is_inf_internal = 1;
4739                 scan = regnext(scan);
4740                 goto optimize_curly_tail;
4741             case CURLY:
4742                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4743                     && (scan->flags == stopparen))
4744                 {
4745                     mincount = 1;
4746                     maxcount = 1;
4747                 } else {
4748                     mincount = ARG1(scan);
4749                     maxcount = ARG2(scan);
4750                 }
4751                 next = regnext(scan);
4752                 if (OP(scan) == CURLYX) {
4753                     I32 lp = (data ? *(data->last_closep) : 0);
4754                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4755                 }
4756                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4757                 next_is_eval = (OP(scan) == EVAL);
4758               do_curly:
4759                 if (flags & SCF_DO_SUBSTR) {
4760                     if (mincount == 0)
4761                         scan_commit(pRExC_state, data, minlenp, is_inf);
4762                     /* Cannot extend fixed substrings */
4763                     pos_before = data->pos_min;
4764                 }
4765                 if (data) {
4766                     fl = data->flags;
4767                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4768                     if (is_inf)
4769                         data->flags |= SF_IS_INF;
4770                 }
4771                 if (flags & SCF_DO_STCLASS) {
4772                     ssc_init(pRExC_state, &this_class);
4773                     oclass = data->start_class;
4774                     data->start_class = &this_class;
4775                     f |= SCF_DO_STCLASS_AND;
4776                     f &= ~SCF_DO_STCLASS_OR;
4777                 }
4778                 /* Exclude from super-linear cache processing any {n,m}
4779                    regops for which the combination of input pos and regex
4780                    pos is not enough information to determine if a match
4781                    will be possible.
4782
4783                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4784                    regex pos at the \s*, the prospects for a match depend not
4785                    only on the input position but also on how many (bar\s*)
4786                    repeats into the {4,8} we are. */
4787                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4788                     f &= ~SCF_WHILEM_VISITED_POS;
4789
4790                 /* This will finish on WHILEM, setting scan, or on NULL: */
4791                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4792                                   last, data, stopparen, recursed_depth, NULL,
4793                                   (mincount == 0
4794                                    ? (f & ~SCF_DO_SUBSTR)
4795                                    : f)
4796                                   ,depth+1);
4797
4798                 if (flags & SCF_DO_STCLASS)
4799                     data->start_class = oclass;
4800                 if (mincount == 0 || minnext == 0) {
4801                     if (flags & SCF_DO_STCLASS_OR) {
4802                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4803                     }
4804                     else if (flags & SCF_DO_STCLASS_AND) {
4805                         /* Switch to OR mode: cache the old value of
4806                          * data->start_class */
4807                         INIT_AND_WITHP;
4808                         StructCopy(data->start_class, and_withp, regnode_ssc);
4809                         flags &= ~SCF_DO_STCLASS_AND;
4810                         StructCopy(&this_class, data->start_class, regnode_ssc);
4811                         flags |= SCF_DO_STCLASS_OR;
4812                         ANYOF_FLAGS(data->start_class)
4813                                                 |= SSC_MATCHES_EMPTY_STRING;
4814                     }
4815                 } else {                /* Non-zero len */
4816                     if (flags & SCF_DO_STCLASS_OR) {
4817                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4818                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4819                     }
4820                     else if (flags & SCF_DO_STCLASS_AND)
4821                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4822                     flags &= ~SCF_DO_STCLASS;
4823                 }
4824                 if (!scan)              /* It was not CURLYX, but CURLY. */
4825                     scan = next;
4826                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4827                     /* ? quantifier ok, except for (?{ ... }) */
4828                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4829                     && (minnext == 0) && (deltanext == 0)
4830                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4831                     && maxcount <= REG_INFTY/3) /* Complement check for big
4832                                                    count */
4833                 {
4834                     /* Fatal warnings may leak the regexp without this: */
4835                     SAVEFREESV(RExC_rx_sv);
4836                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4837                         "Quantifier unexpected on zero-length expression "
4838                         "in regex m/%"UTF8f"/",
4839                          UTF8fARG(UTF, RExC_end - RExC_precomp,
4840                                   RExC_precomp));
4841                     (void)ReREFCNT_inc(RExC_rx_sv);
4842                 }
4843
4844                 min += minnext * mincount;
4845                 is_inf_internal |= deltanext == SSize_t_MAX
4846                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4847                 is_inf |= is_inf_internal;
4848                 if (is_inf) {
4849                     delta = SSize_t_MAX;
4850                 } else {
4851                     delta += (minnext + deltanext) * maxcount
4852                              - minnext * mincount;
4853                 }
4854                 /* Try powerful optimization CURLYX => CURLYN. */
4855                 if (  OP(oscan) == CURLYX && data
4856                       && data->flags & SF_IN_PAR
4857                       && !(data->flags & SF_HAS_EVAL)
4858                       && !deltanext && minnext == 1 ) {
4859                     /* Try to optimize to CURLYN.  */
4860                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4861                     regnode * const nxt1 = nxt;
4862 #ifdef DEBUGGING
4863                     regnode *nxt2;
4864 #endif
4865
4866                     /* Skip open. */
4867                     nxt = regnext(nxt);
4868                     if (!REGNODE_SIMPLE(OP(nxt))
4869                         && !(PL_regkind[OP(nxt)] == EXACT
4870                              && STR_LEN(nxt) == 1))
4871                         goto nogo;
4872 #ifdef DEBUGGING
4873                     nxt2 = nxt;
4874 #endif
4875                     nxt = regnext(nxt);
4876                     if (OP(nxt) != CLOSE)
4877                         goto nogo;
4878                     if (RExC_open_parens) {
4879                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4880                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4881                     }
4882                     /* Now we know that nxt2 is the only contents: */
4883                     oscan->flags = (U8)ARG(nxt);
4884                     OP(oscan) = CURLYN;
4885                     OP(nxt1) = NOTHING; /* was OPEN. */
4886
4887 #ifdef DEBUGGING
4888                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4889                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4890                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4891                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4892                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4893                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4894 #endif
4895                 }
4896               nogo:
4897
4898                 /* Try optimization CURLYX => CURLYM. */
4899                 if (  OP(oscan) == CURLYX && data
4900                       && !(data->flags & SF_HAS_PAR)
4901                       && !(data->flags & SF_HAS_EVAL)
4902                       && !deltanext     /* atom is fixed width */
4903                       && minnext != 0   /* CURLYM can't handle zero width */
4904
4905                          /* Nor characters whose fold at run-time may be
4906                           * multi-character */
4907                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4908                 ) {
4909                     /* XXXX How to optimize if data == 0? */
4910                     /* Optimize to a simpler form.  */
4911                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4912                     regnode *nxt2;
4913
4914                     OP(oscan) = CURLYM;
4915                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4916                             && (OP(nxt2) != WHILEM))
4917                         nxt = nxt2;
4918                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4919                     /* Need to optimize away parenths. */
4920                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4921                         /* Set the parenth number.  */
4922                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4923
4924                         oscan->flags = (U8)ARG(nxt);
4925                         if (RExC_open_parens) {
4926                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4927                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4928                         }
4929                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4930                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4931
4932 #ifdef DEBUGGING
4933                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4934                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4935                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4936                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4937 #endif
4938 #if 0
4939                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4940                             regnode *nnxt = regnext(nxt1);
4941                             if (nnxt == nxt) {
4942                                 if (reg_off_by_arg[OP(nxt1)])
4943                                     ARG_SET(nxt1, nxt2 - nxt1);
4944                                 else if (nxt2 - nxt1 < U16_MAX)
4945                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4946                                 else
4947                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4948                             }
4949                             nxt1 = nnxt;
4950                         }
4951 #endif
4952                         /* Optimize again: */
4953                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4954                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4955                     }
4956                     else
4957                         oscan->flags = 0;
4958                 }
4959                 else if ((OP(oscan) == CURLYX)
4960                          && (flags & SCF_WHILEM_VISITED_POS)
4961                          /* See the comment on a similar expression above.
4962                             However, this time it's not a subexpression
4963                             we care about, but the expression itself. */
4964                          && (maxcount == REG_INFTY)
4965                          && data && ++data->whilem_c < 16) {
4966                     /* This stays as CURLYX, we can put the count/of pair. */
4967                     /* Find WHILEM (as in regexec.c) */
4968                     regnode *nxt = oscan + NEXT_OFF(oscan);
4969
4970                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4971                         nxt += ARG(nxt);
4972                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4973                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4974                 }
4975                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4976                     pars++;
4977                 if (flags & SCF_DO_SUBSTR) {
4978                     SV *last_str = NULL;
4979                     STRLEN last_chrs = 0;
4980                     int counted = mincount != 0;
4981
4982                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4983                                                                   string. */
4984                         SSize_t b = pos_before >= data->last_start_min
4985                             ? pos_before : data->last_start_min;
4986                         STRLEN l;
4987                         const char * const s = SvPV_const(data->last_found, l);
4988                         SSize_t old = b - data->last_start_min;
4989
4990                         if (UTF)
4991                             old = utf8_hop((U8*)s, old) - (U8*)s;
4992                         l -= old;
4993                         /* Get the added string: */
4994                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4995                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4996                                             (U8*)(s + old + l)) : l;
4997                         if (deltanext == 0 && pos_before == b) {
4998                             /* What was added is a constant string */
4999                             if (mincount > 1) {
5000
5001                                 SvGROW(last_str, (mincount * l) + 1);
5002                                 repeatcpy(SvPVX(last_str) + l,
5003                                           SvPVX_const(last_str), l,
5004                                           mincount - 1);
5005                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5006                                 /* Add additional parts. */
5007                                 SvCUR_set(data->last_found,
5008                                           SvCUR(data->last_found) - l);
5009                                 sv_catsv(data->last_found, last_str);
5010                                 {
5011                                     SV * sv = data->last_found;
5012                                     MAGIC *mg =
5013                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5014                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5015                                     if (mg && mg->mg_len >= 0)
5016                                         mg->mg_len += last_chrs * (mincount-1);
5017                                 }
5018                                 last_chrs *= mincount;
5019                                 data->last_end += l * (mincount - 1);
5020                             }
5021                         } else {
5022                             /* start offset must point into the last copy */
5023                             data->last_start_min += minnext * (mincount - 1);
5024                             data->last_start_max += is_inf ? SSize_t_MAX
5025                                 : (maxcount - 1) * (minnext + data->pos_delta);
5026                         }
5027                     }
5028                     /* It is counted once already... */
5029                     data->pos_min += minnext * (mincount - counted);
5030 #if 0
5031 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5032                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5033                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
5034     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5035     (UV)mincount);
5036 if (deltanext != SSize_t_MAX)
5037 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5038     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5039           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5040 #endif
5041                     if (deltanext == SSize_t_MAX
5042                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5043                         data->pos_delta = SSize_t_MAX;
5044                     else
5045                         data->pos_delta += - counted * deltanext +
5046                         (minnext + deltanext) * maxcount - minnext * mincount;
5047                     if (mincount != maxcount) {
5048                          /* Cannot extend fixed substrings found inside
5049                             the group.  */
5050                         scan_commit(pRExC_state, data, minlenp, is_inf);
5051                         if (mincount && last_str) {
5052                             SV * const sv = data->last_found;
5053                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5054                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5055
5056                             if (mg)
5057                                 mg->mg_len = -1;
5058                             sv_setsv(sv, last_str);
5059                             data->last_end = data->pos_min;
5060                             data->last_start_min = data->pos_min - last_chrs;
5061                             data->last_start_max = is_inf
5062                                 ? SSize_t_MAX
5063                                 : data->pos_min + data->pos_delta - last_chrs;
5064                         }
5065                         data->longest = &(data->longest_float);
5066                     }
5067                     SvREFCNT_dec(last_str);
5068                 }
5069                 if (data && (fl & SF_HAS_EVAL))
5070                     data->flags |= SF_HAS_EVAL;
5071               optimize_curly_tail:
5072                 if (OP(oscan) != CURLYX) {
5073                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5074                            && NEXT_OFF(next))
5075                         NEXT_OFF(oscan) += NEXT_OFF(next);
5076                 }
5077                 continue;
5078
5079             default:
5080 #ifdef DEBUGGING
5081                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5082                                                                     OP(scan));
5083 #endif
5084             case REF:
5085             case CLUMP:
5086                 if (flags & SCF_DO_SUBSTR) {
5087                     /* Cannot expect anything... */
5088                     scan_commit(pRExC_state, data, minlenp, is_inf);
5089                     data->longest = &(data->longest_float);
5090                 }
5091                 is_inf = is_inf_internal = 1;
5092                 if (flags & SCF_DO_STCLASS_OR) {
5093                     if (OP(scan) == CLUMP) {
5094                         /* Actually is any start char, but very few code points
5095                          * aren't start characters */
5096                         ssc_match_all_cp(data->start_class);
5097                     }
5098                     else {
5099                         ssc_anything(data->start_class);
5100                     }
5101                 }
5102                 flags &= ~SCF_DO_STCLASS;
5103                 break;
5104             }
5105         }
5106         else if (OP(scan) == LNBREAK) {
5107             if (flags & SCF_DO_STCLASS) {
5108                 if (flags & SCF_DO_STCLASS_AND) {
5109                     ssc_intersection(data->start_class,
5110                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5111                     ssc_clear_locale(data->start_class);
5112                     ANYOF_FLAGS(data->start_class)
5113                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5114                 }
5115                 else if (flags & SCF_DO_STCLASS_OR) {
5116                     ssc_union(data->start_class,
5117                               PL_XPosix_ptrs[_CC_VERTSPACE],
5118                               FALSE);
5119                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5120
5121                     /* See commit msg for
5122                      * 749e076fceedeb708a624933726e7989f2302f6a */
5123                     ANYOF_FLAGS(data->start_class)
5124                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5125                 }
5126                 flags &= ~SCF_DO_STCLASS;
5127             }
5128             min++;
5129             delta++;    /* Because of the 2 char string cr-lf */
5130             if (flags & SCF_DO_SUBSTR) {
5131                 /* Cannot expect anything... */
5132                 scan_commit(pRExC_state, data, minlenp, is_inf);
5133                 data->pos_min += 1;
5134                 data->pos_delta += 1;
5135                 data->longest = &(data->longest_float);
5136             }
5137         }
5138         else if (REGNODE_SIMPLE(OP(scan))) {
5139
5140             if (flags & SCF_DO_SUBSTR) {
5141                 scan_commit(pRExC_state, data, minlenp, is_inf);
5142                 data->pos_min++;
5143             }
5144             min++;
5145             if (flags & SCF_DO_STCLASS) {
5146                 bool invert = 0;
5147                 SV* my_invlist = NULL;
5148                 U8 namedclass;
5149
5150                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5151                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5152
5153                 /* Some of the logic below assumes that switching
5154                    locale on will only add false positives. */
5155                 switch (OP(scan)) {
5156
5157                 default:
5158 #ifdef DEBUGGING
5159                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5160                                                                      OP(scan));
5161 #endif
5162                 case CANY:
5163                 case SANY:
5164                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5165                         ssc_match_all_cp(data->start_class);
5166                     break;
5167
5168                 case REG_ANY:
5169                     {
5170                         SV* REG_ANY_invlist = _new_invlist(2);
5171                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5172                                                             '\n');
5173                         if (flags & SCF_DO_STCLASS_OR) {
5174                             ssc_union(data->start_class,
5175                                       REG_ANY_invlist,
5176                                       TRUE /* TRUE => invert, hence all but \n
5177                                             */
5178                                       );
5179                         }
5180                         else if (flags & SCF_DO_STCLASS_AND) {
5181                             ssc_intersection(data->start_class,
5182                                              REG_ANY_invlist,
5183                                              TRUE  /* TRUE => invert */
5184                                              );
5185                             ssc_clear_locale(data->start_class);
5186                         }
5187                         SvREFCNT_dec_NN(REG_ANY_invlist);
5188                     }
5189                     break;
5190
5191                 case ANYOF:
5192                     if (flags & SCF_DO_STCLASS_AND)
5193                         ssc_and(pRExC_state, data->start_class,
5194                                 (regnode_charclass *) scan);
5195                     else
5196                         ssc_or(pRExC_state, data->start_class,
5197                                                           (regnode_charclass *) scan);
5198                     break;
5199
5200                 case NPOSIXL:
5201                     invert = 1;
5202                     /* FALLTHROUGH */
5203
5204                 case POSIXL:
5205                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5206                     if (flags & SCF_DO_STCLASS_AND) {
5207                         bool was_there = cBOOL(
5208                                           ANYOF_POSIXL_TEST(data->start_class,
5209                                                                  namedclass));
5210                         ANYOF_POSIXL_ZERO(data->start_class);
5211                         if (was_there) {    /* Do an AND */
5212                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5213                         }
5214                         /* No individual code points can now match */
5215                         data->start_class->invlist
5216                                                 = sv_2mortal(_new_invlist(0));
5217                     }
5218                     else {
5219                         int complement = namedclass + ((invert) ? -1 : 1);
5220
5221                         assert(flags & SCF_DO_STCLASS_OR);
5222
5223                         /* If the complement of this class was already there,
5224                          * the result is that they match all code points,
5225                          * (\d + \D == everything).  Remove the classes from
5226                          * future consideration.  Locale is not relevant in
5227                          * this case */
5228                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5229                             ssc_match_all_cp(data->start_class);
5230                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5231                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5232                         }
5233                         else {  /* The usual case; just add this class to the
5234                                    existing set */
5235                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5236                         }
5237                     }
5238                     break;
5239
5240                 case NPOSIXA:   /* For these, we always know the exact set of
5241                                    what's matched */
5242                     invert = 1;
5243                     /* FALLTHROUGH */
5244                 case POSIXA:
5245                     if (FLAGS(scan) == _CC_ASCII) {
5246                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5247                     }
5248                     else {
5249                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5250                                               PL_XPosix_ptrs[_CC_ASCII],
5251                                               &my_invlist);
5252                     }
5253                     goto join_posix;
5254
5255                 case NPOSIXD:
5256                 case NPOSIXU:
5257                     invert = 1;
5258                     /* FALLTHROUGH */
5259                 case POSIXD:
5260                 case POSIXU:
5261                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5262
5263                     /* NPOSIXD matches all upper Latin1 code points unless the
5264                      * target string being matched is UTF-8, which is
5265                      * unknowable until match time.  Since we are going to
5266                      * invert, we want to get rid of all of them so that the
5267                      * inversion will match all */
5268                     if (OP(scan) == NPOSIXD) {
5269                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5270                                           &my_invlist);
5271                     }
5272
5273                   join_posix:
5274
5275                     if (flags & SCF_DO_STCLASS_AND) {
5276                         ssc_intersection(data->start_class, my_invlist, invert);
5277                         ssc_clear_locale(data->start_class);
5278                     }
5279                     else {
5280                         assert(flags & SCF_DO_STCLASS_OR);
5281                         ssc_union(data->start_class, my_invlist, invert);
5282                     }
5283                     SvREFCNT_dec(my_invlist);
5284                 }
5285                 if (flags & SCF_DO_STCLASS_OR)
5286                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5287                 flags &= ~SCF_DO_STCLASS;
5288             }
5289         }
5290         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5291             data->flags |= (OP(scan) == MEOL
5292                             ? SF_BEFORE_MEOL
5293                             : SF_BEFORE_SEOL);
5294             scan_commit(pRExC_state, data, minlenp, is_inf);
5295
5296         }
5297         else if (  PL_regkind[OP(scan)] == BRANCHJ
5298                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5299                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5300                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5301         {
5302             if ( OP(scan) == UNLESSM &&
5303                  scan->flags == 0 &&
5304                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5305                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5306             ) {
5307                 regnode *opt;
5308                 regnode *upto= regnext(scan);
5309                 DEBUG_PARSE_r({
5310                     DEBUG_STUDYDATA("OPFAIL",data,depth);
5311
5312                     /*DEBUG_PARSE_MSG("opfail");*/
5313                     regprop(RExC_rx, RExC_mysv, upto, NULL, pRExC_state);
5314                     PerlIO_printf(Perl_debug_log,
5315                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5316                         SvPV_nolen_const(RExC_mysv),
5317                         (IV)REG_NODE_NUM(upto),
5318                         (IV)(upto - scan)
5319                     );
5320                 });
5321                 OP(scan) = OPFAIL;
5322                 NEXT_OFF(scan) = upto - scan;
5323                 for (opt= scan + 1; opt < upto ; opt++)
5324                     OP(opt) = OPTIMIZED;
5325                 scan= upto;
5326                 continue;
5327             }
5328             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5329                 || OP(scan) == UNLESSM )
5330             {
5331                 /* Negative Lookahead/lookbehind
5332                    In this case we can't do fixed string optimisation.
5333                 */
5334
5335                 SSize_t deltanext, minnext, fake = 0;
5336                 regnode *nscan;
5337                 regnode_ssc intrnl;
5338                 int f = 0;
5339
5340                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5341                 if (data) {
5342                     data_fake.whilem_c = data->whilem_c;
5343                     data_fake.last_closep = data->last_closep;
5344                 }
5345                 else
5346                     data_fake.last_closep = &fake;
5347                 data_fake.pos_delta = delta;
5348                 if ( flags & SCF_DO_STCLASS && !scan->flags
5349                      && OP(scan) == IFMATCH ) { /* Lookahead */
5350                     ssc_init(pRExC_state, &intrnl);
5351                     data_fake.start_class = &intrnl;
5352                     f |= SCF_DO_STCLASS_AND;
5353                 }
5354                 if (flags & SCF_WHILEM_VISITED_POS)
5355                     f |= SCF_WHILEM_VISITED_POS;
5356                 next = regnext(scan);
5357                 nscan = NEXTOPER(NEXTOPER(scan));
5358                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5359                                       last, &data_fake, stopparen,
5360                                       recursed_depth, NULL, f, depth+1);
5361                 if (scan->flags) {
5362                     if (deltanext) {
5363                         FAIL("Variable length lookbehind not implemented");
5364                     }
5365                     else if (minnext > (I32)U8_MAX) {
5366                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5367                               (UV)U8_MAX);
5368                     }
5369                     scan->flags = (U8)minnext;
5370                 }
5371                 if (data) {
5372                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5373                         pars++;
5374                     if (data_fake.flags & SF_HAS_EVAL)
5375                         data->flags |= SF_HAS_EVAL;
5376                     data->whilem_c = data_fake.whilem_c;
5377                 }
5378                 if (f & SCF_DO_STCLASS_AND) {
5379                     if (flags & SCF_DO_STCLASS_OR) {
5380                         /* OR before, AND after: ideally we would recurse with
5381                          * data_fake to get the AND applied by study of the
5382                          * remainder of the pattern, and then derecurse;
5383                          * *** HACK *** for now just treat as "no information".
5384                          * See [perl #56690].
5385                          */
5386                         ssc_init(pRExC_state, data->start_class);
5387                     }  else {
5388                         /* AND before and after: combine and continue.  These
5389                          * assertions are zero-length, so can match an EMPTY
5390                          * string */
5391                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5392                         ANYOF_FLAGS(data->start_class)
5393                                                    |= SSC_MATCHES_EMPTY_STRING;
5394                     }
5395                 }
5396             }
5397 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5398             else {
5399                 /* Positive Lookahead/lookbehind
5400                    In this case we can do fixed string optimisation,
5401                    but we must be careful about it. Note in the case of
5402                    lookbehind the positions will be offset by the minimum
5403                    length of the pattern, something we won't know about
5404                    until after the recurse.
5405                 */
5406                 SSize_t deltanext, fake = 0;
5407                 regnode *nscan;
5408                 regnode_ssc intrnl;
5409                 int f = 0;
5410                 /* We use SAVEFREEPV so that when the full compile
5411                     is finished perl will clean up the allocated
5412                     minlens when it's all done. This way we don't
5413                     have to worry about freeing them when we know
5414                     they wont be used, which would be a pain.
5415                  */
5416                 SSize_t *minnextp;
5417                 Newx( minnextp, 1, SSize_t );
5418                 SAVEFREEPV(minnextp);
5419
5420                 if (data) {
5421                     StructCopy(data, &data_fake, scan_data_t);
5422                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5423                         f |= SCF_DO_SUBSTR;
5424                         if (scan->flags)
5425                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5426                         data_fake.last_found=newSVsv(data->last_found);
5427                     }
5428                 }
5429                 else
5430                     data_fake.last_closep = &fake;
5431                 data_fake.flags = 0;
5432                 data_fake.pos_delta = delta;
5433                 if (is_inf)
5434                     data_fake.flags |= SF_IS_INF;
5435                 if ( flags & SCF_DO_STCLASS && !scan->flags
5436                      && OP(scan) == IFMATCH ) { /* Lookahead */
5437                     ssc_init(pRExC_state, &intrnl);
5438                     data_fake.start_class = &intrnl;
5439                     f |= SCF_DO_STCLASS_AND;
5440                 }
5441                 if (flags & SCF_WHILEM_VISITED_POS)
5442                     f |= SCF_WHILEM_VISITED_POS;
5443                 next = regnext(scan);
5444                 nscan = NEXTOPER(NEXTOPER(scan));
5445
5446                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5447                                         &deltanext, last, &data_fake,
5448                                         stopparen, recursed_depth, NULL,
5449                                         f,depth+1);
5450                 if (scan->flags) {
5451                     if (deltanext) {
5452                         FAIL("Variable length lookbehind not implemented");
5453                     }
5454                     else if (*minnextp > (I32)U8_MAX) {
5455                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5456                               (UV)U8_MAX);
5457                     }
5458                     scan->flags = (U8)*minnextp;
5459                 }
5460
5461                 *minnextp += min;
5462
5463                 if (f & SCF_DO_STCLASS_AND) {
5464                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5465                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5466                 }
5467                 if (data) {
5468                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5469                         pars++;
5470                     if (data_fake.flags & SF_HAS_EVAL)
5471                         data->flags |= SF_HAS_EVAL;
5472                     data->whilem_c = data_fake.whilem_c;
5473                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5474                         if (RExC_rx->minlen<*minnextp)
5475                             RExC_rx->minlen=*minnextp;
5476                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5477                         SvREFCNT_dec_NN(data_fake.last_found);
5478
5479                         if ( data_fake.minlen_fixed != minlenp )
5480                         {
5481                             data->offset_fixed= data_fake.offset_fixed;
5482                             data->minlen_fixed= data_fake.minlen_fixed;
5483                             data->lookbehind_fixed+= scan->flags;
5484                         }
5485                         if ( data_fake.minlen_float != minlenp )
5486                         {
5487                             data->minlen_float= data_fake.minlen_float;
5488                             data->offset_float_min=data_fake.offset_float_min;
5489                             data->offset_float_max=data_fake.offset_float_max;
5490                             data->lookbehind_float+= scan->flags;
5491                         }
5492                     }
5493                 }
5494             }
5495 #endif
5496         }
5497         else if (OP(scan) == OPEN) {
5498             if (stopparen != (I32)ARG(scan))
5499                 pars++;
5500         }
5501         else if (OP(scan) == CLOSE) {
5502             if (stopparen == (I32)ARG(scan)) {
5503                 break;
5504             }
5505             if ((I32)ARG(scan) == is_par) {
5506                 next = regnext(scan);
5507
5508                 if ( next && (OP(next) != WHILEM) && next < last)
5509                     is_par = 0;         /* Disable optimization */
5510             }
5511             if (data)
5512                 *(data->last_closep) = ARG(scan);
5513         }
5514         else if (OP(scan) == EVAL) {
5515                 if (data)
5516                     data->flags |= SF_HAS_EVAL;
5517         }
5518         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5519             if (flags & SCF_DO_SUBSTR) {
5520                 scan_commit(pRExC_state, data, minlenp, is_inf);
5521                 flags &= ~SCF_DO_SUBSTR;
5522             }
5523             if (data && OP(scan)==ACCEPT) {
5524                 data->flags |= SCF_SEEN_ACCEPT;
5525                 if (stopmin > min)
5526                     stopmin = min;
5527             }
5528         }
5529         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5530         {
5531                 if (flags & SCF_DO_SUBSTR) {
5532                     scan_commit(pRExC_state, data, minlenp, is_inf);
5533                     data->longest = &(data->longest_float);
5534                 }
5535                 is_inf = is_inf_internal = 1;
5536                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5537                     ssc_anything(data->start_class);
5538                 flags &= ~SCF_DO_STCLASS;
5539         }
5540         else if (OP(scan) == GPOS) {
5541             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5542                 !(delta || is_inf || (data && data->pos_delta)))
5543             {
5544                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5545                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5546                 if (RExC_rx->gofs < (STRLEN)min)
5547                     RExC_rx->gofs = min;
5548             } else {
5549                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5550                 RExC_rx->gofs = 0;
5551             }
5552         }
5553 #ifdef TRIE_STUDY_OPT
5554 #ifdef FULL_TRIE_STUDY
5555         else if (PL_regkind[OP(scan)] == TRIE) {
5556             /* NOTE - There is similar code to this block above for handling
5557                BRANCH nodes on the initial study.  If you change stuff here
5558                check there too. */
5559             regnode *trie_node= scan;
5560             regnode *tail= regnext(scan);
5561             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5562             SSize_t max1 = 0, min1 = SSize_t_MAX;
5563             regnode_ssc accum;
5564
5565             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5566                 /* Cannot merge strings after this. */
5567                 scan_commit(pRExC_state, data, minlenp, is_inf);
5568             }
5569             if (flags & SCF_DO_STCLASS)
5570                 ssc_init_zero(pRExC_state, &accum);
5571
5572             if (!trie->jump) {
5573                 min1= trie->minlen;
5574                 max1= trie->maxlen;
5575             } else {
5576                 const regnode *nextbranch= NULL;
5577                 U32 word;
5578
5579                 for ( word=1 ; word <= trie->wordcount ; word++)
5580                 {
5581                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5582                     regnode_ssc this_class;
5583
5584                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5585                     if (data) {
5586                         data_fake.whilem_c = data->whilem_c;
5587                         data_fake.last_closep = data->last_closep;
5588                     }
5589                     else
5590                         data_fake.last_closep = &fake;
5591                     data_fake.pos_delta = delta;
5592                     if (flags & SCF_DO_STCLASS) {
5593                         ssc_init(pRExC_state, &this_class);
5594                         data_fake.start_class = &this_class;
5595                         f = SCF_DO_STCLASS_AND;
5596                     }
5597                     if (flags & SCF_WHILEM_VISITED_POS)
5598                         f |= SCF_WHILEM_VISITED_POS;
5599
5600                     if (trie->jump[word]) {
5601                         if (!nextbranch)
5602                             nextbranch = trie_node + trie->jump[0];
5603                         scan= trie_node + trie->jump[word];
5604                         /* We go from the jump point to the branch that follows
5605                            it. Note this means we need the vestigal unused
5606                            branches even though they arent otherwise used. */
5607                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5608                             &deltanext, (regnode *)nextbranch, &data_fake,
5609                             stopparen, recursed_depth, NULL, f,depth+1);
5610                     }
5611                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5612                         nextbranch= regnext((regnode*)nextbranch);
5613
5614                     if (min1 > (SSize_t)(minnext + trie->minlen))
5615                         min1 = minnext + trie->minlen;
5616                     if (deltanext == SSize_t_MAX) {
5617                         is_inf = is_inf_internal = 1;
5618                         max1 = SSize_t_MAX;
5619                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5620                         max1 = minnext + deltanext + trie->maxlen;
5621
5622                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5623                         pars++;
5624                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5625                         if ( stopmin > min + min1)
5626                             stopmin = min + min1;
5627                         flags &= ~SCF_DO_SUBSTR;
5628                         if (data)
5629                             data->flags |= SCF_SEEN_ACCEPT;
5630                     }
5631                     if (data) {
5632                         if (data_fake.flags & SF_HAS_EVAL)
5633                             data->flags |= SF_HAS_EVAL;
5634                         data->whilem_c = data_fake.whilem_c;
5635                     }
5636                     if (flags & SCF_DO_STCLASS)
5637                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5638                 }
5639             }
5640             if (flags & SCF_DO_SUBSTR) {
5641                 data->pos_min += min1;
5642                 data->pos_delta += max1 - min1;
5643                 if (max1 != min1 || is_inf)
5644                     data->longest = &(data->longest_float);
5645             }
5646             min += min1;
5647             delta += max1 - min1;
5648             if (flags & SCF_DO_STCLASS_OR) {
5649                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5650                 if (min1) {
5651                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5652                     flags &= ~SCF_DO_STCLASS;
5653                 }
5654             }
5655             else if (flags & SCF_DO_STCLASS_AND) {
5656                 if (min1) {
5657                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5658                     flags &= ~SCF_DO_STCLASS;
5659                 }
5660                 else {
5661                     /* Switch to OR mode: cache the old value of
5662                      * data->start_class */
5663                     INIT_AND_WITHP;
5664                     StructCopy(data->start_class, and_withp, regnode_ssc);
5665                     flags &= ~SCF_DO_STCLASS_AND;
5666                     StructCopy(&accum, data->start_class, regnode_ssc);
5667                     flags |= SCF_DO_STCLASS_OR;
5668                 }
5669             }
5670             scan= tail;
5671             continue;
5672         }
5673 #else
5674         else if (PL_regkind[OP(scan)] == TRIE) {
5675             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5676             U8*bang=NULL;
5677
5678             min += trie->minlen;
5679             delta += (trie->maxlen - trie->minlen);
5680             flags &= ~SCF_DO_STCLASS; /* xxx */
5681             if (flags & SCF_DO_SUBSTR) {
5682                 /* Cannot expect anything... */
5683                 scan_commit(pRExC_state, data, minlenp, is_inf);
5684                 data->pos_min += trie->minlen;
5685                 data->pos_delta += (trie->maxlen - trie->minlen);
5686                 if (trie->maxlen != trie->minlen)
5687                     data->longest = &(data->longest_float);
5688             }
5689             if (trie->jump) /* no more substrings -- for now /grr*/
5690                flags &= ~SCF_DO_SUBSTR;
5691         }
5692 #endif /* old or new */
5693 #endif /* TRIE_STUDY_OPT */
5694
5695         /* Else: zero-length, ignore. */
5696         scan = regnext(scan);
5697     }
5698     /* If we are exiting a recursion we can unset its recursed bit
5699      * and allow ourselves to enter it again - no danger of an
5700      * infinite loop there.
5701     if (stopparen > -1 && recursed) {
5702         DEBUG_STUDYDATA("unset:", data,depth);
5703         PAREN_UNSET( recursed, stopparen);
5704     }
5705     */
5706     if (frame) {
5707         depth = depth - 1;
5708
5709         DEBUG_STUDYDATA("frame-end:",data,depth);
5710         DEBUG_PEEP("fend", scan, depth);
5711
5712         /* restore previous context */
5713         last = frame->last_regnode;
5714         scan = frame->next_regnode;
5715         stopparen = frame->stopparen;
5716         recursed_depth = frame->prev_recursed_depth;
5717
5718         RExC_frame_last = frame->prev_frame;
5719         frame = frame->this_prev_frame;
5720         goto fake_study_recurse;
5721     }
5722
5723   finish:
5724     assert(!frame);
5725     DEBUG_STUDYDATA("pre-fin:",data,depth);
5726
5727     *scanp = scan;
5728     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5729
5730     if (flags & SCF_DO_SUBSTR && is_inf)
5731         data->pos_delta = SSize_t_MAX - data->pos_min;
5732     if (is_par > (I32)U8_MAX)
5733         is_par = 0;
5734     if (is_par && pars==1 && data) {
5735         data->flags |= SF_IN_PAR;
5736         data->flags &= ~SF_HAS_PAR;
5737     }
5738     else if (pars && data) {
5739         data->flags |= SF_HAS_PAR;
5740         data->flags &= ~SF_IN_PAR;
5741     }
5742     if (flags & SCF_DO_STCLASS_OR)
5743         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5744     if (flags & SCF_TRIE_RESTUDY)
5745         data->flags |=  SCF_TRIE_RESTUDY;
5746
5747     DEBUG_STUDYDATA("post-fin:",data,depth);
5748
5749     {
5750         SSize_t final_minlen= min < stopmin ? min : stopmin;
5751
5752         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5753             RExC_maxlen = final_minlen + delta;
5754         }
5755         return final_minlen;
5756     }
5757     NOT_REACHED;
5758 }
5759
5760 STATIC U32
5761 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5762 {
5763     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5764
5765     PERL_ARGS_ASSERT_ADD_DATA;
5766
5767     Renewc(RExC_rxi->data,
5768            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5769            char, struct reg_data);
5770     if(count)
5771         Renew(RExC_rxi->data->what, count + n, U8);
5772     else
5773         Newx(RExC_rxi->data->what, n, U8);
5774     RExC_rxi->data->count = count + n;
5775     Copy(s, RExC_rxi->data->what + count, n, U8);
5776     return count;
5777 }
5778
5779 /*XXX: todo make this not included in a non debugging perl, but appears to be
5780  * used anyway there, in 'use re' */
5781 #ifndef PERL_IN_XSUB_RE
5782 void
5783 Perl_reginitcolors(pTHX)
5784 {
5785     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5786     if (s) {
5787         char *t = savepv(s);
5788         int i = 0;
5789         PL_colors[0] = t;
5790         while (++i < 6) {
5791             t = strchr(t, '\t');
5792             if (t) {
5793                 *t = '\0';
5794                 PL_colors[i] = ++t;
5795             }
5796             else
5797                 PL_colors[i] = t = (char *)"";
5798         }
5799     } else {
5800         int i = 0;
5801         while (i < 6)
5802             PL_colors[i++] = (char *)"";
5803     }
5804     PL_colorset = 1;
5805 }
5806 #endif
5807
5808
5809 #ifdef TRIE_STUDY_OPT
5810 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5811     STMT_START {                                            \
5812         if (                                                \
5813               (data.flags & SCF_TRIE_RESTUDY)               \
5814               && ! restudied++                              \
5815         ) {                                                 \
5816             dOsomething;                                    \
5817             goto reStudy;                                   \
5818         }                                                   \
5819     } STMT_END
5820 #else
5821 #define CHECK_RESTUDY_GOTO_butfirst
5822 #endif
5823
5824 /*
5825  * pregcomp - compile a regular expression into internal code
5826  *
5827  * Decides which engine's compiler to call based on the hint currently in
5828  * scope
5829  */
5830
5831 #ifndef PERL_IN_XSUB_RE
5832
5833 /* return the currently in-scope regex engine (or the default if none)  */
5834
5835 regexp_engine const *
5836 Perl_current_re_engine(pTHX)
5837 {
5838     if (IN_PERL_COMPILETIME) {
5839         HV * const table = GvHV(PL_hintgv);
5840         SV **ptr;
5841
5842         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5843             return &PL_core_reg_engine;
5844         ptr = hv_fetchs(table, "regcomp", FALSE);
5845         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5846             return &PL_core_reg_engine;
5847         return INT2PTR(regexp_engine*,SvIV(*ptr));
5848     }
5849     else {
5850         SV *ptr;
5851         if (!PL_curcop->cop_hints_hash)
5852             return &PL_core_reg_engine;
5853         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5854         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5855             return &PL_core_reg_engine;
5856         return INT2PTR(regexp_engine*,SvIV(ptr));
5857     }
5858 }
5859
5860
5861 REGEXP *
5862 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5863 {
5864     regexp_engine const *eng = current_re_engine();
5865     GET_RE_DEBUG_FLAGS_DECL;
5866
5867     PERL_ARGS_ASSERT_PREGCOMP;
5868
5869     /* Dispatch a request to compile a regexp to correct regexp engine. */
5870     DEBUG_COMPILE_r({
5871         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5872                         PTR2UV(eng));
5873     });
5874     return CALLREGCOMP_ENG(eng, pattern, flags);
5875 }
5876 #endif
5877
5878 /* public(ish) entry point for the perl core's own regex compiling code.
5879  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5880  * pattern rather than a list of OPs, and uses the internal engine rather
5881  * than the current one */
5882
5883 REGEXP *
5884 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5885 {
5886     SV *pat = pattern; /* defeat constness! */
5887     PERL_ARGS_ASSERT_RE_COMPILE;
5888     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5889 #ifdef PERL_IN_XSUB_RE
5890                                 &my_reg_engine,
5891 #else
5892                                 &PL_core_reg_engine,
5893 #endif
5894                                 NULL, NULL, rx_flags, 0);
5895 }
5896
5897
5898 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5899  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5900  * point to the realloced string and length.
5901  *
5902  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5903  * stuff added */
5904
5905 static void
5906 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5907                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5908 {
5909     U8 *const src = (U8*)*pat_p;
5910     U8 *dst, *d;
5911     int n=0;
5912     STRLEN s = 0;
5913     bool do_end = 0;
5914     GET_RE_DEBUG_FLAGS_DECL;
5915
5916     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5917         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5918
5919     Newx(dst, *plen_p * 2 + 1, U8);
5920     d = dst;
5921
5922     while (s < *plen_p) {
5923         append_utf8_from_native_byte(src[s], &d);
5924         if (n < num_code_blocks) {
5925             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5926                 pRExC_state->code_blocks[n].start = d - dst - 1;
5927                 assert(*(d - 1) == '(');
5928                 do_end = 1;
5929             }
5930             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5931                 pRExC_state->code_blocks[n].end = d - dst - 1;
5932                 assert(*(d - 1) == ')');
5933                 do_end = 0;
5934                 n++;
5935             }
5936         }
5937         s++;
5938     }
5939     *d = '\0';
5940     *plen_p = d - dst;
5941     *pat_p = (char*) dst;
5942     SAVEFREEPV(*pat_p);
5943     RExC_orig_utf8 = RExC_utf8 = 1;
5944 }
5945
5946
5947
5948 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5949  * while recording any code block indices, and handling overloading,
5950  * nested qr// objects etc.  If pat is null, it will allocate a new
5951  * string, or just return the first arg, if there's only one.
5952  *
5953  * Returns the malloced/updated pat.
5954  * patternp and pat_count is the array of SVs to be concatted;
5955  * oplist is the optional list of ops that generated the SVs;
5956  * recompile_p is a pointer to a boolean that will be set if
5957  *   the regex will need to be recompiled.
5958  * delim, if non-null is an SV that will be inserted between each element
5959  */
5960
5961 static SV*
5962 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5963                 SV *pat, SV ** const patternp, int pat_count,
5964                 OP *oplist, bool *recompile_p, SV *delim)
5965 {
5966     SV **svp;
5967     int n = 0;
5968     bool use_delim = FALSE;
5969     bool alloced = FALSE;
5970
5971     /* if we know we have at least two args, create an empty string,
5972      * then concatenate args to that. For no args, return an empty string */
5973     if (!pat && pat_count != 1) {
5974         pat = newSVpvs("");
5975         SAVEFREESV(pat);
5976         alloced = TRUE;
5977     }
5978
5979     for (svp = patternp; svp < patternp + pat_count; svp++) {
5980         SV *sv;
5981         SV *rx  = NULL;
5982         STRLEN orig_patlen = 0;
5983         bool code = 0;
5984         SV *msv = use_delim ? delim : *svp;
5985         if (!msv) msv = &PL_sv_undef;
5986
5987         /* if we've got a delimiter, we go round the loop twice for each
5988          * svp slot (except the last), using the delimiter the second
5989          * time round */
5990         if (use_delim) {
5991             svp--;
5992             use_delim = FALSE;
5993         }
5994         else if (delim)
5995             use_delim = TRUE;
5996
5997         if (SvTYPE(msv) == SVt_PVAV) {
5998             /* we've encountered an interpolated array within
5999              * the pattern, e.g. /...@a..../. Expand the list of elements,
6000              * then recursively append elements.
6001              * The code in this block is based on S_pushav() */
6002
6003             AV *const av = (AV*)msv;
6004             const SSize_t maxarg = AvFILL(av) + 1;
6005             SV **array;
6006
6007             if (oplist) {
6008                 assert(oplist->op_type == OP_PADAV
6009                     || oplist->op_type == OP_RV2AV);
6010                 oplist = OP_SIBLING(oplist);
6011             }
6012
6013             if (SvRMAGICAL(av)) {
6014                 SSize_t i;
6015
6016                 Newx(array, maxarg, SV*);
6017                 SAVEFREEPV(array);
6018                 for (i=0; i < maxarg; i++) {
6019                     SV ** const svp = av_fetch(av, i, FALSE);
6020                     array[i] = svp ? *svp : &PL_sv_undef;
6021                 }
6022             }
6023             else
6024                 array = AvARRAY(av);
6025
6026             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6027                                 array, maxarg, NULL, recompile_p,
6028                                 /* $" */
6029                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6030
6031             continue;
6032         }
6033
6034
6035         /* we make the assumption here that each op in the list of
6036          * op_siblings maps to one SV pushed onto the stack,
6037          * except for code blocks, with have both an OP_NULL and
6038          * and OP_CONST.
6039          * This allows us to match up the list of SVs against the
6040          * list of OPs to find the next code block.
6041          *
6042          * Note that       PUSHMARK PADSV PADSV ..
6043          * is optimised to
6044          *                 PADRANGE PADSV  PADSV  ..
6045          * so the alignment still works. */
6046
6047         if (oplist) {
6048             if (oplist->op_type == OP_NULL
6049                 && (oplist->op_flags & OPf_SPECIAL))
6050             {
6051                 assert(n < pRExC_state->num_code_blocks);
6052                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6053                 pRExC_state->code_blocks[n].block = oplist;
6054                 pRExC_state->code_blocks[n].src_regex = NULL;
6055                 n++;
6056                 code = 1;
6057                 oplist = OP_SIBLING(oplist); /* skip CONST */
6058                 assert(oplist);
6059             }
6060             oplist = OP_SIBLING(oplist);;
6061         }
6062
6063         /* apply magic and QR overloading to arg */
6064
6065         SvGETMAGIC(msv);
6066         if (SvROK(msv) && SvAMAGIC(msv)) {
6067             SV *sv = AMG_CALLunary(msv, regexp_amg);
6068             if (sv) {
6069                 if (SvROK(sv))
6070                     sv = SvRV(sv);
6071                 if (SvTYPE(sv) != SVt_REGEXP)
6072                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6073                 msv = sv;
6074             }
6075         }
6076
6077         /* try concatenation overload ... */
6078         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6079                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6080         {
6081             sv_setsv(pat, sv);
6082             /* overloading involved: all bets are off over literal
6083              * code. Pretend we haven't seen it */
6084             pRExC_state->num_code_blocks -= n;
6085             n = 0;
6086         }
6087         else  {
6088             /* ... or failing that, try "" overload */
6089             while (SvAMAGIC(msv)
6090                     && (sv = AMG_CALLunary(msv, string_amg))
6091                     && sv != msv
6092                     &&  !(   SvROK(msv)
6093                           && SvROK(sv)
6094                           && SvRV(msv) == SvRV(sv))
6095             ) {
6096                 msv = sv;
6097                 SvGETMAGIC(msv);
6098             }
6099             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6100                 msv = SvRV(msv);
6101
6102             if (pat) {
6103                 /* this is a partially unrolled
6104                  *     sv_catsv_nomg(pat, msv);
6105                  * that allows us to adjust code block indices if
6106                  * needed */
6107                 STRLEN dlen;
6108                 char *dst = SvPV_force_nomg(pat, dlen);
6109                 orig_patlen = dlen;
6110                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6111                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6112                     sv_setpvn(pat, dst, dlen);
6113                     SvUTF8_on(pat);
6114                 }
6115                 sv_catsv_nomg(pat, msv);
6116                 rx = msv;
6117             }
6118             else
6119                 pat = msv;
6120
6121             if (code)
6122                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6123         }
6124
6125         /* extract any code blocks within any embedded qr//'s */
6126         if (rx && SvTYPE(rx) == SVt_REGEXP
6127             && RX_ENGINE((REGEXP*)rx)->op_comp)
6128         {
6129
6130             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6131             if (ri->num_code_blocks) {
6132                 int i;
6133                 /* the presence of an embedded qr// with code means
6134                  * we should always recompile: the text of the
6135                  * qr// may not have changed, but it may be a
6136                  * different closure than last time */
6137                 *recompile_p = 1;
6138                 Renew(pRExC_state->code_blocks,
6139                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6140                     struct reg_code_block);
6141                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6142
6143                 for (i=0; i < ri->num_code_blocks; i++) {
6144                     struct reg_code_block *src, *dst;
6145                     STRLEN offset =  orig_patlen
6146                         + ReANY((REGEXP *)rx)->pre_prefix;
6147                     assert(n < pRExC_state->num_code_blocks);
6148                     src = &ri->code_blocks[i];
6149                     dst = &pRExC_state->code_blocks[n];
6150                     dst->start      = src->start + offset;
6151                     dst->end        = src->end   + offset;
6152                     dst->block      = src->block;
6153                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6154                                             src->src_regex
6155                                                 ? src->src_regex
6156                                                 : (REGEXP*)rx);
6157                     n++;
6158                 }
6159             }
6160         }
6161     }
6162     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6163     if (alloced)
6164         SvSETMAGIC(pat);
6165
6166     return pat;
6167 }
6168
6169
6170
6171 /* see if there are any run-time code blocks in the pattern.
6172  * False positives are allowed */
6173
6174 static bool
6175 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6176                     char *pat, STRLEN plen)
6177 {
6178     int n = 0;
6179     STRLEN s;
6180
6181     PERL_UNUSED_CONTEXT;
6182
6183     for (s = 0; s < plen; s++) {
6184         if (n < pRExC_state->num_code_blocks
6185             && s == pRExC_state->code_blocks[n].start)
6186         {
6187             s = pRExC_state->code_blocks[n].end;
6188             n++;
6189             continue;
6190         }
6191         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6192          * positives here */
6193         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6194             (pat[s+2] == '{'
6195                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6196         )
6197             return 1;
6198     }
6199     return 0;
6200 }
6201
6202 /* Handle run-time code blocks. We will already have compiled any direct
6203  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6204  * copy of it, but with any literal code blocks blanked out and
6205  * appropriate chars escaped; then feed it into
6206  *
6207  *    eval "qr'modified_pattern'"
6208  *
6209  * For example,
6210  *
6211  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6212  *
6213  * becomes
6214  *
6215  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6216  *
6217  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6218  * and merge them with any code blocks of the original regexp.
6219  *
6220  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6221  * instead, just save the qr and return FALSE; this tells our caller that
6222  * the original pattern needs upgrading to utf8.
6223  */
6224
6225 static bool
6226 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6227     char *pat, STRLEN plen)
6228 {
6229     SV *qr;
6230
6231     GET_RE_DEBUG_FLAGS_DECL;
6232
6233     if (pRExC_state->runtime_code_qr) {
6234         /* this is the second time we've been called; this should
6235          * only happen if the main pattern got upgraded to utf8
6236          * during compilation; re-use the qr we compiled first time
6237          * round (which should be utf8 too)
6238          */
6239         qr = pRExC_state->runtime_code_qr;
6240         pRExC_state->runtime_code_qr = NULL;
6241         assert(RExC_utf8 && SvUTF8(qr));
6242     }
6243     else {
6244         int n = 0;
6245         STRLEN s;
6246         char *p, *newpat;
6247         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6248         SV *sv, *qr_ref;
6249         dSP;
6250
6251         /* determine how many extra chars we need for ' and \ escaping */
6252         for (s = 0; s < plen; s++) {
6253             if (pat[s] == '\'' || pat[s] == '\\')
6254                 newlen++;
6255         }
6256
6257         Newx(newpat, newlen, char);
6258         p = newpat;
6259         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6260
6261         for (s = 0; s < plen; s++) {
6262             if (n < pRExC_state->num_code_blocks
6263                 && s == pRExC_state->code_blocks[n].start)
6264             {
6265                 /* blank out literal code block */
6266                 assert(pat[s] == '(');
6267                 while (s <= pRExC_state->code_blocks[n].end) {
6268                     *p++ = '_';
6269                     s++;
6270                 }
6271                 s--;
6272                 n++;
6273                 continue;
6274             }
6275             if (pat[s] == '\'' || pat[s] == '\\')
6276                 *p++ = '\\';
6277             *p++ = pat[s];
6278         }
6279         *p++ = '\'';
6280         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6281             *p++ = 'x';
6282         *p++ = '\0';
6283         DEBUG_COMPILE_r({
6284             PerlIO_printf(Perl_debug_log,
6285                 "%sre-parsing pattern for runtime code:%s %s\n",
6286                 PL_colors[4],PL_colors[5],newpat);
6287         });
6288
6289         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6290         Safefree(newpat);
6291
6292         ENTER;
6293         SAVETMPS;
6294         PUSHSTACKi(PERLSI_REQUIRE);
6295         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6296          * parsing qr''; normally only q'' does this. It also alters
6297          * hints handling */
6298         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6299         SvREFCNT_dec_NN(sv);
6300         SPAGAIN;
6301         qr_ref = POPs;
6302         PUTBACK;
6303         {
6304             SV * const errsv = ERRSV;
6305             if (SvTRUE_NN(errsv))
6306             {
6307                 Safefree(pRExC_state->code_blocks);
6308                 /* use croak_sv ? */
6309                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6310             }
6311         }
6312         assert(SvROK(qr_ref));
6313         qr = SvRV(qr_ref);
6314         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6315         /* the leaving below frees the tmp qr_ref.
6316          * Give qr a life of its own */
6317         SvREFCNT_inc(qr);
6318         POPSTACK;
6319         FREETMPS;
6320         LEAVE;
6321
6322     }
6323
6324     if (!RExC_utf8 && SvUTF8(qr)) {
6325         /* first time through; the pattern got upgraded; save the
6326          * qr for the next time through */
6327         assert(!pRExC_state->runtime_code_qr);
6328         pRExC_state->runtime_code_qr = qr;
6329         return 0;
6330     }
6331
6332
6333     /* extract any code blocks within the returned qr//  */
6334
6335
6336     /* merge the main (r1) and run-time (r2) code blocks into one */
6337     {
6338         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6339         struct reg_code_block *new_block, *dst;
6340         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6341         int i1 = 0, i2 = 0;
6342
6343         if (!r2->num_code_blocks) /* we guessed wrong */
6344         {
6345             SvREFCNT_dec_NN(qr);
6346             return 1;
6347         }
6348
6349         Newx(new_block,
6350             r1->num_code_blocks + r2->num_code_blocks,
6351             struct reg_code_block);
6352         dst = new_block;
6353
6354         while (    i1 < r1->num_code_blocks
6355                 || i2 < r2->num_code_blocks)
6356         {
6357             struct reg_code_block *src;
6358             bool is_qr = 0;
6359
6360             if (i1 == r1->num_code_blocks) {
6361                 src = &r2->code_blocks[i2++];
6362                 is_qr = 1;
6363             }
6364             else if (i2 == r2->num_code_blocks)
6365                 src = &r1->code_blocks[i1++];
6366             else if (  r1->code_blocks[i1].start
6367                      < r2->code_blocks[i2].start)
6368             {
6369                 src = &r1->code_blocks[i1++];
6370                 assert(src->end < r2->code_blocks[i2].start);
6371             }
6372             else {
6373                 assert(  r1->code_blocks[i1].start
6374                        > r2->code_blocks[i2].start);
6375                 src = &r2->code_blocks[i2++];
6376                 is_qr = 1;
6377                 assert(src->end < r1->code_blocks[i1].start);
6378             }
6379
6380             assert(pat[src->start] == '(');
6381             assert(pat[src->end]   == ')');
6382             dst->start      = src->start;
6383             dst->end        = src->end;
6384             dst->block      = src->block;
6385             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6386                                     : src->src_regex;
6387             dst++;
6388         }
6389         r1->num_code_blocks += r2->num_code_blocks;
6390         Safefree(r1->code_blocks);
6391         r1->code_blocks = new_block;
6392     }
6393
6394     SvREFCNT_dec_NN(qr);
6395     return 1;
6396 }
6397
6398
6399 STATIC bool
6400 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6401                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6402                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6403                       STRLEN longest_length, bool eol, bool meol)
6404 {
6405     /* This is the common code for setting up the floating and fixed length
6406      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6407      * as to whether succeeded or not */
6408
6409     I32 t;
6410     SSize_t ml;
6411
6412     if (! (longest_length
6413            || (eol /* Can't have SEOL and MULTI */
6414                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6415           )
6416             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6417         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6418     {
6419         return FALSE;
6420     }
6421
6422     /* copy the information about the longest from the reg_scan_data
6423         over to the program. */
6424     if (SvUTF8(sv_longest)) {
6425         *rx_utf8 = sv_longest;
6426         *rx_substr = NULL;
6427     } else {
6428         *rx_substr = sv_longest;
6429         *rx_utf8 = NULL;
6430     }
6431     /* end_shift is how many chars that must be matched that
6432         follow this item. We calculate it ahead of time as once the
6433         lookbehind offset is added in we lose the ability to correctly
6434         calculate it.*/
6435     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6436     *rx_end_shift = ml - offset
6437         - longest_length + (SvTAIL(sv_longest) != 0)
6438         + lookbehind;
6439
6440     t = (eol/* Can't have SEOL and MULTI */
6441          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6442     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6443
6444     return TRUE;
6445 }
6446
6447 /*
6448  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6449  * regular expression into internal code.
6450  * The pattern may be passed either as:
6451  *    a list of SVs (patternp plus pat_count)
6452  *    a list of OPs (expr)
6453  * If both are passed, the SV list is used, but the OP list indicates
6454  * which SVs are actually pre-compiled code blocks
6455  *
6456  * The SVs in the list have magic and qr overloading applied to them (and
6457  * the list may be modified in-place with replacement SVs in the latter
6458  * case).
6459  *
6460  * If the pattern hasn't changed from old_re, then old_re will be
6461  * returned.
6462  *
6463  * eng is the current engine. If that engine has an op_comp method, then
6464  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6465  * do the initial concatenation of arguments and pass on to the external
6466  * engine.
6467  *
6468  * If is_bare_re is not null, set it to a boolean indicating whether the
6469  * arg list reduced (after overloading) to a single bare regex which has
6470  * been returned (i.e. /$qr/).
6471  *
6472  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6473  *
6474  * pm_flags contains the PMf_* flags, typically based on those from the
6475  * pm_flags field of the related PMOP. Currently we're only interested in
6476  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6477  *
6478  * We can't allocate space until we know how big the compiled form will be,
6479  * but we can't compile it (and thus know how big it is) until we've got a
6480  * place to put the code.  So we cheat:  we compile it twice, once with code
6481  * generation turned off and size counting turned on, and once "for real".
6482  * This also means that we don't allocate space until we are sure that the
6483  * thing really will compile successfully, and we never have to move the
6484  * code and thus invalidate pointers into it.  (Note that it has to be in
6485  * one piece because free() must be able to free it all.) [NB: not true in perl]
6486  *
6487  * Beware that the optimization-preparation code in here knows about some
6488  * of the structure of the compiled regexp.  [I'll say.]
6489  */
6490
6491 REGEXP *
6492 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6493                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6494                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6495 {
6496     REGEXP *rx;
6497     struct regexp *r;
6498     regexp_internal *ri;
6499     STRLEN plen;
6500     char *exp;
6501     regnode *scan;
6502     I32 flags;
6503     SSize_t minlen = 0;
6504     U32 rx_flags;
6505     SV *pat;
6506     SV *code_blocksv = NULL;
6507     SV** new_patternp = patternp;
6508
6509     /* these are all flags - maybe they should be turned
6510      * into a single int with different bit masks */
6511     I32 sawlookahead = 0;
6512     I32 sawplus = 0;
6513     I32 sawopen = 0;
6514     I32 sawminmod = 0;
6515
6516     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6517     bool recompile = 0;
6518     bool runtime_code = 0;
6519     scan_data_t data;
6520     RExC_state_t RExC_state;
6521     RExC_state_t * const pRExC_state = &RExC_state;
6522 #ifdef TRIE_STUDY_OPT
6523     int restudied = 0;
6524     RExC_state_t copyRExC_state;
6525 #endif
6526     GET_RE_DEBUG_FLAGS_DECL;
6527
6528     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6529
6530     DEBUG_r(if (!PL_colorset) reginitcolors());
6531
6532 #ifndef PERL_IN_XSUB_RE
6533     /* Initialize these here instead of as-needed, as is quick and avoids
6534      * having to test them each time otherwise */
6535     if (! PL_AboveLatin1) {
6536         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6537         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6538         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6539         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6540         PL_HasMultiCharFold =
6541                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6542
6543         /* This is calculated here, because the Perl program that generates the
6544          * static global ones doesn't currently have access to
6545          * NUM_ANYOF_CODE_POINTS */
6546         PL_InBitmap = _new_invlist(2);
6547         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6548                                                     NUM_ANYOF_CODE_POINTS - 1);
6549     }
6550 #endif
6551
6552     pRExC_state->code_blocks = NULL;
6553     pRExC_state->num_code_blocks = 0;
6554
6555     if (is_bare_re)
6556         *is_bare_re = FALSE;
6557
6558     if (expr && (expr->op_type == OP_LIST ||
6559                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6560         /* allocate code_blocks if needed */
6561         OP *o;
6562         int ncode = 0;
6563
6564         for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6565             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6566                 ncode++; /* count of DO blocks */
6567         if (ncode) {
6568             pRExC_state->num_code_blocks = ncode;
6569             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6570         }
6571     }
6572
6573     if (!pat_count) {
6574         /* compile-time pattern with just OP_CONSTs and DO blocks */
6575
6576         int n;
6577         OP *o;
6578
6579         /* find how many CONSTs there are */
6580         assert(expr);
6581         n = 0;
6582         if (expr->op_type == OP_CONST)
6583             n = 1;
6584         else
6585             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6586                 if (o->op_type == OP_CONST)
6587                     n++;
6588             }
6589
6590         /* fake up an SV array */
6591
6592         assert(!new_patternp);
6593         Newx(new_patternp, n, SV*);
6594         SAVEFREEPV(new_patternp);
6595         pat_count = n;
6596
6597         n = 0;
6598         if (expr->op_type == OP_CONST)
6599             new_patternp[n] = cSVOPx_sv(expr);
6600         else
6601             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6602                 if (o->op_type == OP_CONST)
6603                     new_patternp[n++] = cSVOPo_sv;
6604             }
6605
6606     }
6607
6608     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6609         "Assembling pattern from %d elements%s\n", pat_count,
6610             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6611
6612     /* set expr to the first arg op */
6613
6614     if (pRExC_state->num_code_blocks
6615          && expr->op_type != OP_CONST)
6616     {
6617             expr = cLISTOPx(expr)->op_first;
6618             assert(   expr->op_type == OP_PUSHMARK
6619                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6620                    || expr->op_type == OP_PADRANGE);
6621             expr = OP_SIBLING(expr);
6622     }
6623
6624     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6625                         expr, &recompile, NULL);
6626
6627     /* handle bare (possibly after overloading) regex: foo =~ $re */
6628     {
6629         SV *re = pat;
6630         if (SvROK(re))
6631             re = SvRV(re);
6632         if (SvTYPE(re) == SVt_REGEXP) {
6633             if (is_bare_re)
6634                 *is_bare_re = TRUE;
6635             SvREFCNT_inc(re);
6636             Safefree(pRExC_state->code_blocks);
6637             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6638                 "Precompiled pattern%s\n",
6639                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6640
6641             return (REGEXP*)re;
6642         }
6643     }
6644
6645     exp = SvPV_nomg(pat, plen);
6646
6647     if (!eng->op_comp) {
6648         if ((SvUTF8(pat) && IN_BYTES)
6649                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6650         {
6651             /* make a temporary copy; either to convert to bytes,
6652              * or to avoid repeating get-magic / overloaded stringify */
6653             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6654                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6655         }
6656         Safefree(pRExC_state->code_blocks);
6657         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6658     }
6659
6660     /* ignore the utf8ness if the pattern is 0 length */
6661     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6662     RExC_uni_semantics = 0;
6663     RExC_contains_locale = 0;
6664     RExC_contains_i = 0;
6665     pRExC_state->runtime_code_qr = NULL;
6666     RExC_frame_head= NULL;
6667     RExC_frame_last= NULL;
6668     RExC_frame_count= 0;
6669
6670     DEBUG_r({
6671         RExC_mysv1= sv_newmortal();
6672         RExC_mysv2= sv_newmortal();
6673     });
6674     DEBUG_COMPILE_r({
6675             SV *dsv= sv_newmortal();
6676             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6677             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6678                           PL_colors[4],PL_colors[5],s);
6679         });
6680
6681   redo_first_pass:
6682     /* we jump here if we upgrade the pattern to utf8 and have to
6683      * recompile */
6684
6685     if ((pm_flags & PMf_USE_RE_EVAL)
6686                 /* this second condition covers the non-regex literal case,
6687                  * i.e.  $foo =~ '(?{})'. */
6688                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6689     )
6690         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6691
6692     /* return old regex if pattern hasn't changed */
6693     /* XXX: note in the below we have to check the flags as well as the
6694      * pattern.
6695      *
6696      * Things get a touch tricky as we have to compare the utf8 flag
6697      * independently from the compile flags.  */
6698
6699     if (   old_re
6700         && !recompile
6701         && !!RX_UTF8(old_re) == !!RExC_utf8
6702         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6703         && RX_PRECOMP(old_re)
6704         && RX_PRELEN(old_re) == plen
6705         && memEQ(RX_PRECOMP(old_re), exp, plen)
6706         && !runtime_code /* with runtime code, always recompile */ )
6707     {
6708         Safefree(pRExC_state->code_blocks);
6709         return old_re;
6710     }
6711
6712     rx_flags = orig_rx_flags;
6713
6714     if (rx_flags & PMf_FOLD) {
6715         RExC_contains_i = 1;
6716     }
6717     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6718
6719         /* Set to use unicode semantics if the pattern is in utf8 and has the
6720          * 'depends' charset specified, as it means unicode when utf8  */
6721         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6722     }
6723
6724     RExC_precomp = exp;
6725     RExC_flags = rx_flags;
6726     RExC_pm_flags = pm_flags;
6727
6728     if (runtime_code) {
6729         if (TAINTING_get && TAINT_get)
6730             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6731
6732         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6733             /* whoops, we have a non-utf8 pattern, whilst run-time code
6734              * got compiled as utf8. Try again with a utf8 pattern */
6735             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6736                                     pRExC_state->num_code_blocks);
6737             goto redo_first_pass;
6738         }
6739     }
6740     assert(!pRExC_state->runtime_code_qr);
6741
6742     RExC_sawback = 0;
6743
6744     RExC_seen = 0;
6745     RExC_maxlen = 0;
6746     RExC_in_lookbehind = 0;
6747     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6748     RExC_extralen = 0;
6749     RExC_override_recoding = 0;
6750     RExC_in_multi_char_class = 0;
6751
6752     /* First pass: determine size, legality. */
6753     RExC_parse = exp;
6754     RExC_start = exp;
6755     RExC_end = exp + plen;
6756     RExC_naughty = 0;
6757     RExC_npar = 1;
6758     RExC_nestroot = 0;
6759     RExC_size = 0L;
6760     RExC_emit = (regnode *) &RExC_emit_dummy;
6761     RExC_whilem_seen = 0;
6762     RExC_open_parens = NULL;
6763     RExC_close_parens = NULL;
6764     RExC_opend = NULL;
6765     RExC_paren_names = NULL;
6766 #ifdef DEBUGGING
6767     RExC_paren_name_list = NULL;
6768 #endif
6769     RExC_recurse = NULL;
6770     RExC_study_chunk_recursed = NULL;
6771     RExC_study_chunk_recursed_bytes= 0;
6772     RExC_recurse_count = 0;
6773     pRExC_state->code_index = 0;
6774
6775 #if 0 /* REGC() is (currently) a NOP at the first pass.
6776        * Clever compilers notice this and complain. --jhi */
6777     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6778 #endif
6779     DEBUG_PARSE_r(
6780         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6781         RExC_lastnum=0;
6782         RExC_lastparse=NULL;
6783     );
6784     /* reg may croak on us, not giving us a chance to free
6785        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6786        need it to survive as long as the regexp (qr/(?{})/).
6787        We must check that code_blocksv is not already set, because we may
6788        have jumped back to restart the sizing pass. */
6789     if (pRExC_state->code_blocks && !code_blocksv) {
6790         code_blocksv = newSV_type(SVt_PV);
6791         SAVEFREESV(code_blocksv);
6792         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6793         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6794     }
6795     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6796         /* It's possible to write a regexp in ascii that represents Unicode
6797         codepoints outside of the byte range, such as via \x{100}. If we
6798         detect such a sequence we have to convert the entire pattern to utf8
6799         and then recompile, as our sizing calculation will have been based
6800         on 1 byte == 1 character, but we will need to use utf8 to encode
6801         at least some part of the pattern, and therefore must convert the whole
6802         thing.
6803         -- dmq */
6804         if (flags & RESTART_UTF8) {
6805             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6806                                     pRExC_state->num_code_blocks);
6807             goto redo_first_pass;
6808         }
6809         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6810     }
6811     if (code_blocksv)
6812         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6813
6814     DEBUG_PARSE_r({
6815         PerlIO_printf(Perl_debug_log,
6816             "Required size %"IVdf" nodes\n"
6817             "Starting second pass (creation)\n",
6818             (IV)RExC_size);
6819         RExC_lastnum=0;
6820         RExC_lastparse=NULL;
6821     });
6822
6823     /* The first pass could have found things that force Unicode semantics */
6824     if ((RExC_utf8 || RExC_uni_semantics)
6825          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6826     {
6827         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6828     }
6829
6830     /* Small enough for pointer-storage convention?
6831        If extralen==0, this means that we will not need long jumps. */
6832     if (RExC_size >= 0x10000L && RExC_extralen)
6833         RExC_size += RExC_extralen;
6834     else
6835         RExC_extralen = 0;
6836     if (RExC_whilem_seen > 15)
6837         RExC_whilem_seen = 15;
6838
6839     /* Allocate space and zero-initialize. Note, the two step process
6840        of zeroing when in debug mode, thus anything assigned has to
6841        happen after that */
6842     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6843     r = ReANY(rx);
6844     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6845          char, regexp_internal);
6846     if ( r == NULL || ri == NULL )
6847         FAIL("Regexp out of space");
6848 #ifdef DEBUGGING
6849     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6850     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6851          char);
6852 #else
6853     /* bulk initialize base fields with 0. */
6854     Zero(ri, sizeof(regexp_internal), char);
6855 #endif
6856
6857     /* non-zero initialization begins here */
6858     RXi_SET( r, ri );
6859     r->engine= eng;
6860     r->extflags = rx_flags;
6861     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6862
6863     if (pm_flags & PMf_IS_QR) {
6864         ri->code_blocks = pRExC_state->code_blocks;
6865         ri->num_code_blocks = pRExC_state->num_code_blocks;
6866     }
6867     else
6868     {
6869         int n;
6870         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6871             if (pRExC_state->code_blocks[n].src_regex)
6872                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6873         SAVEFREEPV(pRExC_state->code_blocks);
6874     }
6875
6876     {
6877         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6878         bool has_charset = (get_regex_charset(r->extflags)
6879                                                     != REGEX_DEPENDS_CHARSET);
6880
6881         /* The caret is output if there are any defaults: if not all the STD
6882          * flags are set, or if no character set specifier is needed */
6883         bool has_default =
6884                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6885                     || ! has_charset);
6886         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6887                                                    == REG_RUN_ON_COMMENT_SEEN);
6888         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6889                             >> RXf_PMf_STD_PMMOD_SHIFT);
6890         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6891         char *p;
6892         /* Allocate for the worst case, which is all the std flags are turned
6893          * on.  If more precision is desired, we could do a population count of
6894          * the flags set.  This could be done with a small lookup table, or by
6895          * shifting, masking and adding, or even, when available, assembly
6896          * language for a machine-language population count.
6897          * We never output a minus, as all those are defaults, so are
6898          * covered by the caret */
6899         const STRLEN wraplen = plen + has_p + has_runon
6900             + has_default       /* If needs a caret */
6901
6902                 /* If needs a character set specifier */
6903             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6904             + (sizeof(STD_PAT_MODS) - 1)
6905             + (sizeof("(?:)") - 1);
6906
6907         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6908         r->xpv_len_u.xpvlenu_pv = p;
6909         if (RExC_utf8)
6910             SvFLAGS(rx) |= SVf_UTF8;
6911         *p++='('; *p++='?';
6912
6913         /* If a default, cover it using the caret */
6914         if (has_default) {
6915             *p++= DEFAULT_PAT_MOD;
6916         }
6917         if (has_charset) {
6918             STRLEN len;
6919             const char* const name = get_regex_charset_name(r->extflags, &len);
6920             Copy(name, p, len, char);
6921             p += len;
6922         }
6923         if (has_p)
6924             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6925         {
6926             char ch;
6927             while((ch = *fptr++)) {
6928                 if(reganch & 1)
6929                     *p++ = ch;
6930                 reganch >>= 1;
6931             }
6932         }
6933
6934         *p++ = ':';
6935         Copy(RExC_precomp, p, plen, char);
6936         assert ((RX_WRAPPED(rx) - p) < 16);
6937         r->pre_prefix = p - RX_WRAPPED(rx);
6938         p += plen;
6939         if (has_runon)
6940             *p++ = '\n';
6941         *p++ = ')';
6942         *p = 0;
6943         SvCUR_set(rx, p - RX_WRAPPED(rx));
6944     }
6945
6946     r->intflags = 0;
6947     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6948
6949     /* setup various meta data about recursion, this all requires
6950      * RExC_npar to be correctly set, and a bit later on we clear it */
6951     if (RExC_seen & REG_RECURSE_SEEN) {
6952         Newxz(RExC_open_parens, RExC_npar,regnode *);
6953         SAVEFREEPV(RExC_open_parens);
6954         Newxz(RExC_close_parens,RExC_npar,regnode *);
6955         SAVEFREEPV(RExC_close_parens);
6956     }
6957     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6958         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6959          * So its 1 if there are no parens. */
6960         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6961                                          ((RExC_npar & 0x07) != 0);
6962         Newx(RExC_study_chunk_recursed,
6963              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6964         SAVEFREEPV(RExC_study_chunk_recursed);
6965     }
6966
6967     /* Useful during FAIL. */
6968 #ifdef RE_TRACK_PATTERN_OFFSETS
6969     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6970     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6971                           "%s %"UVuf" bytes for offset annotations.\n",
6972                           ri->u.offsets ? "Got" : "Couldn't get",
6973                           (UV)((2*RExC_size+1) * sizeof(U32))));
6974 #endif
6975     SetProgLen(ri,RExC_size);
6976     RExC_rx_sv = rx;
6977     RExC_rx = r;
6978     RExC_rxi = ri;
6979
6980     /* Second pass: emit code. */
6981     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6982     RExC_pm_flags = pm_flags;
6983     RExC_parse = exp;
6984     RExC_end = exp + plen;
6985     RExC_naughty = 0;
6986     RExC_npar = 1;
6987     RExC_emit_start = ri->program;
6988     RExC_emit = ri->program;
6989     RExC_emit_bound = ri->program + RExC_size + 1;
6990     pRExC_state->code_index = 0;
6991
6992     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6993     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6994         ReREFCNT_dec(rx);
6995         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6996     }
6997     /* XXXX To minimize changes to RE engine we always allocate
6998        3-units-long substrs field. */
6999     Newx(r->substrs, 1, struct reg_substr_data);
7000     if (RExC_recurse_count) {
7001         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7002         SAVEFREEPV(RExC_recurse);
7003     }
7004
7005 reStudy:
7006     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7007     DEBUG_r(
7008         RExC_study_chunk_recursed_count= 0;
7009     );
7010     Zero(r->substrs, 1, struct reg_substr_data);
7011     if (RExC_study_chunk_recursed) {
7012         Zero(RExC_study_chunk_recursed,
7013              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7014     }
7015
7016
7017 #ifdef TRIE_STUDY_OPT
7018     if (!restudied) {
7019         StructCopy(&zero_scan_data, &data, scan_data_t);
7020         copyRExC_state = RExC_state;
7021     } else {
7022         U32 seen=RExC_seen;
7023         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7024
7025         RExC_state = copyRExC_state;
7026         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7027             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7028         else
7029             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7030         StructCopy(&zero_scan_data, &data, scan_data_t);
7031     }
7032 #else
7033     StructCopy(&zero_scan_data, &data, scan_data_t);
7034 #endif
7035
7036     /* Dig out information for optimizations. */
7037     r->extflags = RExC_flags; /* was pm_op */
7038     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7039
7040     if (UTF)
7041         SvUTF8_on(rx);  /* Unicode in it? */
7042     ri->regstclass = NULL;
7043     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
7044         r->intflags |= PREGf_NAUGHTY;
7045     scan = ri->program + 1;             /* First BRANCH. */
7046
7047     /* testing for BRANCH here tells us whether there is "must appear"
7048        data in the pattern. If there is then we can use it for optimisations */
7049     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7050                                                   */
7051         SSize_t fake;
7052         STRLEN longest_float_length, longest_fixed_length;
7053         regnode_ssc ch_class; /* pointed to by data */
7054         int stclass_flag;
7055         SSize_t last_close = 0; /* pointed to by data */
7056         regnode *first= scan;
7057         regnode *first_next= regnext(first);
7058         /*
7059          * Skip introductions and multiplicators >= 1
7060          * so that we can extract the 'meat' of the pattern that must
7061          * match in the large if() sequence following.
7062          * NOTE that EXACT is NOT covered here, as it is normally
7063          * picked up by the optimiser separately.
7064          *
7065          * This is unfortunate as the optimiser isnt handling lookahead
7066          * properly currently.
7067          *
7068          */
7069         while ((OP(first) == OPEN && (sawopen = 1)) ||
7070                /* An OR of *one* alternative - should not happen now. */
7071             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7072             /* for now we can't handle lookbehind IFMATCH*/
7073             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7074             (OP(first) == PLUS) ||
7075             (OP(first) == MINMOD) ||
7076                /* An {n,m} with n>0 */
7077             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7078             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7079         {
7080                 /*
7081                  * the only op that could be a regnode is PLUS, all the rest
7082                  * will be regnode_1 or regnode_2.
7083                  *
7084                  * (yves doesn't think this is true)
7085                  */
7086                 if (OP(first) == PLUS)
7087                     sawplus = 1;
7088                 else {
7089                     if (OP(first) == MINMOD)
7090                         sawminmod = 1;
7091                     first += regarglen[OP(first)];
7092                 }
7093                 first = NEXTOPER(first);
7094                 first_next= regnext(first);
7095         }
7096
7097         /* Starting-point info. */
7098       again:
7099         DEBUG_PEEP("first:",first,0);
7100         /* Ignore EXACT as we deal with it later. */
7101         if (PL_regkind[OP(first)] == EXACT) {
7102             if (OP(first) == EXACT)
7103                 NOOP;   /* Empty, get anchored substr later. */
7104             else
7105                 ri->regstclass = first;
7106         }
7107 #ifdef TRIE_STCLASS
7108         else if (PL_regkind[OP(first)] == TRIE &&
7109                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7110         {
7111             /* this can happen only on restudy */
7112             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7113         }
7114 #endif
7115         else if (REGNODE_SIMPLE(OP(first)))
7116             ri->regstclass = first;
7117         else if (PL_regkind[OP(first)] == BOUND ||
7118                  PL_regkind[OP(first)] == NBOUND)
7119             ri->regstclass = first;
7120         else if (PL_regkind[OP(first)] == BOL) {
7121             r->intflags |= (OP(first) == MBOL
7122                            ? PREGf_ANCH_MBOL
7123                            : PREGf_ANCH_SBOL);
7124             first = NEXTOPER(first);
7125             goto again;
7126         }
7127         else if (OP(first) == GPOS) {
7128             r->intflags |= PREGf_ANCH_GPOS;
7129             first = NEXTOPER(first);
7130             goto again;
7131         }
7132         else if ((!sawopen || !RExC_sawback) &&
7133             !sawlookahead &&
7134             (OP(first) == STAR &&
7135             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7136             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7137         {
7138             /* turn .* into ^.* with an implied $*=1 */
7139             const int type =
7140                 (OP(NEXTOPER(first)) == REG_ANY)
7141                     ? PREGf_ANCH_MBOL
7142                     : PREGf_ANCH_SBOL;
7143             r->intflags |= (type | PREGf_IMPLICIT);
7144             first = NEXTOPER(first);
7145             goto again;
7146         }
7147         if (sawplus && !sawminmod && !sawlookahead
7148             && (!sawopen || !RExC_sawback)
7149             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7150             /* x+ must match at the 1st pos of run of x's */
7151             r->intflags |= PREGf_SKIP;
7152
7153         /* Scan is after the zeroth branch, first is atomic matcher. */
7154 #ifdef TRIE_STUDY_OPT
7155         DEBUG_PARSE_r(
7156             if (!restudied)
7157                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7158                               (IV)(first - scan + 1))
7159         );
7160 #else
7161         DEBUG_PARSE_r(
7162             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7163                 (IV)(first - scan + 1))
7164         );
7165 #endif
7166
7167
7168         /*
7169         * If there's something expensive in the r.e., find the
7170         * longest literal string that must appear and make it the
7171         * regmust.  Resolve ties in favor of later strings, since
7172         * the regstart check works with the beginning of the r.e.
7173         * and avoiding duplication strengthens checking.  Not a
7174         * strong reason, but sufficient in the absence of others.
7175         * [Now we resolve ties in favor of the earlier string if
7176         * it happens that c_offset_min has been invalidated, since the
7177         * earlier string may buy us something the later one won't.]
7178         */
7179
7180         data.longest_fixed = newSVpvs("");
7181         data.longest_float = newSVpvs("");
7182         data.last_found = newSVpvs("");
7183         data.longest = &(data.longest_fixed);
7184         ENTER_with_name("study_chunk");
7185         SAVEFREESV(data.longest_fixed);
7186         SAVEFREESV(data.longest_float);
7187         SAVEFREESV(data.last_found);
7188         first = scan;
7189         if (!ri->regstclass) {
7190             ssc_init(pRExC_state, &ch_class);
7191             data.start_class = &ch_class;
7192             stclass_flag = SCF_DO_STCLASS_AND;
7193         } else                          /* XXXX Check for BOUND? */
7194             stclass_flag = 0;
7195         data.last_closep = &last_close;
7196
7197         DEBUG_RExC_seen();
7198         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7199                              scan + RExC_size, /* Up to end */
7200             &data, -1, 0, NULL,
7201             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7202                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7203             0);
7204
7205
7206         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7207
7208
7209         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7210              && data.last_start_min == 0 && data.last_end > 0
7211              && !RExC_seen_zerolen
7212              && !(RExC_seen & REG_VERBARG_SEEN)
7213              && !(RExC_seen & REG_GPOS_SEEN)
7214         ){
7215             r->extflags |= RXf_CHECK_ALL;
7216         }
7217         scan_commit(pRExC_state, &data,&minlen,0);
7218
7219         longest_float_length = CHR_SVLEN(data.longest_float);
7220
7221         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7222                    && data.offset_fixed == data.offset_float_min
7223                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7224             && S_setup_longest (aTHX_ pRExC_state,
7225                                     data.longest_float,
7226                                     &(r->float_utf8),
7227                                     &(r->float_substr),
7228                                     &(r->float_end_shift),
7229                                     data.lookbehind_float,
7230                                     data.offset_float_min,
7231                                     data.minlen_float,
7232                                     longest_float_length,
7233                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7234                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7235         {
7236             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7237             r->float_max_offset = data.offset_float_max;
7238             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7239                 r->float_max_offset -= data.lookbehind_float;
7240             SvREFCNT_inc_simple_void_NN(data.longest_float);
7241         }
7242         else {
7243             r->float_substr = r->float_utf8 = NULL;
7244             longest_float_length = 0;
7245         }
7246
7247         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7248
7249         if (S_setup_longest (aTHX_ pRExC_state,
7250                                 data.longest_fixed,
7251                                 &(r->anchored_utf8),
7252                                 &(r->anchored_substr),
7253                                 &(r->anchored_end_shift),
7254                                 data.lookbehind_fixed,
7255                                 data.offset_fixed,
7256                                 data.minlen_fixed,
7257                                 longest_fixed_length,
7258                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7259                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7260         {
7261             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7262             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7263         }
7264         else {
7265             r->anchored_substr = r->anchored_utf8 = NULL;
7266             longest_fixed_length = 0;
7267         }
7268         LEAVE_with_name("study_chunk");
7269
7270         if (ri->regstclass
7271             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7272             ri->regstclass = NULL;
7273
7274         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7275             && stclass_flag
7276             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7277             && is_ssc_worth_it(pRExC_state, data.start_class))
7278         {
7279             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7280
7281             ssc_finalize(pRExC_state, data.start_class);
7282
7283             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7284             StructCopy(data.start_class,
7285                        (regnode_ssc*)RExC_rxi->data->data[n],
7286                        regnode_ssc);
7287             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7288             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7289             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7290                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7291                       PerlIO_printf(Perl_debug_log,
7292                                     "synthetic stclass \"%s\".\n",
7293                                     SvPVX_const(sv));});
7294             data.start_class = NULL;
7295         }
7296
7297         /* A temporary algorithm prefers floated substr to fixed one to dig
7298          * more info. */
7299         if (longest_fixed_length > longest_float_length) {
7300             r->substrs->check_ix = 0;
7301             r->check_end_shift = r->anchored_end_shift;
7302             r->check_substr = r->anchored_substr;
7303             r->check_utf8 = r->anchored_utf8;
7304             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7305             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7306                 r->intflags |= PREGf_NOSCAN;
7307         }
7308         else {
7309             r->substrs->check_ix = 1;
7310             r->check_end_shift = r->float_end_shift;
7311             r->check_substr = r->float_substr;
7312             r->check_utf8 = r->float_utf8;
7313             r->check_offset_min = r->float_min_offset;
7314             r->check_offset_max = r->float_max_offset;
7315         }
7316         if ((r->check_substr || r->check_utf8) ) {
7317             r->extflags |= RXf_USE_INTUIT;
7318             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7319                 r->extflags |= RXf_INTUIT_TAIL;
7320         }
7321         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7322
7323         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7324         if ( (STRLEN)minlen < longest_float_length )
7325             minlen= longest_float_length;
7326         if ( (STRLEN)minlen < longest_fixed_length )
7327             minlen= longest_fixed_length;
7328         */
7329     }
7330     else {
7331         /* Several toplevels. Best we can is to set minlen. */
7332         SSize_t fake;
7333         regnode_ssc ch_class;
7334         SSize_t last_close = 0;
7335
7336         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7337
7338         scan = ri->program + 1;
7339         ssc_init(pRExC_state, &ch_class);
7340         data.start_class = &ch_class;
7341         data.last_closep = &last_close;
7342
7343         DEBUG_RExC_seen();
7344         minlen = study_chunk(pRExC_state,
7345             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7346             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7347                                                       ? SCF_TRIE_DOING_RESTUDY
7348                                                       : 0),
7349             0);
7350
7351         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7352
7353         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7354                 = r->float_substr = r->float_utf8 = NULL;
7355
7356         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7357             && is_ssc_worth_it(pRExC_state, data.start_class))
7358         {
7359             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7360
7361             ssc_finalize(pRExC_state, data.start_class);
7362
7363             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7364             StructCopy(data.start_class,
7365                        (regnode_ssc*)RExC_rxi->data->data[n],
7366                        regnode_ssc);
7367             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7368             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7369             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7370                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7371                       PerlIO_printf(Perl_debug_log,
7372                                     "synthetic stclass \"%s\".\n",
7373                                     SvPVX_const(sv));});
7374             data.start_class = NULL;
7375         }
7376     }
7377
7378     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7379         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7380         r->maxlen = REG_INFTY;
7381     }
7382     else {
7383         r->maxlen = RExC_maxlen;
7384     }
7385
7386     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7387        the "real" pattern. */
7388     DEBUG_OPTIMISE_r({
7389         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7390                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7391     });
7392     r->minlenret = minlen;
7393     if (r->minlen < minlen)
7394         r->minlen = minlen;
7395
7396     if (RExC_seen & REG_GPOS_SEEN)
7397         r->intflags |= PREGf_GPOS_SEEN;
7398     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7399         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7400                                                 lookbehind */
7401     if (pRExC_state->num_code_blocks)
7402         r->extflags |= RXf_EVAL_SEEN;
7403     if (RExC_seen & REG_CANY_SEEN)
7404         r->intflags |= PREGf_CANY_SEEN;
7405     if (RExC_seen & REG_VERBARG_SEEN)
7406     {
7407         r->intflags |= PREGf_VERBARG_SEEN;
7408         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7409     }
7410     if (RExC_seen & REG_CUTGROUP_SEEN)
7411         r->intflags |= PREGf_CUTGROUP_SEEN;
7412     if (pm_flags & PMf_USE_RE_EVAL)
7413         r->intflags |= PREGf_USE_RE_EVAL;
7414     if (RExC_paren_names)
7415         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7416     else
7417         RXp_PAREN_NAMES(r) = NULL;
7418
7419     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7420      * so it can be used in pp.c */
7421     if (r->intflags & PREGf_ANCH)
7422         r->extflags |= RXf_IS_ANCHORED;
7423
7424
7425     {
7426         /* this is used to identify "special" patterns that might result
7427          * in Perl NOT calling the regex engine and instead doing the match "itself",
7428          * particularly special cases in split//. By having the regex compiler
7429          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7430          * we avoid weird issues with equivalent patterns resulting in different behavior,
7431          * AND we allow non Perl engines to get the same optimizations by the setting the
7432          * flags appropriately - Yves */
7433         regnode *first = ri->program + 1;
7434         U8 fop = OP(first);
7435         regnode *next = NEXTOPER(first);
7436         U8 nop = OP(next);
7437
7438         if (PL_regkind[fop] == NOTHING && nop == END)
7439             r->extflags |= RXf_NULL;
7440         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7441             /* when fop is SBOL first->flags will be true only when it was
7442              * produced by parsing /\A/, and not when parsing /^/. This is
7443              * very important for the split code as there we want to
7444              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7445              * See rt #122761 for more details. -- Yves */
7446             r->extflags |= RXf_START_ONLY;
7447         else if (fop == PLUS
7448                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7449                  && OP(regnext(first)) == END)
7450             r->extflags |= RXf_WHITE;
7451         else if ( r->extflags & RXf_SPLIT
7452                   && fop == EXACT
7453                   && STR_LEN(first) == 1
7454                   && *(STRING(first)) == ' '
7455                   && OP(regnext(first)) == END )
7456             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7457
7458     }
7459
7460     if (RExC_contains_locale) {
7461         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7462     }
7463
7464 #ifdef DEBUGGING
7465     if (RExC_paren_names) {
7466         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7467         ri->data->data[ri->name_list_idx]
7468                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7469     } else
7470 #endif
7471         ri->name_list_idx = 0;
7472
7473     if (RExC_recurse_count) {
7474         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7475             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7476             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7477         }
7478     }
7479     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7480     /* assume we don't need to swap parens around before we match */
7481     DEBUG_TEST_r({
7482         PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7483             (unsigned long)RExC_study_chunk_recursed_count);
7484     });
7485     DEBUG_DUMP_r({
7486         DEBUG_RExC_seen();
7487         PerlIO_printf(Perl_debug_log,"Final program:\n");
7488         regdump(r);
7489     });
7490 #ifdef RE_TRACK_PATTERN_OFFSETS
7491     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7492         const STRLEN len = ri->u.offsets[0];
7493         STRLEN i;
7494         GET_RE_DEBUG_FLAGS_DECL;
7495         PerlIO_printf(Perl_debug_log,
7496                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7497         for (i = 1; i <= len; i++) {
7498             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7499                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7500                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7501             }
7502         PerlIO_printf(Perl_debug_log, "\n");
7503     });
7504 #endif
7505
7506 #ifdef USE_ITHREADS
7507     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7508      * by setting the regexp SV to readonly-only instead. If the
7509      * pattern's been recompiled, the USEDness should remain. */
7510     if (old_re && SvREADONLY(old_re))
7511         SvREADONLY_on(rx);
7512 #endif
7513     return rx;
7514 }
7515
7516
7517 SV*
7518 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7519                     const U32 flags)
7520 {
7521     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7522
7523     PERL_UNUSED_ARG(value);
7524
7525     if (flags & RXapif_FETCH) {
7526         return reg_named_buff_fetch(rx, key, flags);
7527     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7528         Perl_croak_no_modify();
7529         return NULL;
7530     } else if (flags & RXapif_EXISTS) {
7531         return reg_named_buff_exists(rx, key, flags)
7532             ? &PL_sv_yes
7533             : &PL_sv_no;
7534     } else if (flags & RXapif_REGNAMES) {
7535         return reg_named_buff_all(rx, flags);
7536     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7537         return reg_named_buff_scalar(rx, flags);
7538     } else {
7539         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7540         return NULL;
7541     }
7542 }
7543
7544 SV*
7545 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7546                          const U32 flags)
7547 {
7548     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7549     PERL_UNUSED_ARG(lastkey);
7550
7551     if (flags & RXapif_FIRSTKEY)
7552         return reg_named_buff_firstkey(rx, flags);
7553     else if (flags & RXapif_NEXTKEY)
7554         return reg_named_buff_nextkey(rx, flags);
7555     else {
7556         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7557                                             (int)flags);
7558         return NULL;
7559     }
7560 }
7561
7562 SV*
7563 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7564                           const U32 flags)
7565 {
7566     AV *retarray = NULL;
7567     SV *ret;
7568     struct regexp *const rx = ReANY(r);
7569
7570     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7571
7572     if (flags & RXapif_ALL)
7573         retarray=newAV();
7574
7575     if (rx && RXp_PAREN_NAMES(rx)) {
7576         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7577         if (he_str) {
7578             IV i;
7579             SV* sv_dat=HeVAL(he_str);
7580             I32 *nums=(I32*)SvPVX(sv_dat);
7581             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7582                 if ((I32)(rx->nparens) >= nums[i]
7583                     && rx->offs[nums[i]].start != -1
7584                     && rx->offs[nums[i]].end != -1)
7585                 {
7586                     ret = newSVpvs("");
7587                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7588                     if (!retarray)
7589                         return ret;
7590                 } else {
7591                     if (retarray)
7592                         ret = newSVsv(&PL_sv_undef);
7593                 }
7594                 if (retarray)
7595                     av_push(retarray, ret);
7596             }
7597             if (retarray)
7598                 return newRV_noinc(MUTABLE_SV(retarray));
7599         }
7600     }
7601     return NULL;
7602 }
7603
7604 bool
7605 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7606                            const U32 flags)
7607 {
7608     struct regexp *const rx = ReANY(r);
7609
7610     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7611
7612     if (rx && RXp_PAREN_NAMES(rx)) {
7613         if (flags & RXapif_ALL) {
7614             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7615         } else {
7616             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7617             if (sv) {
7618                 SvREFCNT_dec_NN(sv);
7619                 return TRUE;
7620             } else {
7621                 return FALSE;
7622             }
7623         }
7624     } else {
7625         return FALSE;
7626     }
7627 }
7628
7629 SV*
7630 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7631 {
7632     struct regexp *const rx = ReANY(r);
7633
7634     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7635
7636     if ( rx && RXp_PAREN_NAMES(rx) ) {
7637         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7638
7639         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7640     } else {
7641         return FALSE;
7642     }
7643 }
7644
7645 SV*
7646 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7647 {
7648     struct regexp *const rx = ReANY(r);
7649     GET_RE_DEBUG_FLAGS_DECL;
7650
7651     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7652
7653     if (rx && RXp_PAREN_NAMES(rx)) {
7654         HV *hv = RXp_PAREN_NAMES(rx);
7655         HE *temphe;
7656         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7657             IV i;
7658             IV parno = 0;
7659             SV* sv_dat = HeVAL(temphe);
7660             I32 *nums = (I32*)SvPVX(sv_dat);
7661             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7662                 if ((I32)(rx->lastparen) >= nums[i] &&
7663                     rx->offs[nums[i]].start != -1 &&
7664                     rx->offs[nums[i]].end != -1)
7665                 {
7666                     parno = nums[i];
7667                     break;
7668                 }
7669             }
7670             if (parno || flags & RXapif_ALL) {
7671                 return newSVhek(HeKEY_hek(temphe));
7672             }
7673         }
7674     }
7675     return NULL;
7676 }
7677
7678 SV*
7679 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7680 {
7681     SV *ret;
7682     AV *av;
7683     SSize_t length;
7684     struct regexp *const rx = ReANY(r);
7685
7686     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7687
7688     if (rx && RXp_PAREN_NAMES(rx)) {
7689         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7690             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7691         } else if (flags & RXapif_ONE) {
7692             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7693             av = MUTABLE_AV(SvRV(ret));
7694             length = av_tindex(av);
7695             SvREFCNT_dec_NN(ret);
7696             return newSViv(length + 1);
7697         } else {
7698             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7699                                                 (int)flags);
7700             return NULL;
7701         }
7702     }
7703     return &PL_sv_undef;
7704 }
7705
7706 SV*
7707 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7708 {
7709     struct regexp *const rx = ReANY(r);
7710     AV *av = newAV();
7711
7712     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7713
7714     if (rx && RXp_PAREN_NAMES(rx)) {
7715         HV *hv= RXp_PAREN_NAMES(rx);
7716         HE *temphe;
7717         (void)hv_iterinit(hv);
7718         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7719             IV i;
7720             IV parno = 0;
7721             SV* sv_dat = HeVAL(temphe);
7722             I32 *nums = (I32*)SvPVX(sv_dat);
7723             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7724                 if ((I32)(rx->lastparen) >= nums[i] &&
7725                     rx->offs[nums[i]].start != -1 &&
7726                     rx->offs[nums[i]].end != -1)
7727                 {
7728                     parno = nums[i];
7729                     break;
7730                 }
7731             }
7732             if (parno || flags & RXapif_ALL) {
7733                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7734             }
7735         }
7736     }
7737
7738     return newRV_noinc(MUTABLE_SV(av));
7739 }
7740
7741 void
7742 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7743                              SV * const sv)
7744 {
7745     struct regexp *const rx = ReANY(r);
7746     char *s = NULL;
7747     SSize_t i = 0;
7748     SSize_t s1, t1;
7749     I32 n = paren;
7750
7751     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7752
7753     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7754            || n == RX_BUFF_IDX_CARET_FULLMATCH
7755            || n == RX_BUFF_IDX_CARET_POSTMATCH
7756        )
7757     {
7758         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7759         if (!keepcopy) {
7760             /* on something like
7761              *    $r = qr/.../;
7762              *    /$qr/p;
7763              * the KEEPCOPY is set on the PMOP rather than the regex */
7764             if (PL_curpm && r == PM_GETRE(PL_curpm))
7765                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7766         }
7767         if (!keepcopy)
7768             goto ret_undef;
7769     }
7770
7771     if (!rx->subbeg)
7772         goto ret_undef;
7773
7774     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7775         /* no need to distinguish between them any more */
7776         n = RX_BUFF_IDX_FULLMATCH;
7777
7778     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7779         && rx->offs[0].start != -1)
7780     {
7781         /* $`, ${^PREMATCH} */
7782         i = rx->offs[0].start;
7783         s = rx->subbeg;
7784     }
7785     else
7786     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7787         && rx->offs[0].end != -1)
7788     {
7789         /* $', ${^POSTMATCH} */
7790         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7791         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7792     }
7793     else
7794     if ( 0 <= n && n <= (I32)rx->nparens &&
7795         (s1 = rx->offs[n].start) != -1 &&
7796         (t1 = rx->offs[n].end) != -1)
7797     {
7798         /* $&, ${^MATCH},  $1 ... */
7799         i = t1 - s1;
7800         s = rx->subbeg + s1 - rx->suboffset;
7801     } else {
7802         goto ret_undef;
7803     }
7804
7805     assert(s >= rx->subbeg);
7806     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7807     if (i >= 0) {
7808 #ifdef NO_TAINT_SUPPORT
7809         sv_setpvn(sv, s, i);
7810 #else
7811         const int oldtainted = TAINT_get;
7812         TAINT_NOT;
7813         sv_setpvn(sv, s, i);
7814         TAINT_set(oldtainted);
7815 #endif
7816         if ( (rx->intflags & PREGf_CANY_SEEN)
7817             ? (RXp_MATCH_UTF8(rx)
7818                         && (!i || is_utf8_string((U8*)s, i)))
7819             : (RXp_MATCH_UTF8(rx)) )
7820         {
7821             SvUTF8_on(sv);
7822         }
7823         else
7824             SvUTF8_off(sv);
7825         if (TAINTING_get) {
7826             if (RXp_MATCH_TAINTED(rx)) {
7827                 if (SvTYPE(sv) >= SVt_PVMG) {
7828                     MAGIC* const mg = SvMAGIC(sv);
7829                     MAGIC* mgt;
7830                     TAINT;
7831                     SvMAGIC_set(sv, mg->mg_moremagic);
7832                     SvTAINT(sv);
7833                     if ((mgt = SvMAGIC(sv))) {
7834                         mg->mg_moremagic = mgt;
7835                         SvMAGIC_set(sv, mg);
7836                     }
7837                 } else {
7838                     TAINT;
7839                     SvTAINT(sv);
7840                 }
7841             } else
7842                 SvTAINTED_off(sv);
7843         }
7844     } else {
7845       ret_undef:
7846         sv_setsv(sv,&PL_sv_undef);
7847         return;
7848     }
7849 }
7850
7851 void
7852 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7853                                                          SV const * const value)
7854 {
7855     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7856
7857     PERL_UNUSED_ARG(rx);
7858     PERL_UNUSED_ARG(paren);
7859     PERL_UNUSED_ARG(value);
7860
7861     if (!PL_localizing)
7862         Perl_croak_no_modify();
7863 }
7864
7865 I32
7866 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7867                               const I32 paren)
7868 {
7869     struct regexp *const rx = ReANY(r);
7870     I32 i;
7871     I32 s1, t1;
7872
7873     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7874
7875     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7876         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7877         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7878     )
7879     {
7880         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7881         if (!keepcopy) {
7882             /* on something like
7883              *    $r = qr/.../;
7884              *    /$qr/p;
7885              * the KEEPCOPY is set on the PMOP rather than the regex */
7886             if (PL_curpm && r == PM_GETRE(PL_curpm))
7887                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7888         }
7889         if (!keepcopy)
7890             goto warn_undef;
7891     }
7892
7893     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7894     switch (paren) {
7895       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7896       case RX_BUFF_IDX_PREMATCH:       /* $` */
7897         if (rx->offs[0].start != -1) {
7898                         i = rx->offs[0].start;
7899                         if (i > 0) {
7900                                 s1 = 0;
7901                                 t1 = i;
7902                                 goto getlen;
7903                         }
7904             }
7905         return 0;
7906
7907       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7908       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7909             if (rx->offs[0].end != -1) {
7910                         i = rx->sublen - rx->offs[0].end;
7911                         if (i > 0) {
7912                                 s1 = rx->offs[0].end;
7913                                 t1 = rx->sublen;
7914                                 goto getlen;
7915                         }
7916             }
7917         return 0;
7918
7919       default: /* $& / ${^MATCH}, $1, $2, ... */
7920             if (paren <= (I32)rx->nparens &&
7921             (s1 = rx->offs[paren].start) != -1 &&
7922             (t1 = rx->offs[paren].end) != -1)
7923             {
7924             i = t1 - s1;
7925             goto getlen;
7926         } else {
7927           warn_undef:
7928             if (ckWARN(WARN_UNINITIALIZED))
7929                 report_uninit((const SV *)sv);
7930             return 0;
7931         }
7932     }
7933   getlen:
7934     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7935         const char * const s = rx->subbeg - rx->suboffset + s1;
7936         const U8 *ep;
7937         STRLEN el;
7938
7939         i = t1 - s1;
7940         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7941                         i = el;
7942     }
7943     return i;
7944 }
7945
7946 SV*
7947 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7948 {
7949     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7950         PERL_UNUSED_ARG(rx);
7951         if (0)
7952             return NULL;
7953         else
7954             return newSVpvs("Regexp");
7955 }
7956
7957 /* Scans the name of a named buffer from the pattern.
7958  * If flags is REG_RSN_RETURN_NULL returns null.
7959  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7960  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7961  * to the parsed name as looked up in the RExC_paren_names hash.
7962  * If there is an error throws a vFAIL().. type exception.
7963  */
7964
7965 #define REG_RSN_RETURN_NULL    0
7966 #define REG_RSN_RETURN_NAME    1
7967 #define REG_RSN_RETURN_DATA    2
7968
7969 STATIC SV*
7970 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7971 {
7972     char *name_start = RExC_parse;
7973
7974     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7975
7976     assert (RExC_parse <= RExC_end);
7977     if (RExC_parse == RExC_end) NOOP;
7978     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7979          /* skip IDFIRST by using do...while */
7980         if (UTF)
7981             do {
7982                 RExC_parse += UTF8SKIP(RExC_parse);
7983             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7984         else
7985             do {
7986                 RExC_parse++;
7987             } while (isWORDCHAR(*RExC_parse));
7988     } else {
7989         RExC_parse++; /* so the <- from the vFAIL is after the offending
7990                          character */
7991         vFAIL("Group name must start with a non-digit word character");
7992     }
7993     if ( flags ) {
7994         SV* sv_name
7995             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7996                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7997         if ( flags == REG_RSN_RETURN_NAME)
7998             return sv_name;
7999         else if (flags==REG_RSN_RETURN_DATA) {
8000             HE *he_str = NULL;
8001             SV *sv_dat = NULL;
8002             if ( ! sv_name )      /* should not happen*/
8003                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8004             if (RExC_paren_names)
8005                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8006             if ( he_str )
8007                 sv_dat = HeVAL(he_str);
8008             if ( ! sv_dat )
8009                 vFAIL("Reference to nonexistent named group");
8010             return sv_dat;
8011         }
8012         else {
8013             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8014                        (unsigned long) flags);
8015         }
8016         NOT_REACHED; /* NOT REACHED */
8017     }
8018     return NULL;
8019 }
8020
8021 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8022     int num;                                                    \
8023     if (RExC_lastparse!=RExC_parse) {                           \
8024         PerlIO_printf(Perl_debug_log, "%s",                     \
8025             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8026                 RExC_end - RExC_parse, 16,                      \
8027                 "", "",                                         \
8028                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8029                 PERL_PV_PRETTY_ELLIPSES   |                     \
8030                 PERL_PV_PRETTY_LTGT       |                     \
8031                 PERL_PV_ESCAPE_RE         |                     \
8032                 PERL_PV_PRETTY_EXACTSIZE                        \
8033             )                                                   \
8034         );                                                      \
8035     } else                                                      \
8036         PerlIO_printf(Perl_debug_log,"%16s","");                \
8037                                                                 \
8038     if (SIZE_ONLY)                                              \
8039        num = RExC_size + 1;                                     \
8040     else                                                        \
8041        num=REG_NODE_NUM(RExC_emit);                             \
8042     if (RExC_lastnum!=num)                                      \
8043        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
8044     else                                                        \
8045        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
8046     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
8047         (int)((depth*2)), "",                                   \
8048         (funcname)                                              \
8049     );                                                          \
8050     RExC_lastnum=num;                                           \
8051     RExC_lastparse=RExC_parse;                                  \
8052 })
8053
8054
8055
8056 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8057     DEBUG_PARSE_MSG((funcname));                            \
8058     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
8059 })
8060 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
8061     DEBUG_PARSE_MSG((funcname));                            \
8062     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
8063 })
8064
8065 /* This section of code defines the inversion list object and its methods.  The
8066  * interfaces are highly subject to change, so as much as possible is static to
8067  * this file.  An inversion list is here implemented as a malloc'd C UV array
8068  * as an SVt_INVLIST scalar.
8069  *
8070  * An inversion list for Unicode is an array of code points, sorted by ordinal
8071  * number.  The zeroth element is the first code point in the list.  The 1th
8072  * element is the first element beyond that not in the list.  In other words,
8073  * the first range is
8074  *  invlist[0]..(invlist[1]-1)
8075  * The other ranges follow.  Thus every element whose index is divisible by two
8076  * marks the beginning of a range that is in the list, and every element not
8077  * divisible by two marks the beginning of a range not in the list.  A single
8078  * element inversion list that contains the single code point N generally
8079  * consists of two elements
8080  *  invlist[0] == N
8081  *  invlist[1] == N+1
8082  * (The exception is when N is the highest representable value on the
8083  * machine, in which case the list containing just it would be a single
8084  * element, itself.  By extension, if the last range in the list extends to
8085  * infinity, then the first element of that range will be in the inversion list
8086  * at a position that is divisible by two, and is the final element in the
8087  * list.)
8088  * Taking the complement (inverting) an inversion list is quite simple, if the
8089  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8090  * This implementation reserves an element at the beginning of each inversion
8091  * list to always contain 0; there is an additional flag in the header which
8092  * indicates if the list begins at the 0, or is offset to begin at the next
8093  * element.
8094  *
8095  * More about inversion lists can be found in "Unicode Demystified"
8096  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8097  * More will be coming when functionality is added later.
8098  *
8099  * The inversion list data structure is currently implemented as an SV pointing
8100  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8101  * array of UV whose memory management is automatically handled by the existing
8102  * facilities for SV's.
8103  *
8104  * Some of the methods should always be private to the implementation, and some
8105  * should eventually be made public */
8106
8107 /* The header definitions are in F<inline_invlist.c> */
8108
8109 PERL_STATIC_INLINE UV*
8110 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8111 {
8112     /* Returns a pointer to the first element in the inversion list's array.
8113      * This is called upon initialization of an inversion list.  Where the
8114      * array begins depends on whether the list has the code point U+0000 in it
8115      * or not.  The other parameter tells it whether the code that follows this
8116      * call is about to put a 0 in the inversion list or not.  The first
8117      * element is either the element reserved for 0, if TRUE, or the element
8118      * after it, if FALSE */
8119
8120     bool* offset = get_invlist_offset_addr(invlist);
8121     UV* zero_addr = (UV *) SvPVX(invlist);
8122
8123     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8124
8125     /* Must be empty */
8126     assert(! _invlist_len(invlist));
8127
8128     *zero_addr = 0;
8129
8130     /* 1^1 = 0; 1^0 = 1 */
8131     *offset = 1 ^ will_have_0;
8132     return zero_addr + *offset;
8133 }
8134
8135 PERL_STATIC_INLINE UV*
8136 S_invlist_array(SV* const invlist)
8137 {
8138     /* Returns the pointer to the inversion list's array.  Every time the
8139      * length changes, this needs to be called in case malloc or realloc moved
8140      * it */
8141
8142     PERL_ARGS_ASSERT_INVLIST_ARRAY;
8143
8144     /* Must not be empty.  If these fail, you probably didn't check for <len>
8145      * being non-zero before trying to get the array */
8146     assert(_invlist_len(invlist));
8147
8148     /* The very first element always contains zero, The array begins either
8149      * there, or if the inversion list is offset, at the element after it.
8150      * The offset header field determines which; it contains 0 or 1 to indicate
8151      * how much additionally to add */
8152     assert(0 == *(SvPVX(invlist)));
8153     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8154 }
8155
8156 PERL_STATIC_INLINE void
8157 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8158 {
8159     /* Sets the current number of elements stored in the inversion list.
8160      * Updates SvCUR correspondingly */
8161     PERL_UNUSED_CONTEXT;
8162     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8163
8164     assert(SvTYPE(invlist) == SVt_INVLIST);
8165
8166     SvCUR_set(invlist,
8167               (len == 0)
8168                ? 0
8169                : TO_INTERNAL_SIZE(len + offset));
8170     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8171 }
8172
8173 #ifndef PERL_IN_XSUB_RE
8174
8175 PERL_STATIC_INLINE IV*
8176 S_get_invlist_previous_index_addr(SV* invlist)
8177 {
8178     /* Return the address of the IV that is reserved to hold the cached index
8179      * */
8180     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8181
8182     assert(SvTYPE(invlist) == SVt_INVLIST);
8183
8184     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8185 }
8186
8187 PERL_STATIC_INLINE IV
8188 S_invlist_previous_index(SV* const invlist)
8189 {
8190     /* Returns cached index of previous search */
8191
8192     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8193
8194     return *get_invlist_previous_index_addr(invlist);
8195 }
8196
8197 PERL_STATIC_INLINE void
8198 S_invlist_set_previous_index(SV* const invlist, const IV index)
8199 {
8200     /* Caches <index> for later retrieval */
8201
8202     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8203
8204     assert(index == 0 || index < (int) _invlist_len(invlist));
8205
8206     *get_invlist_previous_index_addr(invlist) = index;
8207 }
8208
8209 PERL_STATIC_INLINE void
8210 S_invlist_trim(SV* const invlist)
8211 {
8212     PERL_ARGS_ASSERT_INVLIST_TRIM;
8213
8214     assert(SvTYPE(invlist) == SVt_INVLIST);
8215
8216     /* Change the length of the inversion list to how many entries it currently
8217      * has */
8218     SvPV_shrink_to_cur((SV *) invlist);
8219 }
8220
8221 PERL_STATIC_INLINE bool
8222 S_invlist_is_iterating(SV* const invlist)
8223 {
8224     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8225
8226     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8227 }
8228
8229 #endif /* ifndef PERL_IN_XSUB_RE */
8230
8231 PERL_STATIC_INLINE UV
8232 S_invlist_max(SV* const invlist)
8233 {
8234     /* Returns the maximum number of elements storable in the inversion list's
8235      * array, without having to realloc() */
8236
8237     PERL_ARGS_ASSERT_INVLIST_MAX;
8238
8239     assert(SvTYPE(invlist) == SVt_INVLIST);
8240
8241     /* Assumes worst case, in which the 0 element is not counted in the
8242      * inversion list, so subtracts 1 for that */
8243     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8244            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8245            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8246 }
8247
8248 #ifndef PERL_IN_XSUB_RE
8249 SV*
8250 Perl__new_invlist(pTHX_ IV initial_size)
8251 {
8252
8253     /* Return a pointer to a newly constructed inversion list, with enough
8254      * space to store 'initial_size' elements.  If that number is negative, a
8255      * system default is used instead */
8256
8257     SV* new_list;
8258
8259     if (initial_size < 0) {
8260         initial_size = 10;
8261     }
8262
8263     /* Allocate the initial space */
8264     new_list = newSV_type(SVt_INVLIST);
8265
8266     /* First 1 is in case the zero element isn't in the list; second 1 is for
8267      * trailing NUL */
8268     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8269     invlist_set_len(new_list, 0, 0);
8270
8271     /* Force iterinit() to be used to get iteration to work */
8272     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8273
8274     *get_invlist_previous_index_addr(new_list) = 0;
8275
8276     return new_list;
8277 }
8278
8279 SV*
8280 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8281 {
8282     /* Return a pointer to a newly constructed inversion list, initialized to
8283      * point to <list>, which has to be in the exact correct inversion list
8284      * form, including internal fields.  Thus this is a dangerous routine that
8285      * should not be used in the wrong hands.  The passed in 'list' contains
8286      * several header fields at the beginning that are not part of the
8287      * inversion list body proper */
8288
8289     const STRLEN length = (STRLEN) list[0];
8290     const UV version_id =          list[1];
8291     const bool offset   =    cBOOL(list[2]);
8292 #define HEADER_LENGTH 3
8293     /* If any of the above changes in any way, you must change HEADER_LENGTH
8294      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8295      *      perl -E 'say int(rand 2**31-1)'
8296      */
8297 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8298                                         data structure type, so that one being
8299                                         passed in can be validated to be an
8300                                         inversion list of the correct vintage.
8301                                        */
8302
8303     SV* invlist = newSV_type(SVt_INVLIST);
8304
8305     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8306
8307     if (version_id != INVLIST_VERSION_ID) {
8308         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8309     }
8310
8311     /* The generated array passed in includes header elements that aren't part
8312      * of the list proper, so start it just after them */
8313     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8314
8315     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8316                                shouldn't touch it */
8317
8318     *(get_invlist_offset_addr(invlist)) = offset;
8319
8320     /* The 'length' passed to us is the physical number of elements in the
8321      * inversion list.  But if there is an offset the logical number is one
8322      * less than that */
8323     invlist_set_len(invlist, length  - offset, offset);
8324
8325     invlist_set_previous_index(invlist, 0);
8326
8327     /* Initialize the iteration pointer. */
8328     invlist_iterfinish(invlist);
8329
8330     SvREADONLY_on(invlist);
8331
8332     return invlist;
8333 }
8334 #endif /* ifndef PERL_IN_XSUB_RE */
8335
8336 STATIC void
8337 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8338 {
8339     /* Grow the maximum size of an inversion list */
8340
8341     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8342
8343     assert(SvTYPE(invlist) == SVt_INVLIST);
8344
8345     /* Add one to account for the zero element at the beginning which may not
8346      * be counted by the calling parameters */
8347     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8348 }
8349
8350 STATIC void
8351 S__append_range_to_invlist(pTHX_ SV* const invlist,
8352                                  const UV start, const UV end)
8353 {
8354    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8355     * the end of the inversion list.  The range must be above any existing
8356     * ones. */
8357
8358     UV* array;
8359     UV max = invlist_max(invlist);
8360     UV len = _invlist_len(invlist);
8361     bool offset;
8362
8363     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8364
8365     if (len == 0) { /* Empty lists must be initialized */
8366         offset = start != 0;
8367         array = _invlist_array_init(invlist, ! offset);
8368     }
8369     else {
8370         /* Here, the existing list is non-empty. The current max entry in the
8371          * list is generally the first value not in the set, except when the
8372          * set extends to the end of permissible values, in which case it is
8373          * the first entry in that final set, and so this call is an attempt to
8374          * append out-of-order */
8375
8376         UV final_element = len - 1;
8377         array = invlist_array(invlist);
8378         if (array[final_element] > start
8379             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8380         {
8381             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8382                      array[final_element], start,
8383                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8384         }
8385
8386         /* Here, it is a legal append.  If the new range begins with the first
8387          * value not in the set, it is extending the set, so the new first
8388          * value not in the set is one greater than the newly extended range.
8389          * */
8390         offset = *get_invlist_offset_addr(invlist);
8391         if (array[final_element] == start) {
8392             if (end != UV_MAX) {
8393                 array[final_element] = end + 1;
8394             }
8395             else {
8396                 /* But if the end is the maximum representable on the machine,
8397                  * just let the range that this would extend to have no end */
8398                 invlist_set_len(invlist, len - 1, offset);
8399             }
8400             return;
8401         }
8402     }
8403
8404     /* Here the new range doesn't extend any existing set.  Add it */
8405
8406     len += 2;   /* Includes an element each for the start and end of range */
8407
8408     /* If wll overflow the existing space, extend, which may cause the array to
8409      * be moved */
8410     if (max < len) {
8411         invlist_extend(invlist, len);
8412
8413         /* Have to set len here to avoid assert failure in invlist_array() */
8414         invlist_set_len(invlist, len, offset);
8415
8416         array = invlist_array(invlist);
8417     }
8418     else {
8419         invlist_set_len(invlist, len, offset);
8420     }
8421
8422     /* The next item on the list starts the range, the one after that is
8423      * one past the new range.  */
8424     array[len - 2] = start;
8425     if (end != UV_MAX) {
8426         array[len - 1] = end + 1;
8427     }
8428     else {
8429         /* But if the end is the maximum representable on the machine, just let
8430          * the range have no end */
8431         invlist_set_len(invlist, len - 1, offset);
8432     }
8433 }
8434
8435 #ifndef PERL_IN_XSUB_RE
8436
8437 IV
8438 Perl__invlist_search(SV* const invlist, const UV cp)
8439 {
8440     /* Searches the inversion list for the entry that contains the input code
8441      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8442      * return value is the index into the list's array of the range that
8443      * contains <cp> */
8444
8445     IV low = 0;
8446     IV mid;
8447     IV high = _invlist_len(invlist);
8448     const IV highest_element = high - 1;
8449     const UV* array;
8450
8451     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8452
8453     /* If list is empty, return failure. */
8454     if (high == 0) {
8455         return -1;
8456     }
8457
8458     /* (We can't get the array unless we know the list is non-empty) */
8459     array = invlist_array(invlist);
8460
8461     mid = invlist_previous_index(invlist);
8462     assert(mid >=0 && mid <= highest_element);
8463
8464     /* <mid> contains the cache of the result of the previous call to this
8465      * function (0 the first time).  See if this call is for the same result,
8466      * or if it is for mid-1.  This is under the theory that calls to this
8467      * function will often be for related code points that are near each other.
8468      * And benchmarks show that caching gives better results.  We also test
8469      * here if the code point is within the bounds of the list.  These tests
8470      * replace others that would have had to be made anyway to make sure that
8471      * the array bounds were not exceeded, and these give us extra information
8472      * at the same time */
8473     if (cp >= array[mid]) {
8474         if (cp >= array[highest_element]) {
8475             return highest_element;
8476         }
8477
8478         /* Here, array[mid] <= cp < array[highest_element].  This means that
8479          * the final element is not the answer, so can exclude it; it also
8480          * means that <mid> is not the final element, so can refer to 'mid + 1'
8481          * safely */
8482         if (cp < array[mid + 1]) {
8483             return mid;
8484         }
8485         high--;
8486         low = mid + 1;
8487     }
8488     else { /* cp < aray[mid] */
8489         if (cp < array[0]) { /* Fail if outside the array */
8490             return -1;
8491         }
8492         high = mid;
8493         if (cp >= array[mid - 1]) {
8494             goto found_entry;
8495         }
8496     }
8497
8498     /* Binary search.  What we are looking for is <i> such that
8499      *  array[i] <= cp < array[i+1]
8500      * The loop below converges on the i+1.  Note that there may not be an
8501      * (i+1)th element in the array, and things work nonetheless */
8502     while (low < high) {
8503         mid = (low + high) / 2;
8504         assert(mid <= highest_element);
8505         if (array[mid] <= cp) { /* cp >= array[mid] */
8506             low = mid + 1;
8507
8508             /* We could do this extra test to exit the loop early.
8509             if (cp < array[low]) {
8510                 return mid;
8511             }
8512             */
8513         }
8514         else { /* cp < array[mid] */
8515             high = mid;
8516         }
8517     }
8518
8519   found_entry:
8520     high--;
8521     invlist_set_previous_index(invlist, high);
8522     return high;
8523 }
8524
8525 void
8526 Perl__invlist_populate_swatch(SV* const invlist,
8527                               const UV start, const UV end, U8* swatch)
8528 {
8529     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8530      * but is used when the swash has an inversion list.  This makes this much
8531      * faster, as it uses a binary search instead of a linear one.  This is
8532      * intimately tied to that function, and perhaps should be in utf8.c,
8533      * except it is intimately tied to inversion lists as well.  It assumes
8534      * that <swatch> is all 0's on input */
8535
8536     UV current = start;
8537     const IV len = _invlist_len(invlist);
8538     IV i;
8539     const UV * array;
8540
8541     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8542
8543     if (len == 0) { /* Empty inversion list */
8544         return;
8545     }
8546
8547     array = invlist_array(invlist);
8548
8549     /* Find which element it is */
8550     i = _invlist_search(invlist, start);
8551
8552     /* We populate from <start> to <end> */
8553     while (current < end) {
8554         UV upper;
8555
8556         /* The inversion list gives the results for every possible code point
8557          * after the first one in the list.  Only those ranges whose index is
8558          * even are ones that the inversion list matches.  For the odd ones,
8559          * and if the initial code point is not in the list, we have to skip
8560          * forward to the next element */
8561         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8562             i++;
8563             if (i >= len) { /* Finished if beyond the end of the array */
8564                 return;
8565             }
8566             current = array[i];
8567             if (current >= end) {   /* Finished if beyond the end of what we
8568                                        are populating */
8569                 if (LIKELY(end < UV_MAX)) {
8570                     return;
8571                 }
8572
8573                 /* We get here when the upper bound is the maximum
8574                  * representable on the machine, and we are looking for just
8575                  * that code point.  Have to special case it */
8576                 i = len;
8577                 goto join_end_of_list;
8578             }
8579         }
8580         assert(current >= start);
8581
8582         /* The current range ends one below the next one, except don't go past
8583          * <end> */
8584         i++;
8585         upper = (i < len && array[i] < end) ? array[i] : end;
8586
8587         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8588          * for each code point in it */
8589         for (; current < upper; current++) {
8590             const STRLEN offset = (STRLEN)(current - start);
8591             swatch[offset >> 3] |= 1 << (offset & 7);
8592         }
8593
8594     join_end_of_list:
8595
8596         /* Quit if at the end of the list */
8597         if (i >= len) {
8598
8599             /* But first, have to deal with the highest possible code point on
8600              * the platform.  The previous code assumes that <end> is one
8601              * beyond where we want to populate, but that is impossible at the
8602              * platform's infinity, so have to handle it specially */
8603             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8604             {
8605                 const STRLEN offset = (STRLEN)(end - start);
8606                 swatch[offset >> 3] |= 1 << (offset & 7);
8607             }
8608             return;
8609         }
8610
8611         /* Advance to the next range, which will be for code points not in the
8612          * inversion list */
8613         current = array[i];
8614     }
8615
8616     return;
8617 }
8618
8619 void
8620 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8621                                          const bool complement_b, SV** output)
8622 {
8623     /* Take the union of two inversion lists and point <output> to it.  *output
8624      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8625      * the reference count to that list will be decremented if not already a
8626      * temporary (mortal); otherwise *output will be made correspondingly
8627      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8628      * second list is returned.  If <complement_b> is TRUE, the union is taken
8629      * of the complement (inversion) of <b> instead of b itself.
8630      *
8631      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8632      * Richard Gillam, published by Addison-Wesley, and explained at some
8633      * length there.  The preface says to incorporate its examples into your
8634      * code at your own risk.
8635      *
8636      * The algorithm is like a merge sort.
8637      *
8638      * XXX A potential performance improvement is to keep track as we go along
8639      * if only one of the inputs contributes to the result, meaning the other
8640      * is a subset of that one.  In that case, we can skip the final copy and
8641      * return the larger of the input lists, but then outside code might need
8642      * to keep track of whether to free the input list or not */
8643
8644     const UV* array_a;    /* a's array */
8645     const UV* array_b;
8646     UV len_a;       /* length of a's array */
8647     UV len_b;
8648
8649     SV* u;                      /* the resulting union */
8650     UV* array_u;
8651     UV len_u;
8652
8653     UV i_a = 0;             /* current index into a's array */
8654     UV i_b = 0;
8655     UV i_u = 0;
8656
8657     /* running count, as explained in the algorithm source book; items are
8658      * stopped accumulating and are output when the count changes to/from 0.
8659      * The count is incremented when we start a range that's in the set, and
8660      * decremented when we start a range that's not in the set.  So its range
8661      * is 0 to 2.  Only when the count is zero is something not in the set.
8662      */
8663     UV count = 0;
8664
8665     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8666     assert(a != b);
8667
8668     /* If either one is empty, the union is the other one */
8669     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8670         bool make_temp = FALSE; /* Should we mortalize the result? */
8671
8672         if (*output == a) {
8673             if (a != NULL) {
8674                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8675                     SvREFCNT_dec_NN(a);
8676                 }
8677             }
8678         }
8679         if (*output != b) {
8680             *output = invlist_clone(b);
8681             if (complement_b) {
8682                 _invlist_invert(*output);
8683             }
8684         } /* else *output already = b; */
8685
8686         if (make_temp) {
8687             sv_2mortal(*output);
8688         }
8689         return;
8690     }
8691     else if ((len_b = _invlist_len(b)) == 0) {
8692         bool make_temp = FALSE;
8693         if (*output == b) {
8694             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8695                 SvREFCNT_dec_NN(b);
8696             }
8697         }
8698
8699         /* The complement of an empty list is a list that has everything in it,
8700          * so the union with <a> includes everything too */
8701         if (complement_b) {
8702             if (a == *output) {
8703                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8704                     SvREFCNT_dec_NN(a);
8705                 }
8706             }
8707             *output = _new_invlist(1);
8708             _append_range_to_invlist(*output, 0, UV_MAX);
8709         }
8710         else if (*output != a) {
8711             *output = invlist_clone(a);
8712         }
8713         /* else *output already = a; */
8714
8715         if (make_temp) {
8716             sv_2mortal(*output);
8717         }
8718         return;
8719     }
8720
8721     /* Here both lists exist and are non-empty */
8722     array_a = invlist_array(a);
8723     array_b = invlist_array(b);
8724
8725     /* If are to take the union of 'a' with the complement of b, set it
8726      * up so are looking at b's complement. */
8727     if (complement_b) {
8728
8729         /* To complement, we invert: if the first element is 0, remove it.  To
8730          * do this, we just pretend the array starts one later */
8731         if (array_b[0] == 0) {
8732             array_b++;
8733             len_b--;
8734         }
8735         else {
8736
8737             /* But if the first element is not zero, we pretend the list starts
8738              * at the 0 that is always stored immediately before the array. */
8739             array_b--;
8740             len_b++;
8741         }
8742     }
8743
8744     /* Size the union for the worst case: that the sets are completely
8745      * disjoint */
8746     u = _new_invlist(len_a + len_b);
8747
8748     /* Will contain U+0000 if either component does */
8749     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8750                                       || (len_b > 0 && array_b[0] == 0));
8751
8752     /* Go through each list item by item, stopping when exhausted one of
8753      * them */
8754     while (i_a < len_a && i_b < len_b) {
8755         UV cp;      /* The element to potentially add to the union's array */
8756         bool cp_in_set;   /* is it in the the input list's set or not */
8757
8758         /* We need to take one or the other of the two inputs for the union.
8759          * Since we are merging two sorted lists, we take the smaller of the
8760          * next items.  In case of a tie, we take the one that is in its set
8761          * first.  If we took one not in the set first, it would decrement the
8762          * count, possibly to 0 which would cause it to be output as ending the
8763          * range, and the next time through we would take the same number, and
8764          * output it again as beginning the next range.  By doing it the
8765          * opposite way, there is no possibility that the count will be
8766          * momentarily decremented to 0, and thus the two adjoining ranges will
8767          * be seamlessly merged.  (In a tie and both are in the set or both not
8768          * in the set, it doesn't matter which we take first.) */
8769         if (array_a[i_a] < array_b[i_b]
8770             || (array_a[i_a] == array_b[i_b]
8771                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8772         {
8773             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8774             cp= array_a[i_a++];
8775         }
8776         else {
8777             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8778             cp = array_b[i_b++];
8779         }
8780
8781         /* Here, have chosen which of the two inputs to look at.  Only output
8782          * if the running count changes to/from 0, which marks the
8783          * beginning/end of a range in that's in the set */
8784         if (cp_in_set) {
8785             if (count == 0) {
8786                 array_u[i_u++] = cp;
8787             }
8788             count++;
8789         }
8790         else {
8791             count--;
8792             if (count == 0) {
8793                 array_u[i_u++] = cp;
8794             }
8795         }
8796     }
8797
8798     /* Here, we are finished going through at least one of the lists, which
8799      * means there is something remaining in at most one.  We check if the list
8800      * that hasn't been exhausted is positioned such that we are in the middle
8801      * of a range in its set or not.  (i_a and i_b point to the element beyond
8802      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8803      * is potentially more to output.
8804      * There are four cases:
8805      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8806      *     in the union is entirely from the non-exhausted set.
8807      *  2) Both were in their sets, count is 2.  Nothing further should
8808      *     be output, as everything that remains will be in the exhausted
8809      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8810      *     that
8811      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8812      *     Nothing further should be output because the union includes
8813      *     everything from the exhausted set.  Not decrementing ensures that.
8814      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8815      *     decrementing to 0 insures that we look at the remainder of the
8816      *     non-exhausted set */
8817     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8818         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8819     {
8820         count--;
8821     }
8822
8823     /* The final length is what we've output so far, plus what else is about to
8824      * be output.  (If 'count' is non-zero, then the input list we exhausted
8825      * has everything remaining up to the machine's limit in its set, and hence
8826      * in the union, so there will be no further output. */
8827     len_u = i_u;
8828     if (count == 0) {
8829         /* At most one of the subexpressions will be non-zero */
8830         len_u += (len_a - i_a) + (len_b - i_b);
8831     }
8832
8833     /* Set result to final length, which can change the pointer to array_u, so
8834      * re-find it */
8835     if (len_u != _invlist_len(u)) {
8836         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8837         invlist_trim(u);
8838         array_u = invlist_array(u);
8839     }
8840
8841     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8842      * the other) ended with everything above it not in its set.  That means
8843      * that the remaining part of the union is precisely the same as the
8844      * non-exhausted list, so can just copy it unchanged.  (If both list were
8845      * exhausted at the same time, then the operations below will be both 0.)
8846      */
8847     if (count == 0) {
8848         IV copy_count; /* At most one will have a non-zero copy count */
8849         if ((copy_count = len_a - i_a) > 0) {
8850             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8851         }
8852         else if ((copy_count = len_b - i_b) > 0) {
8853             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8854         }
8855     }
8856
8857     /*  We may be removing a reference to one of the inputs.  If so, the output
8858      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8859      *  count decremented) */
8860     if (a == *output || b == *output) {
8861         assert(! invlist_is_iterating(*output));
8862         if ((SvTEMP(*output))) {
8863             sv_2mortal(u);
8864         }
8865         else {
8866             SvREFCNT_dec_NN(*output);
8867         }
8868     }
8869
8870     *output = u;
8871
8872     return;
8873 }
8874
8875 void
8876 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8877                                                const bool complement_b, SV** i)
8878 {
8879     /* Take the intersection of two inversion lists and point <i> to it.  *i
8880      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8881      * the reference count to that list will be decremented if not already a
8882      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8883      * The first list, <a>, may be NULL, in which case an empty list is
8884      * returned.  If <complement_b> is TRUE, the result will be the
8885      * intersection of <a> and the complement (or inversion) of <b> instead of
8886      * <b> directly.
8887      *
8888      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8889      * Richard Gillam, published by Addison-Wesley, and explained at some
8890      * length there.  The preface says to incorporate its examples into your
8891      * code at your own risk.  In fact, it had bugs
8892      *
8893      * The algorithm is like a merge sort, and is essentially the same as the
8894      * union above
8895      */
8896
8897     const UV* array_a;          /* a's array */
8898     const UV* array_b;
8899     UV len_a;   /* length of a's array */
8900     UV len_b;
8901
8902     SV* r;                   /* the resulting intersection */
8903     UV* array_r;
8904     UV len_r;
8905
8906     UV i_a = 0;             /* current index into a's array */
8907     UV i_b = 0;
8908     UV i_r = 0;
8909
8910     /* running count, as explained in the algorithm source book; items are
8911      * stopped accumulating and are output when the count changes to/from 2.
8912      * The count is incremented when we start a range that's in the set, and
8913      * decremented when we start a range that's not in the set.  So its range
8914      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8915      */
8916     UV count = 0;
8917
8918     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8919     assert(a != b);
8920
8921     /* Special case if either one is empty */
8922     len_a = (a == NULL) ? 0 : _invlist_len(a);
8923     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8924         bool make_temp = FALSE;
8925
8926         if (len_a != 0 && complement_b) {
8927
8928             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8929              * be empty.  Here, also we are using 'b's complement, which hence
8930              * must be every possible code point.  Thus the intersection is
8931              * simply 'a'. */
8932             if (*i != a) {
8933                 if (*i == b) {
8934                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8935                         SvREFCNT_dec_NN(b);
8936                     }
8937                 }
8938
8939                 *i = invlist_clone(a);
8940             }
8941             /* else *i is already 'a' */
8942
8943             if (make_temp) {
8944                 sv_2mortal(*i);
8945             }
8946             return;
8947         }
8948
8949         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8950          * intersection must be empty */
8951         if (*i == a) {
8952             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8953                 SvREFCNT_dec_NN(a);
8954             }
8955         }
8956         else if (*i == b) {
8957             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8958                 SvREFCNT_dec_NN(b);
8959             }
8960         }
8961         *i = _new_invlist(0);
8962         if (make_temp) {
8963             sv_2mortal(*i);
8964         }
8965
8966         return;
8967     }
8968
8969     /* Here both lists exist and are non-empty */
8970     array_a = invlist_array(a);
8971     array_b = invlist_array(b);
8972
8973     /* If are to take the intersection of 'a' with the complement of b, set it
8974      * up so are looking at b's complement. */
8975     if (complement_b) {
8976
8977         /* To complement, we invert: if the first element is 0, remove it.  To
8978          * do this, we just pretend the array starts one later */
8979         if (array_b[0] == 0) {
8980             array_b++;
8981             len_b--;
8982         }
8983         else {
8984
8985             /* But if the first element is not zero, we pretend the list starts
8986              * at the 0 that is always stored immediately before the array. */
8987             array_b--;
8988             len_b++;
8989         }
8990     }
8991
8992     /* Size the intersection for the worst case: that the intersection ends up
8993      * fragmenting everything to be completely disjoint */
8994     r= _new_invlist(len_a + len_b);
8995
8996     /* Will contain U+0000 iff both components do */
8997     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8998                                      && len_b > 0 && array_b[0] == 0);
8999
9000     /* Go through each list item by item, stopping when exhausted one of
9001      * them */
9002     while (i_a < len_a && i_b < len_b) {
9003         UV cp;      /* The element to potentially add to the intersection's
9004                        array */
9005         bool cp_in_set; /* Is it in the input list's set or not */
9006
9007         /* We need to take one or the other of the two inputs for the
9008          * intersection.  Since we are merging two sorted lists, we take the
9009          * smaller of the next items.  In case of a tie, we take the one that
9010          * is not in its set first (a difference from the union algorithm).  If
9011          * we took one in the set first, it would increment the count, possibly
9012          * to 2 which would cause it to be output as starting a range in the
9013          * intersection, and the next time through we would take that same
9014          * number, and output it again as ending the set.  By doing it the
9015          * opposite of this, there is no possibility that the count will be
9016          * momentarily incremented to 2.  (In a tie and both are in the set or
9017          * both not in the set, it doesn't matter which we take first.) */
9018         if (array_a[i_a] < array_b[i_b]
9019             || (array_a[i_a] == array_b[i_b]
9020                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9021         {
9022             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9023             cp= array_a[i_a++];
9024         }
9025         else {
9026             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9027             cp= array_b[i_b++];
9028         }
9029
9030         /* Here, have chosen which of the two inputs to look at.  Only output
9031          * if the running count changes to/from 2, which marks the
9032          * beginning/end of a range that's in the intersection */
9033         if (cp_in_set) {
9034             count++;
9035             if (count == 2) {
9036                 array_r[i_r++] = cp;
9037             }
9038         }
9039         else {
9040             if (count == 2) {
9041                 array_r[i_r++] = cp;
9042             }
9043             count--;
9044         }
9045     }
9046
9047     /* Here, we are finished going through at least one of the lists, which
9048      * means there is something remaining in at most one.  We check if the list
9049      * that has been exhausted is positioned such that we are in the middle
9050      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
9051      * the ones we care about.)  There are four cases:
9052      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
9053      *     nothing left in the intersection.
9054      *  2) Both were in their sets, count is 2 and perhaps is incremented to
9055      *     above 2.  What should be output is exactly that which is in the
9056      *     non-exhausted set, as everything it has is also in the intersection
9057      *     set, and everything it doesn't have can't be in the intersection
9058      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9059      *     gets incremented to 2.  Like the previous case, the intersection is
9060      *     everything that remains in the non-exhausted set.
9061      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9062      *     remains 1.  And the intersection has nothing more. */
9063     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9064         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9065     {
9066         count++;
9067     }
9068
9069     /* The final length is what we've output so far plus what else is in the
9070      * intersection.  At most one of the subexpressions below will be non-zero
9071      * */
9072     len_r = i_r;
9073     if (count >= 2) {
9074         len_r += (len_a - i_a) + (len_b - i_b);
9075     }
9076
9077     /* Set result to final length, which can change the pointer to array_r, so
9078      * re-find it */
9079     if (len_r != _invlist_len(r)) {
9080         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9081         invlist_trim(r);
9082         array_r = invlist_array(r);
9083     }
9084
9085     /* Finish outputting any remaining */
9086     if (count >= 2) { /* At most one will have a non-zero copy count */
9087         IV copy_count;
9088         if ((copy_count = len_a - i_a) > 0) {
9089             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9090         }
9091         else if ((copy_count = len_b - i_b) > 0) {
9092             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9093         }
9094     }
9095
9096     /*  We may be removing a reference to one of the inputs.  If so, the output
9097      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
9098      *  count decremented) */
9099     if (a == *i || b == *i) {
9100         assert(! invlist_is_iterating(*i));
9101         if (SvTEMP(*i)) {
9102             sv_2mortal(r);
9103         }
9104         else {
9105             SvREFCNT_dec_NN(*i);
9106         }
9107     }
9108
9109     *i = r;
9110
9111     return;
9112 }
9113
9114 SV*
9115 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9116 {
9117     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9118      * set.  A pointer to the inversion list is returned.  This may actually be
9119      * a new list, in which case the passed in one has been destroyed.  The
9120      * passed-in inversion list can be NULL, in which case a new one is created
9121      * with just the one range in it */
9122
9123     SV* range_invlist;
9124     UV len;
9125
9126     if (invlist == NULL) {
9127         invlist = _new_invlist(2);
9128         len = 0;
9129     }
9130     else {
9131         len = _invlist_len(invlist);
9132     }
9133
9134     /* If comes after the final entry actually in the list, can just append it
9135      * to the end, */
9136     if (len == 0
9137         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9138             && start >= invlist_array(invlist)[len - 1]))
9139     {
9140         _append_range_to_invlist(invlist, start, end);
9141         return invlist;
9142     }
9143
9144     /* Here, can't just append things, create and return a new inversion list
9145      * which is the union of this range and the existing inversion list */
9146     range_invlist = _new_invlist(2);
9147     _append_range_to_invlist(range_invlist, start, end);
9148
9149     _invlist_union(invlist, range_invlist, &invlist);
9150
9151     /* The temporary can be freed */
9152     SvREFCNT_dec_NN(range_invlist);
9153
9154     return invlist;
9155 }
9156
9157 SV*
9158 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9159                                  UV** other_elements_ptr)
9160 {
9161     /* Create and return an inversion list whose contents are to be populated
9162      * by the caller.  The caller gives the number of elements (in 'size') and
9163      * the very first element ('element0').  This function will set
9164      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9165      * are to be placed.
9166      *
9167      * Obviously there is some trust involved that the caller will properly
9168      * fill in the other elements of the array.
9169      *
9170      * (The first element needs to be passed in, as the underlying code does
9171      * things differently depending on whether it is zero or non-zero) */
9172
9173     SV* invlist = _new_invlist(size);
9174     bool offset;
9175
9176     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9177
9178     _append_range_to_invlist(invlist, element0, element0);
9179     offset = *get_invlist_offset_addr(invlist);
9180
9181     invlist_set_len(invlist, size, offset);
9182     *other_elements_ptr = invlist_array(invlist) + 1;
9183     return invlist;
9184 }
9185
9186 #endif
9187
9188 PERL_STATIC_INLINE SV*
9189 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9190     return _add_range_to_invlist(invlist, cp, cp);
9191 }
9192
9193 #ifndef PERL_IN_XSUB_RE
9194 void
9195 Perl__invlist_invert(pTHX_ SV* const invlist)
9196 {
9197     /* Complement the input inversion list.  This adds a 0 if the list didn't
9198      * have a zero; removes it otherwise.  As described above, the data
9199      * structure is set up so that this is very efficient */
9200
9201     PERL_ARGS_ASSERT__INVLIST_INVERT;
9202
9203     assert(! invlist_is_iterating(invlist));
9204
9205     /* The inverse of matching nothing is matching everything */
9206     if (_invlist_len(invlist) == 0) {
9207         _append_range_to_invlist(invlist, 0, UV_MAX);
9208         return;
9209     }
9210
9211     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9212 }
9213
9214 #endif
9215
9216 PERL_STATIC_INLINE SV*
9217 S_invlist_clone(pTHX_ SV* const invlist)
9218 {
9219
9220     /* Return a new inversion list that is a copy of the input one, which is
9221      * unchanged.  The new list will not be mortal even if the old one was. */
9222
9223     /* Need to allocate extra space to accommodate Perl's addition of a
9224      * trailing NUL to SvPV's, since it thinks they are always strings */
9225     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9226     STRLEN physical_length = SvCUR(invlist);
9227     bool offset = *(get_invlist_offset_addr(invlist));
9228
9229     PERL_ARGS_ASSERT_INVLIST_CLONE;
9230
9231     *(get_invlist_offset_addr(new_invlist)) = offset;
9232     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9233     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9234
9235     return new_invlist;
9236 }
9237
9238 PERL_STATIC_INLINE STRLEN*
9239 S_get_invlist_iter_addr(SV* invlist)
9240 {
9241     /* Return the address of the UV that contains the current iteration
9242      * position */
9243
9244     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9245
9246     assert(SvTYPE(invlist) == SVt_INVLIST);
9247
9248     return &(((XINVLIST*) SvANY(invlist))->iterator);
9249 }
9250
9251 PERL_STATIC_INLINE void
9252 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9253 {
9254     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9255
9256     *get_invlist_iter_addr(invlist) = 0;
9257 }
9258
9259 PERL_STATIC_INLINE void
9260 S_invlist_iterfinish(SV* invlist)
9261 {
9262     /* Terminate iterator for invlist.  This is to catch development errors.
9263      * Any iteration that is interrupted before completed should call this
9264      * function.  Functions that add code points anywhere else but to the end
9265      * of an inversion list assert that they are not in the middle of an
9266      * iteration.  If they were, the addition would make the iteration
9267      * problematical: if the iteration hadn't reached the place where things
9268      * were being added, it would be ok */
9269
9270     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9271
9272     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9273 }
9274
9275 STATIC bool
9276 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9277 {
9278     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9279      * This call sets in <*start> and <*end>, the next range in <invlist>.
9280      * Returns <TRUE> if successful and the next call will return the next
9281      * range; <FALSE> if was already at the end of the list.  If the latter,
9282      * <*start> and <*end> are unchanged, and the next call to this function
9283      * will start over at the beginning of the list */
9284
9285     STRLEN* pos = get_invlist_iter_addr(invlist);
9286     UV len = _invlist_len(invlist);
9287     UV *array;
9288
9289     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9290
9291     if (*pos >= len) {
9292         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9293         return FALSE;
9294     }
9295
9296     array = invlist_array(invlist);
9297
9298     *start = array[(*pos)++];
9299
9300     if (*pos >= len) {
9301         *end = UV_MAX;
9302     }
9303     else {
9304         *end = array[(*pos)++] - 1;
9305     }
9306
9307     return TRUE;
9308 }
9309
9310 PERL_STATIC_INLINE UV
9311 S_invlist_highest(SV* const invlist)
9312 {
9313     /* Returns the highest code point that matches an inversion list.  This API
9314      * has an ambiguity, as it returns 0 under either the highest is actually
9315      * 0, or if the list is empty.  If this distinction matters to you, check
9316      * for emptiness before calling this function */
9317
9318     UV len = _invlist_len(invlist);
9319     UV *array;
9320
9321     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9322
9323     if (len == 0) {
9324         return 0;
9325     }
9326
9327     array = invlist_array(invlist);
9328
9329     /* The last element in the array in the inversion list always starts a
9330      * range that goes to infinity.  That range may be for code points that are
9331      * matched in the inversion list, or it may be for ones that aren't
9332      * matched.  In the latter case, the highest code point in the set is one
9333      * less than the beginning of this range; otherwise it is the final element
9334      * of this range: infinity */
9335     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9336            ? UV_MAX
9337            : array[len - 1] - 1;
9338 }
9339
9340 #ifndef PERL_IN_XSUB_RE
9341 SV *
9342 Perl__invlist_contents(pTHX_ SV* const invlist)
9343 {
9344     /* Get the contents of an inversion list into a string SV so that they can
9345      * be printed out.  It uses the format traditionally done for debug tracing
9346      */
9347
9348     UV start, end;
9349     SV* output = newSVpvs("\n");
9350
9351     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9352
9353     assert(! invlist_is_iterating(invlist));
9354
9355     invlist_iterinit(invlist);
9356     while (invlist_iternext(invlist, &start, &end)) {
9357         if (end == UV_MAX) {
9358             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9359         }
9360         else if (end != start) {
9361             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9362                     start,       end);
9363         }
9364         else {
9365             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9366         }
9367     }
9368
9369     return output;
9370 }
9371 #endif
9372
9373 #ifndef PERL_IN_XSUB_RE
9374 void
9375 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9376                          const char * const indent, SV* const invlist)
9377 {
9378     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9379      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9380      * the string 'indent'.  The output looks like this:
9381          [0] 0x000A .. 0x000D
9382          [2] 0x0085
9383          [4] 0x2028 .. 0x2029
9384          [6] 0x3104 .. INFINITY
9385      * This means that the first range of code points matched by the list are
9386      * 0xA through 0xD; the second range contains only the single code point
9387      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9388      * are used to define each range (except if the final range extends to
9389      * infinity, only a single element is needed).  The array index of the
9390      * first element for the corresponding range is given in brackets. */
9391
9392     UV start, end;
9393     STRLEN count = 0;
9394
9395     PERL_ARGS_ASSERT__INVLIST_DUMP;
9396
9397     if (invlist_is_iterating(invlist)) {
9398         Perl_dump_indent(aTHX_ level, file,
9399              "%sCan't dump inversion list because is in middle of iterating\n",
9400              indent);
9401         return;
9402     }
9403
9404     invlist_iterinit(invlist);
9405     while (invlist_iternext(invlist, &start, &end)) {
9406         if (end == UV_MAX) {
9407             Perl_dump_indent(aTHX_ level, file,
9408                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9409                                    indent, (UV)count, start);
9410         }
9411         else if (end != start) {
9412             Perl_dump_indent(aTHX_ level, file,
9413                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9414                                 indent, (UV)count, start,         end);
9415         }
9416         else {
9417             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9418                                             indent, (UV)count, start);
9419         }
9420         count += 2;
9421     }
9422 }
9423
9424 void
9425 Perl__load_PL_utf8_foldclosures (pTHX)
9426 {
9427     assert(! PL_utf8_foldclosures);
9428
9429     /* If the folds haven't been read in, call a fold function
9430      * to force that */
9431     if (! PL_utf8_tofold) {
9432         U8 dummy[UTF8_MAXBYTES_CASE+1];
9433
9434         /* This string is just a short named one above \xff */
9435         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9436         assert(PL_utf8_tofold); /* Verify that worked */
9437     }
9438     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9439 }
9440 #endif
9441
9442 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9443 bool
9444 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9445 {
9446     /* Return a boolean as to if the two passed in inversion lists are
9447      * identical.  The final argument, if TRUE, says to take the complement of
9448      * the second inversion list before doing the comparison */
9449
9450     const UV* array_a = invlist_array(a);
9451     const UV* array_b = invlist_array(b);
9452     UV len_a = _invlist_len(a);
9453     UV len_b = _invlist_len(b);
9454
9455     UV i = 0;               /* current index into the arrays */
9456     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9457
9458     PERL_ARGS_ASSERT__INVLISTEQ;
9459
9460     /* If are to compare 'a' with the complement of b, set it
9461      * up so are looking at b's complement. */
9462     if (complement_b) {
9463
9464         /* The complement of nothing is everything, so <a> would have to have
9465          * just one element, starting at zero (ending at infinity) */
9466         if (len_b == 0) {
9467             return (len_a == 1 && array_a[0] == 0);
9468         }
9469         else if (array_b[0] == 0) {
9470
9471             /* Otherwise, to complement, we invert.  Here, the first element is
9472              * 0, just remove it.  To do this, we just pretend the array starts
9473              * one later */
9474
9475             array_b++;
9476             len_b--;
9477         }
9478         else {
9479
9480             /* But if the first element is not zero, we pretend the list starts
9481              * at the 0 that is always stored immediately before the array. */
9482             array_b--;
9483             len_b++;
9484         }
9485     }
9486
9487     /* Make sure that the lengths are the same, as well as the final element
9488      * before looping through the remainder.  (Thus we test the length, final,
9489      * and first elements right off the bat) */
9490     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9491         retval = FALSE;
9492     }
9493     else for (i = 0; i < len_a - 1; i++) {
9494         if (array_a[i] != array_b[i]) {
9495             retval = FALSE;
9496             break;
9497         }
9498     }
9499
9500     return retval;
9501 }
9502 #endif
9503
9504 #undef HEADER_LENGTH
9505 #undef TO_INTERNAL_SIZE
9506 #undef FROM_INTERNAL_SIZE
9507 #undef INVLIST_VERSION_ID
9508
9509 /* End of inversion list object */
9510
9511 STATIC void
9512 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9513 {
9514     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9515      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9516      * should point to the first flag; it is updated on output to point to the
9517      * final ')' or ':'.  There needs to be at least one flag, or this will
9518      * abort */
9519
9520     /* for (?g), (?gc), and (?o) warnings; warning
9521        about (?c) will warn about (?g) -- japhy    */
9522
9523 #define WASTED_O  0x01
9524 #define WASTED_G  0x02
9525 #define WASTED_C  0x04
9526 #define WASTED_GC (WASTED_G|WASTED_C)
9527     I32 wastedflags = 0x00;
9528     U32 posflags = 0, negflags = 0;
9529     U32 *flagsp = &posflags;
9530     char has_charset_modifier = '\0';
9531     regex_charset cs;
9532     bool has_use_defaults = FALSE;
9533     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9534     int x_mod_count = 0;
9535
9536     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9537
9538     /* '^' as an initial flag sets certain defaults */
9539     if (UCHARAT(RExC_parse) == '^') {
9540         RExC_parse++;
9541         has_use_defaults = TRUE;
9542         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9543         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9544                                         ? REGEX_UNICODE_CHARSET
9545                                         : REGEX_DEPENDS_CHARSET);
9546     }
9547
9548     cs = get_regex_charset(RExC_flags);
9549     if (cs == REGEX_DEPENDS_CHARSET
9550         && (RExC_utf8 || RExC_uni_semantics))
9551     {
9552         cs = REGEX_UNICODE_CHARSET;
9553     }
9554
9555     while (*RExC_parse) {
9556         /* && strchr("iogcmsx", *RExC_parse) */
9557         /* (?g), (?gc) and (?o) are useless here
9558            and must be globally applied -- japhy */
9559         switch (*RExC_parse) {
9560
9561             /* Code for the imsx flags */
9562             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9563
9564             case LOCALE_PAT_MOD:
9565                 if (has_charset_modifier) {
9566                     goto excess_modifier;
9567                 }
9568                 else if (flagsp == &negflags) {
9569                     goto neg_modifier;
9570                 }
9571                 cs = REGEX_LOCALE_CHARSET;
9572                 has_charset_modifier = LOCALE_PAT_MOD;
9573                 break;
9574             case UNICODE_PAT_MOD:
9575                 if (has_charset_modifier) {
9576                     goto excess_modifier;
9577                 }
9578                 else if (flagsp == &negflags) {
9579                     goto neg_modifier;
9580                 }
9581                 cs = REGEX_UNICODE_CHARSET;
9582                 has_charset_modifier = UNICODE_PAT_MOD;
9583                 break;
9584             case ASCII_RESTRICT_PAT_MOD:
9585                 if (flagsp == &negflags) {
9586                     goto neg_modifier;
9587                 }
9588                 if (has_charset_modifier) {
9589                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9590                         goto excess_modifier;
9591                     }
9592                     /* Doubled modifier implies more restricted */
9593                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9594                 }
9595                 else {
9596                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9597                 }
9598                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9599                 break;
9600             case DEPENDS_PAT_MOD:
9601                 if (has_use_defaults) {
9602                     goto fail_modifiers;
9603                 }
9604                 else if (flagsp == &negflags) {
9605                     goto neg_modifier;
9606                 }
9607                 else if (has_charset_modifier) {
9608                     goto excess_modifier;
9609                 }
9610
9611                 /* The dual charset means unicode semantics if the
9612                  * pattern (or target, not known until runtime) are
9613                  * utf8, or something in the pattern indicates unicode
9614                  * semantics */
9615                 cs = (RExC_utf8 || RExC_uni_semantics)
9616                      ? REGEX_UNICODE_CHARSET
9617                      : REGEX_DEPENDS_CHARSET;
9618                 has_charset_modifier = DEPENDS_PAT_MOD;
9619                 break;
9620             excess_modifier:
9621                 RExC_parse++;
9622                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9623                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9624                 }
9625                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9626                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9627                                         *(RExC_parse - 1));
9628                 }
9629                 else {
9630                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9631                 }
9632                 NOT_REACHED; /*NOTREACHED*/
9633             neg_modifier:
9634                 RExC_parse++;
9635                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9636                                     *(RExC_parse - 1));
9637                 NOT_REACHED; /*NOTREACHED*/
9638             case ONCE_PAT_MOD: /* 'o' */
9639             case GLOBAL_PAT_MOD: /* 'g' */
9640                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9641                     const I32 wflagbit = *RExC_parse == 'o'
9642                                          ? WASTED_O
9643                                          : WASTED_G;
9644                     if (! (wastedflags & wflagbit) ) {
9645                         wastedflags |= wflagbit;
9646                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9647                         vWARN5(
9648                             RExC_parse + 1,
9649                             "Useless (%s%c) - %suse /%c modifier",
9650                             flagsp == &negflags ? "?-" : "?",
9651                             *RExC_parse,
9652                             flagsp == &negflags ? "don't " : "",
9653                             *RExC_parse
9654                         );
9655                     }
9656                 }
9657                 break;
9658
9659             case CONTINUE_PAT_MOD: /* 'c' */
9660                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9661                     if (! (wastedflags & WASTED_C) ) {
9662                         wastedflags |= WASTED_GC;
9663                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9664                         vWARN3(
9665                             RExC_parse + 1,
9666                             "Useless (%sc) - %suse /gc modifier",
9667                             flagsp == &negflags ? "?-" : "?",
9668                             flagsp == &negflags ? "don't " : ""
9669                         );
9670                     }
9671                 }
9672                 break;
9673             case KEEPCOPY_PAT_MOD: /* 'p' */
9674                 if (flagsp == &negflags) {
9675                     if (PASS2)
9676                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9677                 } else {
9678                     *flagsp |= RXf_PMf_KEEPCOPY;
9679                 }
9680                 break;
9681             case '-':
9682                 /* A flag is a default iff it is following a minus, so
9683                  * if there is a minus, it means will be trying to
9684                  * re-specify a default which is an error */
9685                 if (has_use_defaults || flagsp == &negflags) {
9686                     goto fail_modifiers;
9687                 }
9688                 flagsp = &negflags;
9689                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9690                 break;
9691             case ':':
9692             case ')':
9693                 RExC_flags |= posflags;
9694                 RExC_flags &= ~negflags;
9695                 set_regex_charset(&RExC_flags, cs);
9696                 if (RExC_flags & RXf_PMf_FOLD) {
9697                     RExC_contains_i = 1;
9698                 }
9699                 if (PASS2) {
9700                     STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9701                 }
9702                 return;
9703                 /*NOTREACHED*/
9704             default:
9705             fail_modifiers:
9706                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9707                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9708                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9709                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9710                 NOT_REACHED; /*NOTREACHED*/
9711         }
9712
9713         ++RExC_parse;
9714     }
9715
9716     if (PASS2) {
9717         STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9718     }
9719 }
9720
9721 /*
9722  - reg - regular expression, i.e. main body or parenthesized thing
9723  *
9724  * Caller must absorb opening parenthesis.
9725  *
9726  * Combining parenthesis handling with the base level of regular expression
9727  * is a trifle forced, but the need to tie the tails of the branches to what
9728  * follows makes it hard to avoid.
9729  */
9730 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9731 #ifdef DEBUGGING
9732 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9733 #else
9734 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9735 #endif
9736
9737 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9738    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9739    needs to be restarted.
9740    Otherwise would only return NULL if regbranch() returns NULL, which
9741    cannot happen.  */
9742 STATIC regnode *
9743 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9744     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9745      * 2 is like 1, but indicates that nextchar() has been called to advance
9746      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9747      * this flag alerts us to the need to check for that */
9748 {
9749     regnode *ret;               /* Will be the head of the group. */
9750     regnode *br;
9751     regnode *lastbr;
9752     regnode *ender = NULL;
9753     I32 parno = 0;
9754     I32 flags;
9755     U32 oregflags = RExC_flags;
9756     bool have_branch = 0;
9757     bool is_open = 0;
9758     I32 freeze_paren = 0;
9759     I32 after_freeze = 0;
9760     I32 num; /* numeric backreferences */
9761
9762     char * parse_start = RExC_parse; /* MJD */
9763     char * const oregcomp_parse = RExC_parse;
9764
9765     GET_RE_DEBUG_FLAGS_DECL;
9766
9767     PERL_ARGS_ASSERT_REG;
9768     DEBUG_PARSE("reg ");
9769
9770     *flagp = 0;                         /* Tentatively. */
9771
9772
9773     /* Make an OPEN node, if parenthesized. */
9774     if (paren) {
9775
9776         /* Under /x, space and comments can be gobbled up between the '(' and
9777          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9778          * intervening space, as the sequence is a token, and a token should be
9779          * indivisible */
9780         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9781
9782         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9783             char *start_verb = RExC_parse;
9784             STRLEN verb_len = 0;
9785             char *start_arg = NULL;
9786             unsigned char op = 0;
9787             int argok = 1;
9788             int internal_argval = 0; /* internal_argval is only useful if
9789                                         !argok */
9790
9791             if (has_intervening_patws) {
9792                 RExC_parse++;
9793                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9794             }
9795             while ( *RExC_parse && *RExC_parse != ')' ) {
9796                 if ( *RExC_parse == ':' ) {
9797                     start_arg = RExC_parse + 1;
9798                     break;
9799                 }
9800                 RExC_parse++;
9801             }
9802             ++start_verb;
9803             verb_len = RExC_parse - start_verb;
9804             if ( start_arg ) {
9805                 RExC_parse++;
9806                 while ( *RExC_parse && *RExC_parse != ')' )
9807                     RExC_parse++;
9808                 if ( *RExC_parse != ')' )
9809                     vFAIL("Unterminated verb pattern argument");
9810                 if ( RExC_parse == start_arg )
9811                     start_arg = NULL;
9812             } else {
9813                 if ( *RExC_parse != ')' )
9814                     vFAIL("Unterminated verb pattern");
9815             }
9816
9817             switch ( *start_verb ) {
9818             case 'A':  /* (*ACCEPT) */
9819                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9820                     op = ACCEPT;
9821                     internal_argval = RExC_nestroot;
9822                 }
9823                 break;
9824             case 'C':  /* (*COMMIT) */
9825                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9826                     op = COMMIT;
9827                 break;
9828             case 'F':  /* (*FAIL) */
9829                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9830                     op = OPFAIL;
9831                     argok = 0;
9832                 }
9833                 break;
9834             case ':':  /* (*:NAME) */
9835             case 'M':  /* (*MARK:NAME) */
9836                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9837                     op = MARKPOINT;
9838                     argok = -1;
9839                 }
9840                 break;
9841             case 'P':  /* (*PRUNE) */
9842                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9843                     op = PRUNE;
9844                 break;
9845             case 'S':   /* (*SKIP) */
9846                 if ( memEQs(start_verb,verb_len,"SKIP") )
9847                     op = SKIP;
9848                 break;
9849             case 'T':  /* (*THEN) */
9850                 /* [19:06] <TimToady> :: is then */
9851                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9852                     op = CUTGROUP;
9853                     RExC_seen |= REG_CUTGROUP_SEEN;
9854                 }
9855                 break;
9856             }
9857             if ( ! op ) {
9858                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9859                 vFAIL2utf8f(
9860                     "Unknown verb pattern '%"UTF8f"'",
9861                     UTF8fARG(UTF, verb_len, start_verb));
9862             }
9863             if ( argok ) {
9864                 if ( start_arg && internal_argval ) {
9865                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9866                         verb_len, start_verb);
9867                 } else if ( argok < 0 && !start_arg ) {
9868                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9869                         verb_len, start_verb);
9870                 } else {
9871                     ret = reganode(pRExC_state, op, internal_argval);
9872                     if ( ! internal_argval && ! SIZE_ONLY ) {
9873                         if (start_arg) {
9874                             SV *sv = newSVpvn( start_arg,
9875                                                RExC_parse - start_arg);
9876                             ARG(ret) = add_data( pRExC_state,
9877                                                  STR_WITH_LEN("S"));
9878                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9879                             ret->flags = 0;
9880                         } else {
9881                             ret->flags = 1;
9882                         }
9883                     }
9884                 }
9885                 if (!internal_argval)
9886                     RExC_seen |= REG_VERBARG_SEEN;
9887             } else if ( start_arg ) {
9888                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9889                         verb_len, start_verb);
9890             } else {
9891                 ret = reg_node(pRExC_state, op);
9892             }
9893             nextchar(pRExC_state);
9894             return ret;
9895         }
9896         else if (*RExC_parse == '?') { /* (?...) */
9897             bool is_logical = 0;
9898             const char * const seqstart = RExC_parse;
9899             const char * endptr;
9900             if (has_intervening_patws) {
9901                 RExC_parse++;
9902                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9903             }
9904
9905             RExC_parse++;
9906             paren = *RExC_parse++;
9907             ret = NULL;                 /* For look-ahead/behind. */
9908             switch (paren) {
9909
9910             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9911                 paren = *RExC_parse++;
9912                 if ( paren == '<')         /* (?P<...>) named capture */
9913                     goto named_capture;
9914                 else if (paren == '>') {   /* (?P>name) named recursion */
9915                     goto named_recursion;
9916                 }
9917                 else if (paren == '=') {   /* (?P=...)  named backref */
9918                     /* this pretty much dupes the code for \k<NAME> in
9919                      * regatom(), if you change this make sure you change that
9920                      * */
9921                     char* name_start = RExC_parse;
9922                     U32 num = 0;
9923                     SV *sv_dat = reg_scan_name(pRExC_state,
9924                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9925                     if (RExC_parse == name_start || *RExC_parse != ')')
9926                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9927                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9928
9929                     if (!SIZE_ONLY) {
9930                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9931                         RExC_rxi->data->data[num]=(void*)sv_dat;
9932                         SvREFCNT_inc_simple_void(sv_dat);
9933                     }
9934                     RExC_sawback = 1;
9935                     ret = reganode(pRExC_state,
9936                                    ((! FOLD)
9937                                      ? NREF
9938                                      : (ASCII_FOLD_RESTRICTED)
9939                                        ? NREFFA
9940                                        : (AT_LEAST_UNI_SEMANTICS)
9941                                          ? NREFFU
9942                                          : (LOC)
9943                                            ? NREFFL
9944                                            : NREFF),
9945                                     num);
9946                     *flagp |= HASWIDTH;
9947
9948                     Set_Node_Offset(ret, parse_start+1);
9949                     Set_Node_Cur_Length(ret, parse_start);
9950
9951                     nextchar(pRExC_state);
9952                     return ret;
9953                 }
9954                 RExC_parse++;
9955                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9956                 vFAIL3("Sequence (%.*s...) not recognized",
9957                                 RExC_parse-seqstart, seqstart);
9958                 NOT_REACHED; /*NOTREACHED*/
9959             case '<':           /* (?<...) */
9960                 if (*RExC_parse == '!')
9961                     paren = ',';
9962                 else if (*RExC_parse != '=')
9963               named_capture:
9964                 {               /* (?<...>) */
9965                     char *name_start;
9966                     SV *svname;
9967                     paren= '>';
9968             case '\'':          /* (?'...') */
9969                     name_start= RExC_parse;
9970                     svname = reg_scan_name(pRExC_state,
9971                         SIZE_ONLY    /* reverse test from the others */
9972                         ? REG_RSN_RETURN_NAME
9973                         : REG_RSN_RETURN_NULL);
9974                     if (RExC_parse == name_start || *RExC_parse != paren)
9975                         vFAIL2("Sequence (?%c... not terminated",
9976                             paren=='>' ? '<' : paren);
9977                     if (SIZE_ONLY) {
9978                         HE *he_str;
9979                         SV *sv_dat = NULL;
9980                         if (!svname) /* shouldn't happen */
9981                             Perl_croak(aTHX_
9982                                 "panic: reg_scan_name returned NULL");
9983                         if (!RExC_paren_names) {
9984                             RExC_paren_names= newHV();
9985                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9986 #ifdef DEBUGGING
9987                             RExC_paren_name_list= newAV();
9988                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9989 #endif
9990                         }
9991                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9992                         if ( he_str )
9993                             sv_dat = HeVAL(he_str);
9994                         if ( ! sv_dat ) {
9995                             /* croak baby croak */
9996                             Perl_croak(aTHX_
9997                                 "panic: paren_name hash element allocation failed");
9998                         } else if ( SvPOK(sv_dat) ) {
9999                             /* (?|...) can mean we have dupes so scan to check
10000                                its already been stored. Maybe a flag indicating
10001                                we are inside such a construct would be useful,
10002                                but the arrays are likely to be quite small, so
10003                                for now we punt -- dmq */
10004                             IV count = SvIV(sv_dat);
10005                             I32 *pv = (I32*)SvPVX(sv_dat);
10006                             IV i;
10007                             for ( i = 0 ; i < count ; i++ ) {
10008                                 if ( pv[i] == RExC_npar ) {
10009                                     count = 0;
10010                                     break;
10011                                 }
10012                             }
10013                             if ( count ) {
10014                                 pv = (I32*)SvGROW(sv_dat,
10015                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10016                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10017                                 pv[count] = RExC_npar;
10018                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10019                             }
10020                         } else {
10021                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10022                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10023                                                                 sizeof(I32));
10024                             SvIOK_on(sv_dat);
10025                             SvIV_set(sv_dat, 1);
10026                         }
10027 #ifdef DEBUGGING
10028                         /* Yes this does cause a memory leak in debugging Perls
10029                          * */
10030                         if (!av_store(RExC_paren_name_list,
10031                                       RExC_npar, SvREFCNT_inc(svname)))
10032                             SvREFCNT_dec_NN(svname);
10033 #endif
10034
10035                         /*sv_dump(sv_dat);*/
10036                     }
10037                     nextchar(pRExC_state);
10038                     paren = 1;
10039                     goto capturing_parens;
10040                 }
10041                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10042                 RExC_in_lookbehind++;
10043                 RExC_parse++;
10044                 /* FALLTHROUGH */
10045             case '=':           /* (?=...) */
10046                 RExC_seen_zerolen++;
10047                 break;
10048             case '!':           /* (?!...) */
10049                 RExC_seen_zerolen++;
10050                 if (*RExC_parse == ')') {
10051                     ret=reg_node(pRExC_state, OPFAIL);
10052                     nextchar(pRExC_state);
10053                     return ret;
10054                 }
10055                 break;
10056             case '|':           /* (?|...) */
10057                 /* branch reset, behave like a (?:...) except that
10058                    buffers in alternations share the same numbers */
10059                 paren = ':';
10060                 after_freeze = freeze_paren = RExC_npar;
10061                 break;
10062             case ':':           /* (?:...) */
10063             case '>':           /* (?>...) */
10064                 break;
10065             case '$':           /* (?$...) */
10066             case '@':           /* (?@...) */
10067                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10068                 break;
10069             case '0' :           /* (?0) */
10070             case 'R' :           /* (?R) */
10071                 if (*RExC_parse != ')')
10072                     FAIL("Sequence (?R) not terminated");
10073                 ret = reg_node(pRExC_state, GOSTART);
10074                     RExC_seen |= REG_GOSTART_SEEN;
10075                 *flagp |= POSTPONED;
10076                 nextchar(pRExC_state);
10077                 return ret;
10078                 /*notreached*/
10079             /* named and numeric backreferences */
10080             case '&':            /* (?&NAME) */
10081                 parse_start = RExC_parse - 1;
10082               named_recursion:
10083                 {
10084                     SV *sv_dat = reg_scan_name(pRExC_state,
10085                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10086                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10087                 }
10088                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10089                     vFAIL("Sequence (?&... not terminated");
10090                 goto gen_recurse_regop;
10091                 /* NOT REACHED */
10092             case '+':
10093                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10094                     RExC_parse++;
10095                     vFAIL("Illegal pattern");
10096                 }
10097                 goto parse_recursion;
10098                 /* NOT REACHED*/
10099             case '-': /* (?-1) */
10100                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10101                     RExC_parse--; /* rewind to let it be handled later */
10102                     goto parse_flags;
10103                 }
10104                 /* FALLTHROUGH */
10105             case '1': case '2': case '3': case '4': /* (?1) */
10106             case '5': case '6': case '7': case '8': case '9':
10107                 RExC_parse--;
10108               parse_recursion:
10109                 {
10110                     bool is_neg = FALSE;
10111                     parse_start = RExC_parse - 1; /* MJD */
10112                     if (*RExC_parse == '-') {
10113                         RExC_parse++;
10114                         is_neg = TRUE;
10115                     }
10116                     num = grok_atou(RExC_parse, &endptr);
10117                     if (endptr)
10118                         RExC_parse = (char*)endptr;
10119                     if (is_neg) {
10120                         /* Some limit for num? */
10121                         num = -num;
10122                     }
10123                 }
10124                 if (*RExC_parse!=')')
10125                     vFAIL("Expecting close bracket");
10126
10127               gen_recurse_regop:
10128                 if ( paren == '-' ) {
10129                     /*
10130                     Diagram of capture buffer numbering.
10131                     Top line is the normal capture buffer numbers
10132                     Bottom line is the negative indexing as from
10133                     the X (the (?-2))
10134
10135                     +   1 2    3 4 5 X          6 7
10136                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10137                     -   5 4    3 2 1 X          x x
10138
10139                     */
10140                     num = RExC_npar + num;
10141                     if (num < 1)  {
10142                         RExC_parse++;
10143                         vFAIL("Reference to nonexistent group");
10144                     }
10145                 } else if ( paren == '+' ) {
10146                     num = RExC_npar + num - 1;
10147                 }
10148
10149                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10150                 if (!SIZE_ONLY) {
10151                     if (num > (I32)RExC_rx->nparens) {
10152                         RExC_parse++;
10153                         vFAIL("Reference to nonexistent group");
10154                     }
10155                     RExC_recurse_count++;
10156                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10157                         "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10158                               22, "|    |", (int)(depth * 2 + 1), "",
10159                               (UV)ARG(ret), (IV)ARG2L(ret)));
10160                 }
10161                 RExC_seen |= REG_RECURSE_SEEN;
10162                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10163                 Set_Node_Offset(ret, parse_start); /* MJD */
10164
10165                 *flagp |= POSTPONED;
10166                 nextchar(pRExC_state);
10167                 return ret;
10168
10169             /* NOT REACHED */
10170
10171             case '?':           /* (??...) */
10172                 is_logical = 1;
10173                 if (*RExC_parse != '{') {
10174                     RExC_parse++;
10175                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10176                     vFAIL2utf8f(
10177                         "Sequence (%"UTF8f"...) not recognized",
10178                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10179                     NOT_REACHED; /*NOTREACHED*/
10180                 }
10181                 *flagp |= POSTPONED;
10182                 paren = *RExC_parse++;
10183                 /* FALLTHROUGH */
10184             case '{':           /* (?{...}) */
10185             {
10186                 U32 n = 0;
10187                 struct reg_code_block *cb;
10188
10189                 RExC_seen_zerolen++;
10190
10191                 if (   !pRExC_state->num_code_blocks
10192                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
10193                     || pRExC_state->code_blocks[pRExC_state->code_index].start
10194                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10195                             - RExC_start)
10196                 ) {
10197                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
10198                         FAIL("panic: Sequence (?{...}): no code block found\n");
10199                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
10200                 }
10201                 /* this is a pre-compiled code block (?{...}) */
10202                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10203                 RExC_parse = RExC_start + cb->end;
10204                 if (!SIZE_ONLY) {
10205                     OP *o = cb->block;
10206                     if (cb->src_regex) {
10207                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10208                         RExC_rxi->data->data[n] =
10209                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
10210                         RExC_rxi->data->data[n+1] = (void*)o;
10211                     }
10212                     else {
10213                         n = add_data(pRExC_state,
10214                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10215                         RExC_rxi->data->data[n] = (void*)o;
10216                     }
10217                 }
10218                 pRExC_state->code_index++;
10219                 nextchar(pRExC_state);
10220
10221                 if (is_logical) {
10222                     regnode *eval;
10223                     ret = reg_node(pRExC_state, LOGICAL);
10224
10225                     eval = reg2Lanode(pRExC_state, EVAL,
10226                                        n,
10227
10228                                        /* for later propagation into (??{})
10229                                         * return value */
10230                                        RExC_flags & RXf_PMf_COMPILETIME
10231                                       );
10232                     if (!SIZE_ONLY) {
10233                         ret->flags = 2;
10234                     }
10235                     REGTAIL(pRExC_state, ret, eval);
10236                     /* deal with the length of this later - MJD */
10237                     return ret;
10238                 }
10239                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10240                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10241                 Set_Node_Offset(ret, parse_start);
10242                 return ret;
10243             }
10244             case '(':           /* (?(?{...})...) and (?(?=...)...) */
10245             {
10246                 int is_define= 0;
10247                 const int DEFINE_len = sizeof("DEFINE") - 1;
10248                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
10249                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10250                         || RExC_parse[1] == '<'
10251                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
10252                         I32 flag;
10253                         regnode *tail;
10254
10255                         ret = reg_node(pRExC_state, LOGICAL);
10256                         if (!SIZE_ONLY)
10257                             ret->flags = 1;
10258
10259                         tail = reg(pRExC_state, 1, &flag, depth+1);
10260                         if (flag & RESTART_UTF8) {
10261                             *flagp = RESTART_UTF8;
10262                             return NULL;
10263                         }
10264                         REGTAIL(pRExC_state, ret, tail);
10265                         goto insert_if;
10266                     }
10267                     /* Fall through to ‘Unknown switch condition’ at the
10268                        end of the if/else chain. */
10269                 }
10270                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10271                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10272                 {
10273                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
10274                     char *name_start= RExC_parse++;
10275                     U32 num = 0;
10276                     SV *sv_dat=reg_scan_name(pRExC_state,
10277                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10278                     if (RExC_parse == name_start || *RExC_parse != ch)
10279                         vFAIL2("Sequence (?(%c... not terminated",
10280                             (ch == '>' ? '<' : ch));
10281                     RExC_parse++;
10282                     if (!SIZE_ONLY) {
10283                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10284                         RExC_rxi->data->data[num]=(void*)sv_dat;
10285                         SvREFCNT_inc_simple_void(sv_dat);
10286                     }
10287                     ret = reganode(pRExC_state,NGROUPP,num);
10288                     goto insert_if_check_paren;
10289                 }
10290                 else if (strnEQ(RExC_parse, "DEFINE",
10291                                        MIN(DEFINE_len, RExC_end - RExC_parse)))
10292                 {
10293                     ret = reganode(pRExC_state,DEFINEP,0);
10294                     RExC_parse += DEFINE_len;
10295                     is_define = 1;
10296                     goto insert_if_check_paren;
10297                 }
10298                 else if (RExC_parse[0] == 'R') {
10299                     RExC_parse++;
10300                     parno = 0;
10301                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10302                         parno = grok_atou(RExC_parse, &endptr);
10303                         if (endptr)
10304                             RExC_parse = (char*)endptr;
10305                     } else if (RExC_parse[0] == '&') {
10306                         SV *sv_dat;
10307                         RExC_parse++;
10308                         sv_dat = reg_scan_name(pRExC_state,
10309                             SIZE_ONLY
10310                             ? REG_RSN_RETURN_NULL
10311                             : REG_RSN_RETURN_DATA);
10312                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10313                     }
10314                     ret = reganode(pRExC_state,INSUBP,parno);
10315                     goto insert_if_check_paren;
10316                 }
10317                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10318                     /* (?(1)...) */
10319                     char c;
10320                     char *tmp;
10321                     parno = grok_atou(RExC_parse, &endptr);
10322                     if (endptr)
10323                         RExC_parse = (char*)endptr;
10324                     ret = reganode(pRExC_state, GROUPP, parno);
10325
10326                  insert_if_check_paren:
10327                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10328                         /* nextchar also skips comments, so undo its work
10329                          * and skip over the the next character.
10330                          */
10331                         RExC_parse = tmp;
10332                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10333                         vFAIL("Switch condition not recognized");
10334                     }
10335                   insert_if:
10336                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10337                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10338                     if (br == NULL) {
10339                         if (flags & RESTART_UTF8) {
10340                             *flagp = RESTART_UTF8;
10341                             return NULL;
10342                         }
10343                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10344                               (UV) flags);
10345                     } else
10346                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10347                                                           LONGJMP, 0));
10348                     c = *nextchar(pRExC_state);
10349                     if (flags&HASWIDTH)
10350                         *flagp |= HASWIDTH;
10351                     if (c == '|') {
10352                         if (is_define)
10353                             vFAIL("(?(DEFINE)....) does not allow branches");
10354
10355                         /* Fake one for optimizer.  */
10356                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10357
10358                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10359                             if (flags & RESTART_UTF8) {
10360                                 *flagp = RESTART_UTF8;
10361                                 return NULL;
10362                             }
10363                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10364                                   (UV) flags);
10365                         }
10366                         REGTAIL(pRExC_state, ret, lastbr);
10367                         if (flags&HASWIDTH)
10368                             *flagp |= HASWIDTH;
10369                         c = *nextchar(pRExC_state);
10370                     }
10371                     else
10372                         lastbr = NULL;
10373                     if (c != ')') {
10374                         if (RExC_parse>RExC_end)
10375                             vFAIL("Switch (?(condition)... not terminated");
10376                         else
10377                             vFAIL("Switch (?(condition)... contains too many branches");
10378                     }
10379                     ender = reg_node(pRExC_state, TAIL);
10380                     REGTAIL(pRExC_state, br, ender);
10381                     if (lastbr) {
10382                         REGTAIL(pRExC_state, lastbr, ender);
10383                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10384                     }
10385                     else
10386                         REGTAIL(pRExC_state, ret, ender);
10387                     RExC_size++; /* XXX WHY do we need this?!!
10388                                     For large programs it seems to be required
10389                                     but I can't figure out why. -- dmq*/
10390                     return ret;
10391                 }
10392                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10393                 vFAIL("Unknown switch condition (?(...))");
10394             }
10395             case '[':           /* (?[ ... ]) */
10396                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10397                                          oregcomp_parse);
10398             case 0:
10399                 RExC_parse--; /* for vFAIL to print correctly */
10400                 vFAIL("Sequence (? incomplete");
10401                 break;
10402             default: /* e.g., (?i) */
10403                 --RExC_parse;
10404               parse_flags:
10405                 parse_lparen_question_flags(pRExC_state);
10406                 if (UCHARAT(RExC_parse) != ':') {
10407                     nextchar(pRExC_state);
10408                     *flagp = TRYAGAIN;
10409                     return NULL;
10410                 }
10411                 paren = ':';
10412                 nextchar(pRExC_state);
10413                 ret = NULL;
10414                 goto parse_rest;
10415             } /* end switch */
10416         }
10417         else {                  /* (...) */
10418           capturing_parens:
10419             parno = RExC_npar;
10420             RExC_npar++;
10421
10422             ret = reganode(pRExC_state, OPEN, parno);
10423             if (!SIZE_ONLY ){
10424                 if (!RExC_nestroot)
10425                     RExC_nestroot = parno;
10426                 if (RExC_seen & REG_RECURSE_SEEN
10427                     && !RExC_open_parens[parno-1])
10428                 {
10429                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10430                         "%*s%*s Setting open paren #%"IVdf" to %d\n",
10431                         22, "|    |", (int)(depth * 2 + 1), "",
10432                         (IV)parno, REG_NODE_NUM(ret)));
10433                     RExC_open_parens[parno-1]= ret;
10434                 }
10435             }
10436             Set_Node_Length(ret, 1); /* MJD */
10437             Set_Node_Offset(ret, RExC_parse); /* MJD */
10438             is_open = 1;
10439         }
10440     }
10441     else                        /* ! paren */
10442         ret = NULL;
10443
10444    parse_rest:
10445     /* Pick up the branches, linking them together. */
10446     parse_start = RExC_parse;   /* MJD */
10447     br = regbranch(pRExC_state, &flags, 1,depth+1);
10448
10449     /*     branch_len = (paren != 0); */
10450
10451     if (br == NULL) {
10452         if (flags & RESTART_UTF8) {
10453             *flagp = RESTART_UTF8;
10454             return NULL;
10455         }
10456         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10457     }
10458     if (*RExC_parse == '|') {
10459         if (!SIZE_ONLY && RExC_extralen) {
10460             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10461         }
10462         else {                  /* MJD */
10463             reginsert(pRExC_state, BRANCH, br, depth+1);
10464             Set_Node_Length(br, paren != 0);
10465             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10466         }
10467         have_branch = 1;
10468         if (SIZE_ONLY)
10469             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10470     }
10471     else if (paren == ':') {
10472         *flagp |= flags&SIMPLE;
10473     }
10474     if (is_open) {                              /* Starts with OPEN. */
10475         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10476     }
10477     else if (paren != '?')              /* Not Conditional */
10478         ret = br;
10479     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10480     lastbr = br;
10481     while (*RExC_parse == '|') {
10482         if (!SIZE_ONLY && RExC_extralen) {
10483             ender = reganode(pRExC_state, LONGJMP,0);
10484
10485             /* Append to the previous. */
10486             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10487         }
10488         if (SIZE_ONLY)
10489             RExC_extralen += 2;         /* Account for LONGJMP. */
10490         nextchar(pRExC_state);
10491         if (freeze_paren) {
10492             if (RExC_npar > after_freeze)
10493                 after_freeze = RExC_npar;
10494             RExC_npar = freeze_paren;
10495         }
10496         br = regbranch(pRExC_state, &flags, 0, depth+1);
10497
10498         if (br == NULL) {
10499             if (flags & RESTART_UTF8) {
10500                 *flagp = RESTART_UTF8;
10501                 return NULL;
10502             }
10503             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10504         }
10505         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10506         lastbr = br;
10507         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10508     }
10509
10510     if (have_branch || paren != ':') {
10511         /* Make a closing node, and hook it on the end. */
10512         switch (paren) {
10513         case ':':
10514             ender = reg_node(pRExC_state, TAIL);
10515             break;
10516         case 1: case 2:
10517             ender = reganode(pRExC_state, CLOSE, parno);
10518             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10519                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10520                         "%*s%*s Setting close paren #%"IVdf" to %d\n",
10521                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10522                 RExC_close_parens[parno-1]= ender;
10523                 if (RExC_nestroot == parno)
10524                     RExC_nestroot = 0;
10525             }
10526             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10527             Set_Node_Length(ender,1); /* MJD */
10528             break;
10529         case '<':
10530         case ',':
10531         case '=':
10532         case '!':
10533             *flagp &= ~HASWIDTH;
10534             /* FALLTHROUGH */
10535         case '>':
10536             ender = reg_node(pRExC_state, SUCCEED);
10537             break;
10538         case 0:
10539             ender = reg_node(pRExC_state, END);
10540             if (!SIZE_ONLY) {
10541                 assert(!RExC_opend); /* there can only be one! */
10542                 RExC_opend = ender;
10543             }
10544             break;
10545         }
10546         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10547             DEBUG_PARSE_MSG("lsbr");
10548             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10549             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10550             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10551                           SvPV_nolen_const(RExC_mysv1),
10552                           (IV)REG_NODE_NUM(lastbr),
10553                           SvPV_nolen_const(RExC_mysv2),
10554                           (IV)REG_NODE_NUM(ender),
10555                           (IV)(ender - lastbr)
10556             );
10557         });
10558         REGTAIL(pRExC_state, lastbr, ender);
10559
10560         if (have_branch && !SIZE_ONLY) {
10561             char is_nothing= 1;
10562             if (depth==1)
10563                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10564
10565             /* Hook the tails of the branches to the closing node. */
10566             for (br = ret; br; br = regnext(br)) {
10567                 const U8 op = PL_regkind[OP(br)];
10568                 if (op == BRANCH) {
10569                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10570                     if ( OP(NEXTOPER(br)) != NOTHING
10571                          || regnext(NEXTOPER(br)) != ender)
10572                         is_nothing= 0;
10573                 }
10574                 else if (op == BRANCHJ) {
10575                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10576                     /* for now we always disable this optimisation * /
10577                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10578                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10579                     */
10580                         is_nothing= 0;
10581                 }
10582             }
10583             if (is_nothing) {
10584                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10585                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10586                     DEBUG_PARSE_MSG("NADA");
10587                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10588                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10589                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10590                                   SvPV_nolen_const(RExC_mysv1),
10591                                   (IV)REG_NODE_NUM(ret),
10592                                   SvPV_nolen_const(RExC_mysv2),
10593                                   (IV)REG_NODE_NUM(ender),
10594                                   (IV)(ender - ret)
10595                     );
10596                 });
10597                 OP(br)= NOTHING;
10598                 if (OP(ender) == TAIL) {
10599                     NEXT_OFF(br)= 0;
10600                     RExC_emit= br + 1;
10601                 } else {
10602                     regnode *opt;
10603                     for ( opt= br + 1; opt < ender ; opt++ )
10604                         OP(opt)= OPTIMIZED;
10605                     NEXT_OFF(br)= ender - br;
10606                 }
10607             }
10608         }
10609     }
10610
10611     {
10612         const char *p;
10613         static const char parens[] = "=!<,>";
10614
10615         if (paren && (p = strchr(parens, paren))) {
10616             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10617             int flag = (p - parens) > 1;
10618
10619             if (paren == '>')
10620                 node = SUSPEND, flag = 0;
10621             reginsert(pRExC_state, node,ret, depth+1);
10622             Set_Node_Cur_Length(ret, parse_start);
10623             Set_Node_Offset(ret, parse_start + 1);
10624             ret->flags = flag;
10625             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10626         }
10627     }
10628
10629     /* Check for proper termination. */
10630     if (paren) {
10631         /* restore original flags, but keep (?p) */
10632         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10633         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10634             RExC_parse = oregcomp_parse;
10635             vFAIL("Unmatched (");
10636         }
10637     }
10638     else if (!paren && RExC_parse < RExC_end) {
10639         if (*RExC_parse == ')') {
10640             RExC_parse++;
10641             vFAIL("Unmatched )");
10642         }
10643         else
10644             FAIL("Junk on end of regexp");      /* "Can't happen". */
10645         NOT_REACHED; /* NOTREACHED */
10646     }
10647
10648     if (RExC_in_lookbehind) {
10649         RExC_in_lookbehind--;
10650     }
10651     if (after_freeze > RExC_npar)
10652         RExC_npar = after_freeze;
10653     return(ret);
10654 }
10655
10656 /*
10657  - regbranch - one alternative of an | operator
10658  *
10659  * Implements the concatenation operator.
10660  *
10661  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10662  * restarted.
10663  */
10664 STATIC regnode *
10665 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10666 {
10667     regnode *ret;
10668     regnode *chain = NULL;
10669     regnode *latest;
10670     I32 flags = 0, c = 0;
10671     GET_RE_DEBUG_FLAGS_DECL;
10672
10673     PERL_ARGS_ASSERT_REGBRANCH;
10674
10675     DEBUG_PARSE("brnc");
10676
10677     if (first)
10678         ret = NULL;
10679     else {
10680         if (!SIZE_ONLY && RExC_extralen)
10681             ret = reganode(pRExC_state, BRANCHJ,0);
10682         else {
10683             ret = reg_node(pRExC_state, BRANCH);
10684             Set_Node_Length(ret, 1);
10685         }
10686     }
10687
10688     if (!first && SIZE_ONLY)
10689         RExC_extralen += 1;                     /* BRANCHJ */
10690
10691     *flagp = WORST;                     /* Tentatively. */
10692
10693     RExC_parse--;
10694     nextchar(pRExC_state);
10695     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10696         flags &= ~TRYAGAIN;
10697         latest = regpiece(pRExC_state, &flags,depth+1);
10698         if (latest == NULL) {
10699             if (flags & TRYAGAIN)
10700                 continue;
10701             if (flags & RESTART_UTF8) {
10702                 *flagp = RESTART_UTF8;
10703                 return NULL;
10704             }
10705             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10706         }
10707         else if (ret == NULL)
10708             ret = latest;
10709         *flagp |= flags&(HASWIDTH|POSTPONED);
10710         if (chain == NULL)      /* First piece. */
10711             *flagp |= flags&SPSTART;
10712         else {
10713             RExC_naughty++;
10714             REGTAIL(pRExC_state, chain, latest);
10715         }
10716         chain = latest;
10717         c++;
10718     }
10719     if (chain == NULL) {        /* Loop ran zero times. */
10720         chain = reg_node(pRExC_state, NOTHING);
10721         if (ret == NULL)
10722             ret = chain;
10723     }
10724     if (c == 1) {
10725         *flagp |= flags&SIMPLE;
10726     }
10727
10728     return ret;
10729 }
10730
10731 /*
10732  - regpiece - something followed by possible [*+?]
10733  *
10734  * Note that the branching code sequences used for ? and the general cases
10735  * of * and + are somewhat optimized:  they use the same NOTHING node as
10736  * both the endmarker for their branch list and the body of the last branch.
10737  * It might seem that this node could be dispensed with entirely, but the
10738  * endmarker role is not redundant.
10739  *
10740  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10741  * TRYAGAIN.
10742  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10743  * restarted.
10744  */
10745 STATIC regnode *
10746 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10747 {
10748     regnode *ret;
10749     char op;
10750     char *next;
10751     I32 flags;
10752     const char * const origparse = RExC_parse;
10753     I32 min;
10754     I32 max = REG_INFTY;
10755 #ifdef RE_TRACK_PATTERN_OFFSETS
10756     char *parse_start;
10757 #endif
10758     const char *maxpos = NULL;
10759
10760     /* Save the original in case we change the emitted regop to a FAIL. */
10761     regnode * const orig_emit = RExC_emit;
10762
10763     GET_RE_DEBUG_FLAGS_DECL;
10764
10765     PERL_ARGS_ASSERT_REGPIECE;
10766
10767     DEBUG_PARSE("piec");
10768
10769     ret = regatom(pRExC_state, &flags,depth+1);
10770     if (ret == NULL) {
10771         if (flags & (TRYAGAIN|RESTART_UTF8))
10772             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10773         else
10774             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10775         return(NULL);
10776     }
10777
10778     op = *RExC_parse;
10779
10780     if (op == '{' && regcurly(RExC_parse)) {
10781         maxpos = NULL;
10782 #ifdef RE_TRACK_PATTERN_OFFSETS
10783         parse_start = RExC_parse; /* MJD */
10784 #endif
10785         next = RExC_parse + 1;
10786         while (isDIGIT(*next) || *next == ',') {
10787             if (*next == ',') {
10788                 if (maxpos)
10789                     break;
10790                 else
10791                     maxpos = next;
10792             }
10793             next++;
10794         }
10795         if (*next == '}') {             /* got one */
10796             const char* endptr;
10797             if (!maxpos)
10798                 maxpos = next;
10799             RExC_parse++;
10800             min = grok_atou(RExC_parse, &endptr);
10801             if (*maxpos == ',')
10802                 maxpos++;
10803             else
10804                 maxpos = RExC_parse;
10805             max = grok_atou(maxpos, &endptr);
10806             if (!max && *maxpos != '0')
10807                 max = REG_INFTY;                /* meaning "infinity" */
10808             else if (max >= REG_INFTY)
10809                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10810             RExC_parse = next;
10811             nextchar(pRExC_state);
10812             if (max < min) {    /* If can't match, warn and optimize to fail
10813                                    unconditionally */
10814                 if (SIZE_ONLY) {
10815
10816                     /* We can't back off the size because we have to reserve
10817                      * enough space for all the things we are about to throw
10818                      * away, but we can shrink it by the ammount we are about
10819                      * to re-use here */
10820                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10821                 }
10822                 else {
10823                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10824                     RExC_emit = orig_emit;
10825                 }
10826                 ret = reg_node(pRExC_state, OPFAIL);
10827                 return ret;
10828             }
10829             else if (min == max
10830                      && RExC_parse < RExC_end
10831                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10832             {
10833                 if (PASS2) {
10834                     ckWARN2reg(RExC_parse + 1,
10835                                "Useless use of greediness modifier '%c'",
10836                                *RExC_parse);
10837                 }
10838                 /* Absorb the modifier, so later code doesn't see nor use
10839                     * it */
10840                 nextchar(pRExC_state);
10841             }
10842
10843         do_curly:
10844             if ((flags&SIMPLE)) {
10845                 RExC_naughty += 2 + RExC_naughty / 2;
10846                 reginsert(pRExC_state, CURLY, ret, depth+1);
10847                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10848                 Set_Node_Cur_Length(ret, parse_start);
10849             }
10850             else {
10851                 regnode * const w = reg_node(pRExC_state, WHILEM);
10852
10853                 w->flags = 0;
10854                 REGTAIL(pRExC_state, ret, w);
10855                 if (!SIZE_ONLY && RExC_extralen) {
10856                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10857                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10858                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10859                 }
10860                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10861                                 /* MJD hk */
10862                 Set_Node_Offset(ret, parse_start+1);
10863                 Set_Node_Length(ret,
10864                                 op == '{' ? (RExC_parse - parse_start) : 1);
10865
10866                 if (!SIZE_ONLY && RExC_extralen)
10867                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10868                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10869                 if (SIZE_ONLY)
10870                     RExC_whilem_seen++, RExC_extralen += 3;
10871                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10872             }
10873             ret->flags = 0;
10874
10875             if (min > 0)
10876                 *flagp = WORST;
10877             if (max > 0)
10878                 *flagp |= HASWIDTH;
10879             if (!SIZE_ONLY) {
10880                 ARG1_SET(ret, (U16)min);
10881                 ARG2_SET(ret, (U16)max);
10882             }
10883             if (max == REG_INFTY)
10884                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10885
10886             goto nest_check;
10887         }
10888     }
10889
10890     if (!ISMULT1(op)) {
10891         *flagp = flags;
10892         return(ret);
10893     }
10894
10895 #if 0                           /* Now runtime fix should be reliable. */
10896
10897     /* if this is reinstated, don't forget to put this back into perldiag:
10898
10899             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10900
10901            (F) The part of the regexp subject to either the * or + quantifier
10902            could match an empty string. The {#} shows in the regular
10903            expression about where the problem was discovered.
10904
10905     */
10906
10907     if (!(flags&HASWIDTH) && op != '?')
10908       vFAIL("Regexp *+ operand could be empty");
10909 #endif
10910
10911 #ifdef RE_TRACK_PATTERN_OFFSETS
10912     parse_start = RExC_parse;
10913 #endif
10914     nextchar(pRExC_state);
10915
10916     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10917
10918     if (op == '*' && (flags&SIMPLE)) {
10919         reginsert(pRExC_state, STAR, ret, depth+1);
10920         ret->flags = 0;
10921         RExC_naughty += 4;
10922         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10923     }
10924     else if (op == '*') {
10925         min = 0;
10926         goto do_curly;
10927     }
10928     else if (op == '+' && (flags&SIMPLE)) {
10929         reginsert(pRExC_state, PLUS, ret, depth+1);
10930         ret->flags = 0;
10931         RExC_naughty += 3;
10932         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10933     }
10934     else if (op == '+') {
10935         min = 1;
10936         goto do_curly;
10937     }
10938     else if (op == '?') {
10939         min = 0; max = 1;
10940         goto do_curly;
10941     }
10942   nest_check:
10943     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10944         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10945         ckWARN2reg(RExC_parse,
10946                    "%"UTF8f" matches null string many times",
10947                    UTF8fARG(UTF, (RExC_parse >= origparse
10948                                  ? RExC_parse - origparse
10949                                  : 0),
10950                    origparse));
10951         (void)ReREFCNT_inc(RExC_rx_sv);
10952     }
10953
10954     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10955         nextchar(pRExC_state);
10956         reginsert(pRExC_state, MINMOD, ret, depth+1);
10957         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10958     }
10959     else
10960     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10961         regnode *ender;
10962         nextchar(pRExC_state);
10963         ender = reg_node(pRExC_state, SUCCEED);
10964         REGTAIL(pRExC_state, ret, ender);
10965         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10966         ret->flags = 0;
10967         ender = reg_node(pRExC_state, TAIL);
10968         REGTAIL(pRExC_state, ret, ender);
10969     }
10970
10971     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10972         RExC_parse++;
10973         vFAIL("Nested quantifiers");
10974     }
10975
10976     return(ret);
10977 }
10978
10979 STATIC STRLEN
10980 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10981                       UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10982     )
10983 {
10984
10985  /* This is expected to be called by a parser routine that has recognized '\N'
10986    and needs to handle the rest. RExC_parse is expected to point at the first
10987    char following the N at the time of the call.  On successful return,
10988    RExC_parse has been updated to point to just after the sequence identified
10989    by this routine, <*flagp> has been updated, and the non-NULL input pointers
10990    have been set appropriately.
10991
10992    The typical case for this is \N{some character name}.  This is usually
10993    called while parsing the input, filling in or ready to fill in an EXACTish
10994    node, and the code point for the character should be returned, so that it
10995    can be added to the node, and parsing continued with the next input
10996    character.  But it may be that instead of a single character the \N{}
10997    expands to more than one, a named sequence.  In this case any following
10998    quantifier applies to the whole sequence, and it is easier, given the code
10999    structure that calls this, to handle it from a different area of the code.
11000    For this reason, the input parameters can be set so that it returns valid
11001    only on one or the other of these cases.
11002
11003    Another possibility is for the input to be an empty \N{}, which for
11004    backwards compatibility we accept, but generate a NOTHING node which should
11005    later get optimized out.  This is handled from the area of code which can
11006    handle a named sequence, so if called with the parameters for the other, it
11007    fails.
11008
11009    Still another possibility is for the \N to mean [^\n], and not a single
11010    character or explicit sequence at all.  This is determined by context.
11011    Again, this is handled from the area of code which can handle a named
11012    sequence, so if called with the parameters for the other, it also fails.
11013
11014    And the final possibility is for the \N to be called from within a bracketed
11015    character class.  In this case the [^\n] meaning makes no sense, and so is
11016    an error.  Other anomalous situations are left to the calling code to handle.
11017
11018    For non-single-quoted regexes, the tokenizer has attempted to decide which
11019    of the above applies, and in the case of a named sequence, has converted it
11020    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11021    where c1... are the characters in the sequence.  For single-quoted regexes,
11022    the tokenizer passes the \N sequence through unchanged; this code will not
11023    attempt to determine this nor expand those, instead raising a syntax error.
11024    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11025    or there is no '}', it signals that this \N occurrence means to match a
11026    non-newline. (This mostly was done because of [perl #56444].)
11027
11028    The API is somewhat convoluted due to historical and the above reasons.
11029
11030    The function raises an error (via vFAIL), and doesn't return for various
11031    syntax errors.  For other failures, it returns (STRLEN) -1.  For successes,
11032    it returns a count of how many characters were accounted for by it.  (This
11033    can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11034    points in the sequence.  It sets <node_p>, <valuep>, and/or
11035    <substitute_parse> on success.
11036
11037    If <valuep> is non-null, it means the caller can accept an input sequence
11038    consisting of a just a single code point; <*valuep> is set to the value
11039    of the only or first code point in the input.
11040
11041    If <substitute_parse> is non-null, it means the caller can accept an input
11042    sequence consisting of one or more code points; <*substitute_parse> is a
11043    newly created mortal SV* in this case, containing \x{} escapes representing
11044    those code points.
11045
11046    Both <valuep> and <substitute_parse> can be non-NULL.
11047
11048    If <node_p> is non-null, <substitute_parse> must be NULL.  This signifies
11049    that the caller can accept any legal sequence other than a single code
11050    point.  To wit, <*node_p> is set as follows:
11051     1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11052     2) \N{}:              points to a new NOTHING node; return is 0
11053     3) otherwise:         points to a new EXACT node containing the resolved
11054                           string; return is the number of code points in the
11055                           string.  This will never be 1.
11056    Note that failure is returned for single code point sequences if <valuep> is
11057    null and <node_p> is not.
11058  */
11059
11060     char * endbrace;    /* '}' following the name */
11061     char* p;
11062     char *endchar;      /* Points to '.' or '}' ending cur char in the input
11063                            stream */
11064     bool has_multiple_chars; /* true if the input stream contains a sequence of
11065                                 more than one character */
11066     bool in_char_class = substitute_parse != NULL;
11067     STRLEN count = 0;   /* Number of characters in this sequence */
11068
11069     GET_RE_DEBUG_FLAGS_DECL;
11070
11071     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11072
11073     GET_RE_DEBUG_FLAGS;
11074
11075     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
11076     assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11077
11078     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11079      * modifier.  The other meaning does not, so use a temporary until we find
11080      * out which we are being called with */
11081     p = (RExC_flags & RXf_PMf_EXTENDED)
11082         ? regpatws(pRExC_state, RExC_parse,
11083                                 TRUE) /* means recognize comments */
11084         : RExC_parse;
11085
11086     /* Disambiguate between \N meaning a named character versus \N meaning
11087      * [^\n].  The former is assumed when it can't be the latter. */
11088     if (*p != '{' || regcurly(p)) {
11089         RExC_parse = p;
11090         if (! node_p) {
11091             /* no bare \N allowed in a charclass */
11092             if (in_char_class) {
11093                 vFAIL("\\N in a character class must be a named character: \\N{...}");
11094             }
11095             return (STRLEN) -1;
11096         }
11097         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
11098                            current char */
11099         nextchar(pRExC_state);
11100         *node_p = reg_node(pRExC_state, REG_ANY);
11101         *flagp |= HASWIDTH|SIMPLE;
11102         RExC_naughty++;
11103         Set_Node_Length(*node_p, 1); /* MJD */
11104         return 1;
11105     }
11106
11107     /* Here, we have decided it should be a named character or sequence */
11108
11109     /* The test above made sure that the next real character is a '{', but
11110      * under the /x modifier, it could be separated by space (or a comment and
11111      * \n) and this is not allowed (for consistency with \x{...} and the
11112      * tokenizer handling of \N{NAME}). */
11113     if (*RExC_parse != '{') {
11114         vFAIL("Missing braces on \\N{}");
11115     }
11116
11117     RExC_parse++;       /* Skip past the '{' */
11118
11119     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11120         || ! (endbrace == RExC_parse            /* nothing between the {} */
11121               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
11122                                                  */
11123                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
11124                                                      */
11125     {
11126         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
11127         vFAIL("\\N{NAME} must be resolved by the lexer");
11128     }
11129
11130     if (endbrace == RExC_parse) {   /* empty: \N{} */
11131         if (node_p) {
11132             *node_p = reg_node(pRExC_state,NOTHING);
11133         }
11134         else if (! in_char_class) {
11135             return (STRLEN) -1;
11136         }
11137         nextchar(pRExC_state);
11138         return 0;
11139     }
11140
11141     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11142     RExC_parse += 2;    /* Skip past the 'U+' */
11143
11144     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11145
11146     /* Code points are separated by dots.  If none, there is only one code
11147      * point, and is terminated by the brace */
11148     has_multiple_chars = (endchar < endbrace);
11149
11150     /* We get the first code point if we want it, and either there is only one,
11151      * or we can accept both cases of one and more than one */
11152     if (valuep && (substitute_parse || ! has_multiple_chars)) {
11153         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11154         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11155                            | PERL_SCAN_DISALLOW_PREFIX
11156
11157                              /* No errors in the first pass (See [perl
11158                               * #122671].)  We let the code below find the
11159                               * errors when there are multiple chars. */
11160                            | ((SIZE_ONLY || has_multiple_chars)
11161                               ? PERL_SCAN_SILENT_ILLDIGIT
11162                               : 0);
11163
11164         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11165
11166         /* The tokenizer should have guaranteed validity, but it's possible to
11167          * bypass it by using single quoting, so check.  Don't do the check
11168          * here when there are multiple chars; we do it below anyway. */
11169         if (! has_multiple_chars) {
11170             if (length_of_hex == 0
11171                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11172             {
11173                 RExC_parse += length_of_hex;    /* Includes all the valid */
11174                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
11175                                 ? UTF8SKIP(RExC_parse)
11176                                 : 1;
11177                 /* Guard against malformed utf8 */
11178                 if (RExC_parse >= endchar) {
11179                     RExC_parse = endchar;
11180                 }
11181                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11182             }
11183
11184             RExC_parse = endbrace + 1;
11185             return 1;
11186         }
11187     }
11188
11189     /* Here, we should have already handled the case where a single character
11190      * is expected and found.  So it is a failure if we aren't expecting
11191      * multiple chars and got them; or didn't get them but wanted them.  We
11192      * fail without advancing the parse, so that the caller can try again with
11193      * different acceptance criteria */
11194     if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11195         RExC_parse = p;
11196         return (STRLEN) -1;
11197     }
11198
11199     {
11200
11201         /* What is done here is to convert this to a sub-pattern of the form
11202          * \x{char1}\x{char2}...
11203          * and then either return it in <*substitute_parse> if non-null; or
11204          * call reg recursively to parse it (enclosing in "(?: ... )" ).  That
11205          * way, it retains its atomicness, while not having to worry about
11206          * special handling that some code points may have.  toke.c has
11207          * converted the original Unicode values to native, so that we can just
11208          * pass on the hex values unchanged.  We do have to set a flag to keep
11209          * recoding from happening in the recursion */
11210
11211         SV * dummy = NULL;
11212         STRLEN len;
11213         char *orig_end = RExC_end;
11214         I32 flags;
11215
11216         if (substitute_parse) {
11217             *substitute_parse = newSVpvs("");
11218         }
11219         else {
11220             substitute_parse = &dummy;
11221             *substitute_parse = newSVpvs("?:");
11222         }
11223         *substitute_parse = sv_2mortal(*substitute_parse);
11224
11225         while (RExC_parse < endbrace) {
11226
11227             /* Convert to notation the rest of the code understands */
11228             sv_catpv(*substitute_parse, "\\x{");
11229             sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11230             sv_catpv(*substitute_parse, "}");
11231
11232             /* Point to the beginning of the next character in the sequence. */
11233             RExC_parse = endchar + 1;
11234             endchar = RExC_parse + strcspn(RExC_parse, ".}");
11235
11236             count++;
11237         }
11238         if (! in_char_class) {
11239             sv_catpv(*substitute_parse, ")");
11240         }
11241
11242         RExC_parse = SvPV(*substitute_parse, len);
11243
11244         /* Don't allow empty number */
11245         if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11246             RExC_parse = endbrace;
11247             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11248         }
11249         RExC_end = RExC_parse + len;
11250
11251         /* The values are Unicode, and therefore not subject to recoding */
11252         RExC_override_recoding = 1;
11253
11254         if (node_p) {
11255             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11256                 if (flags & RESTART_UTF8) {
11257                     *flagp = RESTART_UTF8;
11258                     return (STRLEN) -1;
11259                 }
11260                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11261                     (UV) flags);
11262             }
11263             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11264         }
11265
11266         RExC_parse = endbrace;
11267         RExC_end = orig_end;
11268         RExC_override_recoding = 0;
11269
11270         nextchar(pRExC_state);
11271     }
11272
11273     return count;
11274 }
11275
11276
11277 /*
11278  * reg_recode
11279  *
11280  * It returns the code point in utf8 for the value in *encp.
11281  *    value: a code value in the source encoding
11282  *    encp:  a pointer to an Encode object
11283  *
11284  * If the result from Encode is not a single character,
11285  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11286  */
11287 STATIC UV
11288 S_reg_recode(pTHX_ const char value, SV **encp)
11289 {
11290     STRLEN numlen = 1;
11291     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11292     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11293     const STRLEN newlen = SvCUR(sv);
11294     UV uv = UNICODE_REPLACEMENT;
11295
11296     PERL_ARGS_ASSERT_REG_RECODE;
11297
11298     if (newlen)
11299         uv = SvUTF8(sv)
11300              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11301              : *(U8*)s;
11302
11303     if (!newlen || numlen != newlen) {
11304         uv = UNICODE_REPLACEMENT;
11305         *encp = NULL;
11306     }
11307     return uv;
11308 }
11309
11310 PERL_STATIC_INLINE U8
11311 S_compute_EXACTish(RExC_state_t *pRExC_state)
11312 {
11313     U8 op;
11314
11315     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11316
11317     if (! FOLD) {
11318         return EXACT;
11319     }
11320
11321     op = get_regex_charset(RExC_flags);
11322     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11323         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11324                  been, so there is no hole */
11325     }
11326
11327     return op + EXACTF;
11328 }
11329
11330 PERL_STATIC_INLINE void
11331 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11332                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11333                          bool downgradable)
11334 {
11335     /* This knows the details about sizing an EXACTish node, setting flags for
11336      * it (by setting <*flagp>, and potentially populating it with a single
11337      * character.
11338      *
11339      * If <len> (the length in bytes) is non-zero, this function assumes that
11340      * the node has already been populated, and just does the sizing.  In this
11341      * case <code_point> should be the final code point that has already been
11342      * placed into the node.  This value will be ignored except that under some
11343      * circumstances <*flagp> is set based on it.
11344      *
11345      * If <len> is zero, the function assumes that the node is to contain only
11346      * the single character given by <code_point> and calculates what <len>
11347      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11348      * additionally will populate the node's STRING with <code_point> or its
11349      * fold if folding.
11350      *
11351      * In both cases <*flagp> is appropriately set
11352      *
11353      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11354      * 255, must be folded (the former only when the rules indicate it can
11355      * match 'ss')
11356      *
11357      * When it does the populating, it looks at the flag 'downgradable'.  If
11358      * true with a node that folds, it checks if the single code point
11359      * participates in a fold, and if not downgrades the node to an EXACT.
11360      * This helps the optimizer */
11361
11362     bool len_passed_in = cBOOL(len != 0);
11363     U8 character[UTF8_MAXBYTES_CASE+1];
11364
11365     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11366
11367     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11368      * sizing difference, and is extra work that is thrown away */
11369     if (downgradable && ! PASS2) {
11370         downgradable = FALSE;
11371     }
11372
11373     if (! len_passed_in) {
11374         if (UTF) {
11375             if (UVCHR_IS_INVARIANT(code_point)) {
11376                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11377                     *character = (U8) code_point;
11378                 }
11379                 else { /* Here is /i and not /l. (toFOLD() is defined on just
11380                           ASCII, which isn't the same thing as INVARIANT on
11381                           EBCDIC, but it works there, as the extra invariants
11382                           fold to themselves) */
11383                     *character = toFOLD((U8) code_point);
11384
11385                     /* We can downgrade to an EXACT node if this character
11386                      * isn't a folding one.  Note that this assumes that
11387                      * nothing above Latin1 folds to some other invariant than
11388                      * one of these alphabetics; otherwise we would also have
11389                      * to check:
11390                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11391                      *      || ASCII_FOLD_RESTRICTED))
11392                      */
11393                     if (downgradable && PL_fold[code_point] == code_point) {
11394                         OP(node) = EXACT;
11395                     }
11396                 }
11397                 len = 1;
11398             }
11399             else if (FOLD && (! LOC
11400                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11401             {   /* Folding, and ok to do so now */
11402                 UV folded = _to_uni_fold_flags(
11403                                    code_point,
11404                                    character,
11405                                    &len,
11406                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11407                                                       ? FOLD_FLAGS_NOMIX_ASCII
11408                                                       : 0));
11409                 if (downgradable
11410                     && folded == code_point /* This quickly rules out many
11411                                                cases, avoiding the
11412                                                _invlist_contains_cp() overhead
11413                                                for those.  */
11414                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11415                 {
11416                     OP(node) = EXACT;
11417                 }
11418             }
11419             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11420
11421                 /* Not folding this cp, and can output it directly */
11422                 *character = UTF8_TWO_BYTE_HI(code_point);
11423                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11424                 len = 2;
11425             }
11426             else {
11427                 uvchr_to_utf8( character, code_point);
11428                 len = UTF8SKIP(character);
11429             }
11430         } /* Else pattern isn't UTF8.  */
11431         else if (! FOLD) {
11432             *character = (U8) code_point;
11433             len = 1;
11434         } /* Else is folded non-UTF8 */
11435         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11436
11437             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11438              * comments at join_exact()); */
11439             *character = (U8) code_point;
11440             len = 1;
11441
11442             /* Can turn into an EXACT node if we know the fold at compile time,
11443              * and it folds to itself and doesn't particpate in other folds */
11444             if (downgradable
11445                 && ! LOC
11446                 && PL_fold_latin1[code_point] == code_point
11447                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11448                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11449             {
11450                 OP(node) = EXACT;
11451             }
11452         } /* else is Sharp s.  May need to fold it */
11453         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11454             *character = 's';
11455             *(character + 1) = 's';
11456             len = 2;
11457         }
11458         else {
11459             *character = LATIN_SMALL_LETTER_SHARP_S;
11460             len = 1;
11461         }
11462     }
11463
11464     if (SIZE_ONLY) {
11465         RExC_size += STR_SZ(len);
11466     }
11467     else {
11468         RExC_emit += STR_SZ(len);
11469         STR_LEN(node) = len;
11470         if (! len_passed_in) {
11471             Copy((char *) character, STRING(node), len, char);
11472         }
11473     }
11474
11475     *flagp |= HASWIDTH;
11476
11477     /* A single character node is SIMPLE, except for the special-cased SHARP S
11478      * under /di. */
11479     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11480         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11481             || ! FOLD || ! DEPENDS_SEMANTICS))
11482     {
11483         *flagp |= SIMPLE;
11484     }
11485
11486     /* The OP may not be well defined in PASS1 */
11487     if (PASS2 && OP(node) == EXACTFL) {
11488         RExC_contains_locale = 1;
11489     }
11490 }
11491
11492
11493 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11494  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11495
11496 static I32
11497 S_backref_value(char *p)
11498 {
11499     const char* endptr;
11500     UV val = grok_atou(p, &endptr);
11501     if (endptr == p || endptr == NULL || val > I32_MAX)
11502         return I32_MAX;
11503     return (I32)val;
11504 }
11505
11506
11507 /*
11508  - regatom - the lowest level
11509
11510    Try to identify anything special at the start of the pattern. If there
11511    is, then handle it as required. This may involve generating a single regop,
11512    such as for an assertion; or it may involve recursing, such as to
11513    handle a () structure.
11514
11515    If the string doesn't start with something special then we gobble up
11516    as much literal text as we can.
11517
11518    Once we have been able to handle whatever type of thing started the
11519    sequence, we return.
11520
11521    Note: we have to be careful with escapes, as they can be both literal
11522    and special, and in the case of \10 and friends, context determines which.
11523
11524    A summary of the code structure is:
11525
11526    switch (first_byte) {
11527         cases for each special:
11528             handle this special;
11529             break;
11530         case '\\':
11531             switch (2nd byte) {
11532                 cases for each unambiguous special:
11533                     handle this special;
11534                     break;
11535                 cases for each ambigous special/literal:
11536                     disambiguate;
11537                     if (special)  handle here
11538                     else goto defchar;
11539                 default: // unambiguously literal:
11540                     goto defchar;
11541             }
11542         default:  // is a literal char
11543             // FALL THROUGH
11544         defchar:
11545             create EXACTish node for literal;
11546             while (more input and node isn't full) {
11547                 switch (input_byte) {
11548                    cases for each special;
11549                        make sure parse pointer is set so that the next call to
11550                            regatom will see this special first
11551                        goto loopdone; // EXACTish node terminated by prev. char
11552                    default:
11553                        append char to EXACTISH node;
11554                 }
11555                 get next input byte;
11556             }
11557         loopdone:
11558    }
11559    return the generated node;
11560
11561    Specifically there are two separate switches for handling
11562    escape sequences, with the one for handling literal escapes requiring
11563    a dummy entry for all of the special escapes that are actually handled
11564    by the other.
11565
11566    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11567    TRYAGAIN.
11568    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11569    restarted.
11570    Otherwise does not return NULL.
11571 */
11572
11573 STATIC regnode *
11574 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11575 {
11576     regnode *ret = NULL;
11577     I32 flags = 0;
11578     char *parse_start = RExC_parse;
11579     U8 op;
11580     int invert = 0;
11581     U8 arg;
11582
11583     GET_RE_DEBUG_FLAGS_DECL;
11584
11585     *flagp = WORST;             /* Tentatively. */
11586
11587     DEBUG_PARSE("atom");
11588
11589     PERL_ARGS_ASSERT_REGATOM;
11590
11591 tryagain:
11592     switch ((U8)*RExC_parse) {
11593     case '^':
11594         RExC_seen_zerolen++;
11595         nextchar(pRExC_state);
11596         if (RExC_flags & RXf_PMf_MULTILINE)
11597             ret = reg_node(pRExC_state, MBOL);
11598         else
11599             ret = reg_node(pRExC_state, SBOL);
11600         Set_Node_Length(ret, 1); /* MJD */
11601         break;
11602     case '$':
11603         nextchar(pRExC_state);
11604         if (*RExC_parse)
11605             RExC_seen_zerolen++;
11606         if (RExC_flags & RXf_PMf_MULTILINE)
11607             ret = reg_node(pRExC_state, MEOL);
11608         else
11609             ret = reg_node(pRExC_state, SEOL);
11610         Set_Node_Length(ret, 1); /* MJD */
11611         break;
11612     case '.':
11613         nextchar(pRExC_state);
11614         if (RExC_flags & RXf_PMf_SINGLELINE)
11615             ret = reg_node(pRExC_state, SANY);
11616         else
11617             ret = reg_node(pRExC_state, REG_ANY);
11618         *flagp |= HASWIDTH|SIMPLE;
11619         RExC_naughty++;
11620         Set_Node_Length(ret, 1); /* MJD */
11621         break;
11622     case '[':
11623     {
11624         char * const oregcomp_parse = ++RExC_parse;
11625         ret = regclass(pRExC_state, flagp,depth+1,
11626                        FALSE, /* means parse the whole char class */
11627                        TRUE, /* allow multi-char folds */
11628                        FALSE, /* don't silence non-portable warnings. */
11629                        NULL);
11630         if (*RExC_parse != ']') {
11631             RExC_parse = oregcomp_parse;
11632             vFAIL("Unmatched [");
11633         }
11634         if (ret == NULL) {
11635             if (*flagp & RESTART_UTF8)
11636                 return NULL;
11637             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11638                   (UV) *flagp);
11639         }
11640         nextchar(pRExC_state);
11641         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11642         break;
11643     }
11644     case '(':
11645         nextchar(pRExC_state);
11646         ret = reg(pRExC_state, 2, &flags,depth+1);
11647         if (ret == NULL) {
11648                 if (flags & TRYAGAIN) {
11649                     if (RExC_parse == RExC_end) {
11650                          /* Make parent create an empty node if needed. */
11651                         *flagp |= TRYAGAIN;
11652                         return(NULL);
11653                     }
11654                     goto tryagain;
11655                 }
11656                 if (flags & RESTART_UTF8) {
11657                     *flagp = RESTART_UTF8;
11658                     return NULL;
11659                 }
11660                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11661                                                                  (UV) flags);
11662         }
11663         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11664         break;
11665     case '|':
11666     case ')':
11667         if (flags & TRYAGAIN) {
11668             *flagp |= TRYAGAIN;
11669             return NULL;
11670         }
11671         vFAIL("Internal urp");
11672                                 /* Supposed to be caught earlier. */
11673         break;
11674     case '?':
11675     case '+':
11676     case '*':
11677         RExC_parse++;
11678         vFAIL("Quantifier follows nothing");
11679         break;
11680     case '\\':
11681         /* Special Escapes
11682
11683            This switch handles escape sequences that resolve to some kind
11684            of special regop and not to literal text. Escape sequnces that
11685            resolve to literal text are handled below in the switch marked
11686            "Literal Escapes".
11687
11688            Every entry in this switch *must* have a corresponding entry
11689            in the literal escape switch. However, the opposite is not
11690            required, as the default for this switch is to jump to the
11691            literal text handling code.
11692         */
11693         switch ((U8)*++RExC_parse) {
11694         /* Special Escapes */
11695         case 'A':
11696             RExC_seen_zerolen++;
11697             ret = reg_node(pRExC_state, SBOL);
11698             /* SBOL is shared with /^/ so we set the flags so we can tell
11699              * /\A/ from /^/ in split. We check ret because first pass we
11700              * have no regop struct to set the flags on. */
11701             if (PASS2)
11702                 ret->flags = 1;
11703             *flagp |= SIMPLE;
11704             goto finish_meta_pat;
11705         case 'G':
11706             ret = reg_node(pRExC_state, GPOS);
11707             RExC_seen |= REG_GPOS_SEEN;
11708             *flagp |= SIMPLE;
11709             goto finish_meta_pat;
11710         case 'K':
11711             RExC_seen_zerolen++;
11712             ret = reg_node(pRExC_state, KEEPS);
11713             *flagp |= SIMPLE;
11714             /* XXX:dmq : disabling in-place substitution seems to
11715              * be necessary here to avoid cases of memory corruption, as
11716              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11717              */
11718             RExC_seen |= REG_LOOKBEHIND_SEEN;
11719             goto finish_meta_pat;
11720         case 'Z':
11721             ret = reg_node(pRExC_state, SEOL);
11722             *flagp |= SIMPLE;
11723             RExC_seen_zerolen++;                /* Do not optimize RE away */
11724             goto finish_meta_pat;
11725         case 'z':
11726             ret = reg_node(pRExC_state, EOS);
11727             *flagp |= SIMPLE;
11728             RExC_seen_zerolen++;                /* Do not optimize RE away */
11729             goto finish_meta_pat;
11730         case 'C':
11731             ret = reg_node(pRExC_state, CANY);
11732             RExC_seen |= REG_CANY_SEEN;
11733             *flagp |= HASWIDTH|SIMPLE;
11734             if (PASS2) {
11735                 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11736             }
11737             goto finish_meta_pat;
11738         case 'X':
11739             ret = reg_node(pRExC_state, CLUMP);
11740             *flagp |= HASWIDTH;
11741             goto finish_meta_pat;
11742
11743         case 'W':
11744             invert = 1;
11745             /* FALLTHROUGH */
11746         case 'w':
11747             arg = ANYOF_WORDCHAR;
11748             goto join_posix;
11749
11750         case 'b':
11751             RExC_seen_zerolen++;
11752             RExC_seen |= REG_LOOKBEHIND_SEEN;
11753             op = BOUND + get_regex_charset(RExC_flags);
11754             if (op > BOUNDA) {  /* /aa is same as /a */
11755                 op = BOUNDA;
11756             }
11757             else if (op == BOUNDL) {
11758                 RExC_contains_locale = 1;
11759             }
11760             ret = reg_node(pRExC_state, op);
11761             FLAGS(ret) = get_regex_charset(RExC_flags);
11762             *flagp |= SIMPLE;
11763             if ((U8) *(RExC_parse + 1) == '{') {
11764                 /* diag_listed_as: Use "%s" instead of "%s" */
11765                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11766             }
11767             goto finish_meta_pat;
11768         case 'B':
11769             RExC_seen_zerolen++;
11770             RExC_seen |= REG_LOOKBEHIND_SEEN;
11771             op = NBOUND + get_regex_charset(RExC_flags);
11772             if (op > NBOUNDA) { /* /aa is same as /a */
11773                 op = NBOUNDA;
11774             }
11775             else if (op == NBOUNDL) {
11776                 RExC_contains_locale = 1;
11777             }
11778             ret = reg_node(pRExC_state, op);
11779             FLAGS(ret) = get_regex_charset(RExC_flags);
11780             *flagp |= SIMPLE;
11781             if ((U8) *(RExC_parse + 1) == '{') {
11782                 /* diag_listed_as: Use "%s" instead of "%s" */
11783                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11784             }
11785             goto finish_meta_pat;
11786
11787         case 'D':
11788             invert = 1;
11789             /* FALLTHROUGH */
11790         case 'd':
11791             arg = ANYOF_DIGIT;
11792             goto join_posix;
11793
11794         case 'R':
11795             ret = reg_node(pRExC_state, LNBREAK);
11796             *flagp |= HASWIDTH|SIMPLE;
11797             goto finish_meta_pat;
11798
11799         case 'H':
11800             invert = 1;
11801             /* FALLTHROUGH */
11802         case 'h':
11803             arg = ANYOF_BLANK;
11804             op = POSIXU;
11805             goto join_posix_op_known;
11806
11807         case 'V':
11808             invert = 1;
11809             /* FALLTHROUGH */
11810         case 'v':
11811             arg = ANYOF_VERTWS;
11812             op = POSIXU;
11813             goto join_posix_op_known;
11814
11815         case 'S':
11816             invert = 1;
11817             /* FALLTHROUGH */
11818         case 's':
11819             arg = ANYOF_SPACE;
11820
11821         join_posix:
11822
11823             op = POSIXD + get_regex_charset(RExC_flags);
11824             if (op > POSIXA) {  /* /aa is same as /a */
11825                 op = POSIXA;
11826             }
11827             else if (op == POSIXL) {
11828                 RExC_contains_locale = 1;
11829             }
11830
11831         join_posix_op_known:
11832
11833             if (invert) {
11834                 op += NPOSIXD - POSIXD;
11835             }
11836
11837             ret = reg_node(pRExC_state, op);
11838             if (! SIZE_ONLY) {
11839                 FLAGS(ret) = namedclass_to_classnum(arg);
11840             }
11841
11842             *flagp |= HASWIDTH|SIMPLE;
11843             /* FALLTHROUGH */
11844
11845          finish_meta_pat:
11846             nextchar(pRExC_state);
11847             Set_Node_Length(ret, 2); /* MJD */
11848             break;
11849         case 'p':
11850         case 'P':
11851             {
11852 #ifdef DEBUGGING
11853                 char* parse_start = RExC_parse - 2;
11854 #endif
11855
11856                 RExC_parse--;
11857
11858                 ret = regclass(pRExC_state, flagp,depth+1,
11859                                TRUE, /* means just parse this element */
11860                                FALSE, /* don't allow multi-char folds */
11861                                FALSE, /* don't silence non-portable warnings.
11862                                          It would be a bug if these returned
11863                                          non-portables */
11864                                NULL);
11865                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11866                    are allowed.  */
11867                 if (!ret)
11868                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11869                           (UV) *flagp);
11870
11871                 RExC_parse--;
11872
11873                 Set_Node_Offset(ret, parse_start + 2);
11874                 Set_Node_Cur_Length(ret, parse_start);
11875                 nextchar(pRExC_state);
11876             }
11877             break;
11878         case 'N':
11879             /* Handle \N and \N{NAME} with multiple code points here and not
11880              * below because it can be multicharacter. join_exact() will join
11881              * them up later on.  Also this makes sure that things like
11882              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11883              * The options to the grok function call causes it to fail if the
11884              * sequence is just a single code point.  We then go treat it as
11885              * just another character in the current EXACT node, and hence it
11886              * gets uniform treatment with all the other characters.  The
11887              * special treatment for quantifiers is not needed for such single
11888              * character sequences */
11889             ++RExC_parse;
11890             if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11891                                              depth, FALSE))
11892             {
11893                 if (*flagp & RESTART_UTF8)
11894                     return NULL;
11895                 RExC_parse--;
11896                 goto defchar;
11897             }
11898             break;
11899         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11900         parse_named_seq:
11901         {
11902             char ch= RExC_parse[1];
11903             if (ch != '<' && ch != '\'' && ch != '{') {
11904                 RExC_parse++;
11905                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11906                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11907             } else {
11908                 /* this pretty much dupes the code for (?P=...) in reg(), if
11909                    you change this make sure you change that */
11910                 char* name_start = (RExC_parse += 2);
11911                 U32 num = 0;
11912                 SV *sv_dat = reg_scan_name(pRExC_state,
11913                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11914                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11915                 if (RExC_parse == name_start || *RExC_parse != ch)
11916                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11917                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11918
11919                 if (!SIZE_ONLY) {
11920                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11921                     RExC_rxi->data->data[num]=(void*)sv_dat;
11922                     SvREFCNT_inc_simple_void(sv_dat);
11923                 }
11924
11925                 RExC_sawback = 1;
11926                 ret = reganode(pRExC_state,
11927                                ((! FOLD)
11928                                  ? NREF
11929                                  : (ASCII_FOLD_RESTRICTED)
11930                                    ? NREFFA
11931                                    : (AT_LEAST_UNI_SEMANTICS)
11932                                      ? NREFFU
11933                                      : (LOC)
11934                                        ? NREFFL
11935                                        : NREFF),
11936                                 num);
11937                 *flagp |= HASWIDTH;
11938
11939                 /* override incorrect value set in reganode MJD */
11940                 Set_Node_Offset(ret, parse_start+1);
11941                 Set_Node_Cur_Length(ret, parse_start);
11942                 nextchar(pRExC_state);
11943
11944             }
11945             break;
11946         }
11947         case 'g':
11948         case '1': case '2': case '3': case '4':
11949         case '5': case '6': case '7': case '8': case '9':
11950             {
11951                 I32 num;
11952                 bool hasbrace = 0;
11953
11954                 if (*RExC_parse == 'g') {
11955                     bool isrel = 0;
11956
11957                     RExC_parse++;
11958                     if (*RExC_parse == '{') {
11959                         RExC_parse++;
11960                         hasbrace = 1;
11961                     }
11962                     if (*RExC_parse == '-') {
11963                         RExC_parse++;
11964                         isrel = 1;
11965                     }
11966                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11967                         if (isrel) RExC_parse--;
11968                         RExC_parse -= 2;
11969                         goto parse_named_seq;
11970                     }
11971
11972                     num = S_backref_value(RExC_parse);
11973                     if (num == 0)
11974                         vFAIL("Reference to invalid group 0");
11975                     else if (num == I32_MAX) {
11976                          if (isDIGIT(*RExC_parse))
11977                             vFAIL("Reference to nonexistent group");
11978                         else
11979                             vFAIL("Unterminated \\g... pattern");
11980                     }
11981
11982                     if (isrel) {
11983                         num = RExC_npar - num;
11984                         if (num < 1)
11985                             vFAIL("Reference to nonexistent or unclosed group");
11986                     }
11987                 }
11988                 else {
11989                     num = S_backref_value(RExC_parse);
11990                     /* bare \NNN might be backref or octal - if it is larger than or equal
11991                      * RExC_npar then it is assumed to be and octal escape.
11992                      * Note RExC_npar is +1 from the actual number of parens*/
11993                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11994                             && *RExC_parse != '8' && *RExC_parse != '9'))
11995                     {
11996                         /* Probably a character specified in octal, e.g. \35 */
11997                         goto defchar;
11998                     }
11999                 }
12000
12001                 /* at this point RExC_parse definitely points to a backref
12002                  * number */
12003                 {
12004 #ifdef RE_TRACK_PATTERN_OFFSETS
12005                     char * const parse_start = RExC_parse - 1; /* MJD */
12006 #endif
12007                     while (isDIGIT(*RExC_parse))
12008                         RExC_parse++;
12009                     if (hasbrace) {
12010                         if (*RExC_parse != '}')
12011                             vFAIL("Unterminated \\g{...} pattern");
12012                         RExC_parse++;
12013                     }
12014                     if (!SIZE_ONLY) {
12015                         if (num > (I32)RExC_rx->nparens)
12016                             vFAIL("Reference to nonexistent group");
12017                     }
12018                     RExC_sawback = 1;
12019                     ret = reganode(pRExC_state,
12020                                    ((! FOLD)
12021                                      ? REF
12022                                      : (ASCII_FOLD_RESTRICTED)
12023                                        ? REFFA
12024                                        : (AT_LEAST_UNI_SEMANTICS)
12025                                          ? REFFU
12026                                          : (LOC)
12027                                            ? REFFL
12028                                            : REFF),
12029                                     num);
12030                     *flagp |= HASWIDTH;
12031
12032                     /* override incorrect value set in reganode MJD */
12033                     Set_Node_Offset(ret, parse_start+1);
12034                     Set_Node_Cur_Length(ret, parse_start);
12035                     RExC_parse--;
12036                     nextchar(pRExC_state);
12037                 }
12038             }
12039             break;
12040         case '\0':
12041             if (RExC_parse >= RExC_end)
12042                 FAIL("Trailing \\");
12043             /* FALLTHROUGH */
12044         default:
12045             /* Do not generate "unrecognized" warnings here, we fall
12046                back into the quick-grab loop below */
12047             parse_start--;
12048             goto defchar;
12049         }
12050         break;
12051
12052     case '#':
12053         if (RExC_flags & RXf_PMf_EXTENDED) {
12054             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12055             if (RExC_parse < RExC_end)
12056                 goto tryagain;
12057         }
12058         /* FALLTHROUGH */
12059
12060     default:
12061
12062             parse_start = RExC_parse - 1;
12063
12064             RExC_parse++;
12065
12066         defchar: {
12067             STRLEN len = 0;
12068             UV ender = 0;
12069             char *p;
12070             char *s;
12071 #define MAX_NODE_STRING_SIZE 127
12072             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12073             char *s0;
12074             U8 upper_parse = MAX_NODE_STRING_SIZE;
12075             U8 node_type = compute_EXACTish(pRExC_state);
12076             bool next_is_quantifier;
12077             char * oldp = NULL;
12078
12079             /* We can convert EXACTF nodes to EXACTFU if they contain only
12080              * characters that match identically regardless of the target
12081              * string's UTF8ness.  The reason to do this is that EXACTF is not
12082              * trie-able, EXACTFU is.
12083              *
12084              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12085              * contain only above-Latin1 characters (hence must be in UTF8),
12086              * which don't participate in folds with Latin1-range characters,
12087              * as the latter's folds aren't known until runtime.  (We don't
12088              * need to figure this out until pass 2) */
12089             bool maybe_exactfu = PASS2
12090                                && (node_type == EXACTF || node_type == EXACTFL);
12091
12092             /* If a folding node contains only code points that don't
12093              * participate in folds, it can be changed into an EXACT node,
12094              * which allows the optimizer more things to look for */
12095             bool maybe_exact;
12096
12097             ret = reg_node(pRExC_state, node_type);
12098
12099             /* In pass1, folded, we use a temporary buffer instead of the
12100              * actual node, as the node doesn't exist yet */
12101             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12102
12103             s0 = s;
12104
12105         reparse:
12106
12107             /* We do the EXACTFish to EXACT node only if folding.  (And we
12108              * don't need to figure this out until pass 2) */
12109             maybe_exact = FOLD && PASS2;
12110
12111             /* XXX The node can hold up to 255 bytes, yet this only goes to
12112              * 127.  I (khw) do not know why.  Keeping it somewhat less than
12113              * 255 allows us to not have to worry about overflow due to
12114              * converting to utf8 and fold expansion, but that value is
12115              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12116              * split up by this limit into a single one using the real max of
12117              * 255.  Even at 127, this breaks under rare circumstances.  If
12118              * folding, we do not want to split a node at a character that is a
12119              * non-final in a multi-char fold, as an input string could just
12120              * happen to want to match across the node boundary.  The join
12121              * would solve that problem if the join actually happens.  But a
12122              * series of more than two nodes in a row each of 127 would cause
12123              * the first join to succeed to get to 254, but then there wouldn't
12124              * be room for the next one, which could at be one of those split
12125              * multi-char folds.  I don't know of any fool-proof solution.  One
12126              * could back off to end with only a code point that isn't such a
12127              * non-final, but it is possible for there not to be any in the
12128              * entire node. */
12129             for (p = RExC_parse - 1;
12130                  len < upper_parse && p < RExC_end;
12131                  len++)
12132             {
12133                 oldp = p;
12134
12135                 if (RExC_flags & RXf_PMf_EXTENDED)
12136                     p = regpatws(pRExC_state, p,
12137                                           TRUE); /* means recognize comments */
12138                 switch ((U8)*p) {
12139                 case '^':
12140                 case '$':
12141                 case '.':
12142                 case '[':
12143                 case '(':
12144                 case ')':
12145                 case '|':
12146                     goto loopdone;
12147                 case '\\':
12148                     /* Literal Escapes Switch
12149
12150                        This switch is meant to handle escape sequences that
12151                        resolve to a literal character.
12152
12153                        Every escape sequence that represents something
12154                        else, like an assertion or a char class, is handled
12155                        in the switch marked 'Special Escapes' above in this
12156                        routine, but also has an entry here as anything that
12157                        isn't explicitly mentioned here will be treated as
12158                        an unescaped equivalent literal.
12159                     */
12160
12161                     switch ((U8)*++p) {
12162                     /* These are all the special escapes. */
12163                     case 'A':             /* Start assertion */
12164                     case 'b': case 'B':   /* Word-boundary assertion*/
12165                     case 'C':             /* Single char !DANGEROUS! */
12166                     case 'd': case 'D':   /* digit class */
12167                     case 'g': case 'G':   /* generic-backref, pos assertion */
12168                     case 'h': case 'H':   /* HORIZWS */
12169                     case 'k': case 'K':   /* named backref, keep marker */
12170                     case 'p': case 'P':   /* Unicode property */
12171                               case 'R':   /* LNBREAK */
12172                     case 's': case 'S':   /* space class */
12173                     case 'v': case 'V':   /* VERTWS */
12174                     case 'w': case 'W':   /* word class */
12175                     case 'X':             /* eXtended Unicode "combining
12176                                              character sequence" */
12177                     case 'z': case 'Z':   /* End of line/string assertion */
12178                         --p;
12179                         goto loopdone;
12180
12181                     /* Anything after here is an escape that resolves to a
12182                        literal. (Except digits, which may or may not)
12183                      */
12184                     case 'n':
12185                         ender = '\n';
12186                         p++;
12187                         break;
12188                     case 'N': /* Handle a single-code point named character. */
12189                         /* The options cause it to fail if a multiple code
12190                          * point sequence.  Handle those in the switch() above
12191                          * */
12192                         RExC_parse = p + 1;
12193                         if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12194                                                          &ender,
12195                                                          flagp,
12196                                                          depth,
12197                                                          FALSE
12198                         )) {
12199                             if (*flagp & RESTART_UTF8)
12200                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12201                             RExC_parse = p = oldp;
12202                             goto loopdone;
12203                         }
12204                         p = RExC_parse;
12205                         if (ender > 0xff) {
12206                             REQUIRE_UTF8;
12207                         }
12208                         break;
12209                     case 'r':
12210                         ender = '\r';
12211                         p++;
12212                         break;
12213                     case 't':
12214                         ender = '\t';
12215                         p++;
12216                         break;
12217                     case 'f':
12218                         ender = '\f';
12219                         p++;
12220                         break;
12221                     case 'e':
12222                         ender = ESC_NATIVE;
12223                         p++;
12224                         break;
12225                     case 'a':
12226                         ender = '\a';
12227                         p++;
12228                         break;
12229                     case 'o':
12230                         {
12231                             UV result;
12232                             const char* error_msg;
12233
12234                             bool valid = grok_bslash_o(&p,
12235                                                        &result,
12236                                                        &error_msg,
12237                                                        PASS2, /* out warnings */
12238                                                        FALSE, /* not strict */
12239                                                        TRUE, /* Output warnings
12240                                                                 for non-
12241                                                                 portables */
12242                                                        UTF);
12243                             if (! valid) {
12244                                 RExC_parse = p; /* going to die anyway; point
12245                                                    to exact spot of failure */
12246                                 vFAIL(error_msg);
12247                             }
12248                             ender = result;
12249                             if (IN_ENCODING && ender < 0x100) {
12250                                 goto recode_encoding;
12251                             }
12252                             if (ender > 0xff) {
12253                                 REQUIRE_UTF8;
12254                             }
12255                             break;
12256                         }
12257                     case 'x':
12258                         {
12259                             UV result = UV_MAX; /* initialize to erroneous
12260                                                    value */
12261                             const char* error_msg;
12262
12263                             bool valid = grok_bslash_x(&p,
12264                                                        &result,
12265                                                        &error_msg,
12266                                                        PASS2, /* out warnings */
12267                                                        FALSE, /* not strict */
12268                                                        TRUE, /* Output warnings
12269                                                                 for non-
12270                                                                 portables */
12271                                                        UTF);
12272                             if (! valid) {
12273                                 RExC_parse = p; /* going to die anyway; point
12274                                                    to exact spot of failure */
12275                                 vFAIL(error_msg);
12276                             }
12277                             ender = result;
12278
12279                             if (IN_ENCODING && ender < 0x100) {
12280                                 goto recode_encoding;
12281                             }
12282                             if (ender > 0xff) {
12283                                 REQUIRE_UTF8;
12284                             }
12285                             break;
12286                         }
12287                     case 'c':
12288                         p++;
12289                         ender = grok_bslash_c(*p++, PASS2);
12290                         break;
12291                     case '8': case '9': /* must be a backreference */
12292                         --p;
12293                         goto loopdone;
12294                     case '1': case '2': case '3':case '4':
12295                     case '5': case '6': case '7':
12296                         /* When we parse backslash escapes there is ambiguity
12297                          * between backreferences and octal escapes. Any escape
12298                          * from \1 - \9 is a backreference, any multi-digit
12299                          * escape which does not start with 0 and which when
12300                          * evaluated as decimal could refer to an already
12301                          * parsed capture buffer is a backslash. Anything else
12302                          * is octal.
12303                          *
12304                          * Note this implies that \118 could be interpreted as
12305                          * 118 OR as "\11" . "8" depending on whether there
12306                          * were 118 capture buffers defined already in the
12307                          * pattern.  */
12308
12309                         /* NOTE, RExC_npar is 1 more than the actual number of
12310                          * parens we have seen so far, hence the < RExC_npar below. */
12311
12312                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12313                         {  /* Not to be treated as an octal constant, go
12314                                    find backref */
12315                             --p;
12316                             goto loopdone;
12317                         }
12318                         /* FALLTHROUGH */
12319                     case '0':
12320                         {
12321                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12322                             STRLEN numlen = 3;
12323                             ender = grok_oct(p, &numlen, &flags, NULL);
12324                             if (ender > 0xff) {
12325                                 REQUIRE_UTF8;
12326                             }
12327                             p += numlen;
12328                             if (PASS2   /* like \08, \178 */
12329                                 && numlen < 3
12330                                 && p < RExC_end
12331                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12332                             {
12333                                 reg_warn_non_literal_string(
12334                                          p + 1,
12335                                          form_short_octal_warning(p, numlen));
12336                             }
12337                         }
12338                         if (IN_ENCODING && ender < 0x100)
12339                             goto recode_encoding;
12340                         break;
12341                     recode_encoding:
12342                         if (! RExC_override_recoding) {
12343                             SV* enc = _get_encoding();
12344                             ender = reg_recode((const char)(U8)ender, &enc);
12345                             if (!enc && PASS2)
12346                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12347                             REQUIRE_UTF8;
12348                         }
12349                         break;
12350                     case '\0':
12351                         if (p >= RExC_end)
12352                             FAIL("Trailing \\");
12353                         /* FALLTHROUGH */
12354                     default:
12355                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12356                             /* Include any { following the alpha to emphasize
12357                              * that it could be part of an escape at some point
12358                              * in the future */
12359                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12360                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12361                         }
12362                         goto normal_default;
12363                     } /* End of switch on '\' */
12364                     break;
12365                 case '{':
12366                     /* Currently we don't warn when the lbrace is at the start
12367                      * of a construct.  This catches it in the middle of a
12368                      * literal string, or when its the first thing after
12369                      * something like "\b" */
12370                     if (! SIZE_ONLY
12371                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12372                     {
12373                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12374                     }
12375                     /*FALLTHROUGH*/
12376                 default:    /* A literal character */
12377                   normal_default:
12378                     if (UTF8_IS_START(*p) && UTF) {
12379                         STRLEN numlen;
12380                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12381                                                &numlen, UTF8_ALLOW_DEFAULT);
12382                         p += numlen;
12383                     }
12384                     else
12385                         ender = (U8) *p++;
12386                     break;
12387                 } /* End of switch on the literal */
12388
12389                 /* Here, have looked at the literal character and <ender>
12390                  * contains its ordinal, <p> points to the character after it
12391                  */
12392
12393                 if ( RExC_flags & RXf_PMf_EXTENDED)
12394                     p = regpatws(pRExC_state, p,
12395                                           TRUE); /* means recognize comments */
12396
12397                 /* If the next thing is a quantifier, it applies to this
12398                  * character only, which means that this character has to be in
12399                  * its own node and can't just be appended to the string in an
12400                  * existing node, so if there are already other characters in
12401                  * the node, close the node with just them, and set up to do
12402                  * this character again next time through, when it will be the
12403                  * only thing in its new node */
12404                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12405                 {
12406                     p = oldp;
12407                     goto loopdone;
12408                 }
12409
12410                 if (! FOLD   /* The simple case, just append the literal */
12411                     || (LOC  /* Also don't fold for tricky chars under /l */
12412                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12413                 {
12414                     if (UTF) {
12415                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12416                         if (unilen > 0) {
12417                            s   += unilen;
12418                            len += unilen;
12419                         }
12420
12421                         /* The loop increments <len> each time, as all but this
12422                          * path (and one other) through it add a single byte to
12423                          * the EXACTish node.  But this one has changed len to
12424                          * be the correct final value, so subtract one to
12425                          * cancel out the increment that follows */
12426                         len--;
12427                     }
12428                     else {
12429                         REGC((char)ender, s++);
12430                     }
12431
12432                     /* Can get here if folding only if is one of the /l
12433                      * characters whose fold depends on the locale.  The
12434                      * occurrence of any of these indicate that we can't
12435                      * simplify things */
12436                     if (FOLD) {
12437                         maybe_exact = FALSE;
12438                         maybe_exactfu = FALSE;
12439                     }
12440                 }
12441                 else             /* FOLD */
12442                      if (! ( UTF
12443                         /* See comments for join_exact() as to why we fold this
12444                          * non-UTF at compile time */
12445                         || (node_type == EXACTFU
12446                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12447                 {
12448                     /* Here, are folding and are not UTF-8 encoded; therefore
12449                      * the character must be in the range 0-255, and is not /l
12450                      * (Not /l because we already handled these under /l in
12451                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12452                     if (IS_IN_SOME_FOLD_L1(ender)) {
12453                         maybe_exact = FALSE;
12454
12455                         /* See if the character's fold differs between /d and
12456                          * /u.  This includes the multi-char fold SHARP S to
12457                          * 'ss' */
12458                         if (maybe_exactfu
12459                             && (PL_fold[ender] != PL_fold_latin1[ender]
12460                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12461                                 || (len > 0
12462                                    && isALPHA_FOLD_EQ(ender, 's')
12463                                    && isALPHA_FOLD_EQ(*(s-1), 's'))))
12464                         {
12465                             maybe_exactfu = FALSE;
12466                         }
12467                     }
12468
12469                     /* Even when folding, we store just the input character, as
12470                      * we have an array that finds its fold quickly */
12471                     *(s++) = (char) ender;
12472                 }
12473                 else {  /* FOLD and UTF */
12474                     /* Unlike the non-fold case, we do actually have to
12475                      * calculate the results here in pass 1.  This is for two
12476                      * reasons, the folded length may be longer than the
12477                      * unfolded, and we have to calculate how many EXACTish
12478                      * nodes it will take; and we may run out of room in a node
12479                      * in the middle of a potential multi-char fold, and have
12480                      * to back off accordingly.  (Hence we can't use REGC for
12481                      * the simple case just below.) */
12482
12483                     UV folded;
12484                     if (isASCII_uni(ender)) {
12485                         folded = toFOLD(ender);
12486                         *(s)++ = (U8) folded;
12487                     }
12488                     else {
12489                         STRLEN foldlen;
12490
12491                         folded = _to_uni_fold_flags(
12492                                      ender,
12493                                      (U8 *) s,
12494                                      &foldlen,
12495                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12496                                                         ? FOLD_FLAGS_NOMIX_ASCII
12497                                                         : 0));
12498                         s += foldlen;
12499
12500                         /* The loop increments <len> each time, as all but this
12501                          * path (and one other) through it add a single byte to
12502                          * the EXACTish node.  But this one has changed len to
12503                          * be the correct final value, so subtract one to
12504                          * cancel out the increment that follows */
12505                         len += foldlen - 1;
12506                     }
12507                     /* If this node only contains non-folding code points so
12508                      * far, see if this new one is also non-folding */
12509                     if (maybe_exact) {
12510                         if (folded != ender) {
12511                             maybe_exact = FALSE;
12512                         }
12513                         else {
12514                             /* Here the fold is the original; we have to check
12515                              * further to see if anything folds to it */
12516                             if (_invlist_contains_cp(PL_utf8_foldable,
12517                                                         ender))
12518                             {
12519                                 maybe_exact = FALSE;
12520                             }
12521                         }
12522                     }
12523                     ender = folded;
12524                 }
12525
12526                 if (next_is_quantifier) {
12527
12528                     /* Here, the next input is a quantifier, and to get here,
12529                      * the current character is the only one in the node.
12530                      * Also, here <len> doesn't include the final byte for this
12531                      * character */
12532                     len++;
12533                     goto loopdone;
12534                 }
12535
12536             } /* End of loop through literal characters */
12537
12538             /* Here we have either exhausted the input or ran out of room in
12539              * the node.  (If we encountered a character that can't be in the
12540              * node, transfer is made directly to <loopdone>, and so we
12541              * wouldn't have fallen off the end of the loop.)  In the latter
12542              * case, we artificially have to split the node into two, because
12543              * we just don't have enough space to hold everything.  This
12544              * creates a problem if the final character participates in a
12545              * multi-character fold in the non-final position, as a match that
12546              * should have occurred won't, due to the way nodes are matched,
12547              * and our artificial boundary.  So back off until we find a non-
12548              * problematic character -- one that isn't at the beginning or
12549              * middle of such a fold.  (Either it doesn't participate in any
12550              * folds, or appears only in the final position of all the folds it
12551              * does participate in.)  A better solution with far fewer false
12552              * positives, and that would fill the nodes more completely, would
12553              * be to actually have available all the multi-character folds to
12554              * test against, and to back-off only far enough to be sure that
12555              * this node isn't ending with a partial one.  <upper_parse> is set
12556              * further below (if we need to reparse the node) to include just
12557              * up through that final non-problematic character that this code
12558              * identifies, so when it is set to less than the full node, we can
12559              * skip the rest of this */
12560             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12561
12562                 const STRLEN full_len = len;
12563
12564                 assert(len >= MAX_NODE_STRING_SIZE);
12565
12566                 /* Here, <s> points to the final byte of the final character.
12567                  * Look backwards through the string until find a non-
12568                  * problematic character */
12569
12570                 if (! UTF) {
12571
12572                     /* This has no multi-char folds to non-UTF characters */
12573                     if (ASCII_FOLD_RESTRICTED) {
12574                         goto loopdone;
12575                     }
12576
12577                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12578                     len = s - s0 + 1;
12579                 }
12580                 else {
12581                     if (!  PL_NonL1NonFinalFold) {
12582                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12583                                         NonL1_Perl_Non_Final_Folds_invlist);
12584                     }
12585
12586                     /* Point to the first byte of the final character */
12587                     s = (char *) utf8_hop((U8 *) s, -1);
12588
12589                     while (s >= s0) {   /* Search backwards until find
12590                                            non-problematic char */
12591                         if (UTF8_IS_INVARIANT(*s)) {
12592
12593                             /* There are no ascii characters that participate
12594                              * in multi-char folds under /aa.  In EBCDIC, the
12595                              * non-ascii invariants are all control characters,
12596                              * so don't ever participate in any folds. */
12597                             if (ASCII_FOLD_RESTRICTED
12598                                 || ! IS_NON_FINAL_FOLD(*s))
12599                             {
12600                                 break;
12601                             }
12602                         }
12603                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12604                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12605                                                                   *s, *(s+1))))
12606                             {
12607                                 break;
12608                             }
12609                         }
12610                         else if (! _invlist_contains_cp(
12611                                         PL_NonL1NonFinalFold,
12612                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12613                         {
12614                             break;
12615                         }
12616
12617                         /* Here, the current character is problematic in that
12618                          * it does occur in the non-final position of some
12619                          * fold, so try the character before it, but have to
12620                          * special case the very first byte in the string, so
12621                          * we don't read outside the string */
12622                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12623                     } /* End of loop backwards through the string */
12624
12625                     /* If there were only problematic characters in the string,
12626                      * <s> will point to before s0, in which case the length
12627                      * should be 0, otherwise include the length of the
12628                      * non-problematic character just found */
12629                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12630                 }
12631
12632                 /* Here, have found the final character, if any, that is
12633                  * non-problematic as far as ending the node without splitting
12634                  * it across a potential multi-char fold.  <len> contains the
12635                  * number of bytes in the node up-to and including that
12636                  * character, or is 0 if there is no such character, meaning
12637                  * the whole node contains only problematic characters.  In
12638                  * this case, give up and just take the node as-is.  We can't
12639                  * do any better */
12640                 if (len == 0) {
12641                     len = full_len;
12642
12643                     /* If the node ends in an 's' we make sure it stays EXACTF,
12644                      * as if it turns into an EXACTFU, it could later get
12645                      * joined with another 's' that would then wrongly match
12646                      * the sharp s */
12647                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12648                     {
12649                         maybe_exactfu = FALSE;
12650                     }
12651                 } else {
12652
12653                     /* Here, the node does contain some characters that aren't
12654                      * problematic.  If one such is the final character in the
12655                      * node, we are done */
12656                     if (len == full_len) {
12657                         goto loopdone;
12658                     }
12659                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12660
12661                         /* If the final character is problematic, but the
12662                          * penultimate is not, back-off that last character to
12663                          * later start a new node with it */
12664                         p = oldp;
12665                         goto loopdone;
12666                     }
12667
12668                     /* Here, the final non-problematic character is earlier
12669                      * in the input than the penultimate character.  What we do
12670                      * is reparse from the beginning, going up only as far as
12671                      * this final ok one, thus guaranteeing that the node ends
12672                      * in an acceptable character.  The reason we reparse is
12673                      * that we know how far in the character is, but we don't
12674                      * know how to correlate its position with the input parse.
12675                      * An alternate implementation would be to build that
12676                      * correlation as we go along during the original parse,
12677                      * but that would entail extra work for every node, whereas
12678                      * this code gets executed only when the string is too
12679                      * large for the node, and the final two characters are
12680                      * problematic, an infrequent occurrence.  Yet another
12681                      * possible strategy would be to save the tail of the
12682                      * string, and the next time regatom is called, initialize
12683                      * with that.  The problem with this is that unless you
12684                      * back off one more character, you won't be guaranteed
12685                      * regatom will get called again, unless regbranch,
12686                      * regpiece ... are also changed.  If you do back off that
12687                      * extra character, so that there is input guaranteed to
12688                      * force calling regatom, you can't handle the case where
12689                      * just the first character in the node is acceptable.  I
12690                      * (khw) decided to try this method which doesn't have that
12691                      * pitfall; if performance issues are found, we can do a
12692                      * combination of the current approach plus that one */
12693                     upper_parse = len;
12694                     len = 0;
12695                     s = s0;
12696                     goto reparse;
12697                 }
12698             }   /* End of verifying node ends with an appropriate char */
12699
12700         loopdone:   /* Jumped to when encounters something that shouldn't be in
12701                        the node */
12702
12703             /* I (khw) don't know if you can get here with zero length, but the
12704              * old code handled this situation by creating a zero-length EXACT
12705              * node.  Might as well be NOTHING instead */
12706             if (len == 0) {
12707                 OP(ret) = NOTHING;
12708             }
12709             else {
12710                 if (FOLD) {
12711                     /* If 'maybe_exact' is still set here, means there are no
12712                      * code points in the node that participate in folds;
12713                      * similarly for 'maybe_exactfu' and code points that match
12714                      * differently depending on UTF8ness of the target string
12715                      * (for /u), or depending on locale for /l */
12716                     if (maybe_exact) {
12717                         OP(ret) = EXACT;
12718                     }
12719                     else if (maybe_exactfu) {
12720                         OP(ret) = EXACTFU;
12721                     }
12722                 }
12723                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12724                                            FALSE /* Don't look to see if could
12725                                                     be turned into an EXACT
12726                                                     node, as we have already
12727                                                     computed that */
12728                                           );
12729             }
12730
12731             RExC_parse = p - 1;
12732             Set_Node_Cur_Length(ret, parse_start);
12733             nextchar(pRExC_state);
12734             {
12735                 /* len is STRLEN which is unsigned, need to copy to signed */
12736                 IV iv = len;
12737                 if (iv < 0)
12738                     vFAIL("Internal disaster");
12739             }
12740
12741         } /* End of label 'defchar:' */
12742         break;
12743     } /* End of giant switch on input character */
12744
12745     return(ret);
12746 }
12747
12748 STATIC char *
12749 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12750 {
12751     /* Returns the next non-pattern-white space, non-comment character (the
12752      * latter only if 'recognize_comment is true) in the string p, which is
12753      * ended by RExC_end.  See also reg_skipcomment */
12754     const char *e = RExC_end;
12755
12756     PERL_ARGS_ASSERT_REGPATWS;
12757
12758     while (p < e) {
12759         STRLEN len;
12760         if ((len = is_PATWS_safe(p, e, UTF))) {
12761             p += len;
12762         }
12763         else if (recognize_comment && *p == '#') {
12764             p = reg_skipcomment(pRExC_state, p);
12765         }
12766         else
12767             break;
12768     }
12769     return p;
12770 }
12771
12772 STATIC void
12773 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12774 {
12775     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12776      * sets up the bitmap and any flags, removing those code points from the
12777      * inversion list, setting it to NULL should it become completely empty */
12778
12779     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12780     assert(PL_regkind[OP(node)] == ANYOF);
12781
12782     ANYOF_BITMAP_ZERO(node);
12783     if (*invlist_ptr) {
12784
12785         /* This gets set if we actually need to modify things */
12786         bool change_invlist = FALSE;
12787
12788         UV start, end;
12789
12790         /* Start looking through *invlist_ptr */
12791         invlist_iterinit(*invlist_ptr);
12792         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12793             UV high;
12794             int i;
12795
12796             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12797                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12798             }
12799             else if (end >= NUM_ANYOF_CODE_POINTS) {
12800                 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12801             }
12802
12803             /* Quit if are above what we should change */
12804             if (start >= NUM_ANYOF_CODE_POINTS) {
12805                 break;
12806             }
12807
12808             change_invlist = TRUE;
12809
12810             /* Set all the bits in the range, up to the max that we are doing */
12811             high = (end < NUM_ANYOF_CODE_POINTS - 1)
12812                    ? end
12813                    : NUM_ANYOF_CODE_POINTS - 1;
12814             for (i = start; i <= (int) high; i++) {
12815                 if (! ANYOF_BITMAP_TEST(node, i)) {
12816                     ANYOF_BITMAP_SET(node, i);
12817                 }
12818             }
12819         }
12820         invlist_iterfinish(*invlist_ptr);
12821
12822         /* Done with loop; remove any code points that are in the bitmap from
12823          * *invlist_ptr; similarly for code points above the bitmap if we have
12824          * a flag to match all of them anyways */
12825         if (change_invlist) {
12826             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12827         }
12828         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12829             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12830         }
12831
12832         /* If have completely emptied it, remove it completely */
12833         if (_invlist_len(*invlist_ptr) == 0) {
12834             SvREFCNT_dec_NN(*invlist_ptr);
12835             *invlist_ptr = NULL;
12836         }
12837     }
12838 }
12839
12840 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12841    Character classes ([:foo:]) can also be negated ([:^foo:]).
12842    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12843    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12844    but trigger failures because they are currently unimplemented. */
12845
12846 #define POSIXCC_DONE(c)   ((c) == ':')
12847 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12848 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12849
12850 PERL_STATIC_INLINE I32
12851 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12852 {
12853     I32 namedclass = OOB_NAMEDCLASS;
12854
12855     PERL_ARGS_ASSERT_REGPPOSIXCC;
12856
12857     if (value == '[' && RExC_parse + 1 < RExC_end &&
12858         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12859         POSIXCC(UCHARAT(RExC_parse)))
12860     {
12861         const char c = UCHARAT(RExC_parse);
12862         char* const s = RExC_parse++;
12863
12864         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12865             RExC_parse++;
12866         if (RExC_parse == RExC_end) {
12867             if (strict) {
12868
12869                 /* Try to give a better location for the error (than the end of
12870                  * the string) by looking for the matching ']' */
12871                 RExC_parse = s;
12872                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12873                     RExC_parse++;
12874                 }
12875                 vFAIL2("Unmatched '%c' in POSIX class", c);
12876             }
12877             /* Grandfather lone [:, [=, [. */
12878             RExC_parse = s;
12879         }
12880         else {
12881             const char* const t = RExC_parse++; /* skip over the c */
12882             assert(*t == c);
12883
12884             if (UCHARAT(RExC_parse) == ']') {
12885                 const char *posixcc = s + 1;
12886                 RExC_parse++; /* skip over the ending ] */
12887
12888                 if (*s == ':') {
12889                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12890                     const I32 skip = t - posixcc;
12891
12892                     /* Initially switch on the length of the name.  */
12893                     switch (skip) {
12894                     case 4:
12895                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12896                                                           this is the Perl \w
12897                                                         */
12898                             namedclass = ANYOF_WORDCHAR;
12899                         break;
12900                     case 5:
12901                         /* Names all of length 5.  */
12902                         /* alnum alpha ascii blank cntrl digit graph lower
12903                            print punct space upper  */
12904                         /* Offset 4 gives the best switch position.  */
12905                         switch (posixcc[4]) {
12906                         case 'a':
12907                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12908                                 namedclass = ANYOF_ALPHA;
12909                             break;
12910                         case 'e':
12911                             if (memEQ(posixcc, "spac", 4)) /* space */
12912                                 namedclass = ANYOF_PSXSPC;
12913                             break;
12914                         case 'h':
12915                             if (memEQ(posixcc, "grap", 4)) /* graph */
12916                                 namedclass = ANYOF_GRAPH;
12917                             break;
12918                         case 'i':
12919                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12920                                 namedclass = ANYOF_ASCII;
12921                             break;
12922                         case 'k':
12923                             if (memEQ(posixcc, "blan", 4)) /* blank */
12924                                 namedclass = ANYOF_BLANK;
12925                             break;
12926                         case 'l':
12927                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12928                                 namedclass = ANYOF_CNTRL;
12929                             break;
12930                         case 'm':
12931                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12932                                 namedclass = ANYOF_ALPHANUMERIC;
12933                             break;
12934                         case 'r':
12935                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12936                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12937                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12938                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12939                             break;
12940                         case 't':
12941                             if (memEQ(posixcc, "digi", 4)) /* digit */
12942                                 namedclass = ANYOF_DIGIT;
12943                             else if (memEQ(posixcc, "prin", 4)) /* print */
12944                                 namedclass = ANYOF_PRINT;
12945                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12946                                 namedclass = ANYOF_PUNCT;
12947                             break;
12948                         }
12949                         break;
12950                     case 6:
12951                         if (memEQ(posixcc, "xdigit", 6))
12952                             namedclass = ANYOF_XDIGIT;
12953                         break;
12954                     }
12955
12956                     if (namedclass == OOB_NAMEDCLASS)
12957                         vFAIL2utf8f(
12958                             "POSIX class [:%"UTF8f":] unknown",
12959                             UTF8fARG(UTF, t - s - 1, s + 1));
12960
12961                     /* The #defines are structured so each complement is +1 to
12962                      * the normal one */
12963                     if (complement) {
12964                         namedclass++;
12965                     }
12966                     assert (posixcc[skip] == ':');
12967                     assert (posixcc[skip+1] == ']');
12968                 } else if (!SIZE_ONLY) {
12969                     /* [[=foo=]] and [[.foo.]] are still future. */
12970
12971                     /* adjust RExC_parse so the warning shows after
12972                        the class closes */
12973                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12974                         RExC_parse++;
12975                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12976                 }
12977             } else {
12978                 /* Maternal grandfather:
12979                  * "[:" ending in ":" but not in ":]" */
12980                 if (strict) {
12981                     vFAIL("Unmatched '[' in POSIX class");
12982                 }
12983
12984                 /* Grandfather lone [:, [=, [. */
12985                 RExC_parse = s;
12986             }
12987         }
12988     }
12989
12990     return namedclass;
12991 }
12992
12993 STATIC bool
12994 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12995 {
12996     /* This applies some heuristics at the current parse position (which should
12997      * be at a '[') to see if what follows might be intended to be a [:posix:]
12998      * class.  It returns true if it really is a posix class, of course, but it
12999      * also can return true if it thinks that what was intended was a posix
13000      * class that didn't quite make it.
13001      *
13002      * It will return true for
13003      *      [:alphanumerics:
13004      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
13005      *                         ')' indicating the end of the (?[
13006      *      [:any garbage including %^&$ punctuation:]
13007      *
13008      * This is designed to be called only from S_handle_regex_sets; it could be
13009      * easily adapted to be called from the spot at the beginning of regclass()
13010      * that checks to see in a normal bracketed class if the surrounding []
13011      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
13012      * change long-standing behavior, so I (khw) didn't do that */
13013     char* p = RExC_parse + 1;
13014     char first_char = *p;
13015
13016     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13017
13018     assert(*(p - 1) == '[');
13019
13020     if (! POSIXCC(first_char)) {
13021         return FALSE;
13022     }
13023
13024     p++;
13025     while (p < RExC_end && isWORDCHAR(*p)) p++;
13026
13027     if (p >= RExC_end) {
13028         return FALSE;
13029     }
13030
13031     if (p - RExC_parse > 2    /* Got at least 1 word character */
13032         && (*p == first_char
13033             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13034     {
13035         return TRUE;
13036     }
13037
13038     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13039
13040     return (p
13041             && p - RExC_parse > 2 /* [:] evaluates to colon;
13042                                       [::] is a bad posix class. */
13043             && first_char == *(p - 1));
13044 }
13045
13046 STATIC regnode *
13047 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13048                     I32 *flagp, U32 depth,
13049                     char * const oregcomp_parse)
13050 {
13051     /* Handle the (?[...]) construct to do set operations */
13052
13053     U8 curchar;
13054     UV start, end;      /* End points of code point ranges */
13055     SV* result_string;
13056     char *save_end, *save_parse;
13057     SV* final;
13058     STRLEN len;
13059     regnode* node;
13060     AV* stack;
13061     const bool save_fold = FOLD;
13062
13063     GET_RE_DEBUG_FLAGS_DECL;
13064
13065     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13066
13067     if (LOC) {
13068         vFAIL("(?[...]) not valid in locale");
13069     }
13070     RExC_uni_semantics = 1;
13071
13072     /* This will return only an ANYOF regnode, or (unlikely) something smaller
13073      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
13074      * call regclass to handle '[]' so as to not have to reinvent its parsing
13075      * rules here (throwing away the size it computes each time).  And, we exit
13076      * upon an unescaped ']' that isn't one ending a regclass.  To do both
13077      * these things, we need to realize that something preceded by a backslash
13078      * is escaped, so we have to keep track of backslashes */
13079     if (PASS2) {
13080         Perl_ck_warner_d(aTHX_
13081             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13082             "The regex_sets feature is experimental" REPORT_LOCATION,
13083                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13084                 UTF8fARG(UTF,
13085                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13086                          RExC_precomp + (RExC_parse - RExC_precomp)));
13087     }
13088     else {
13089         UV depth = 0; /* how many nested (?[...]) constructs */
13090
13091         while (RExC_parse < RExC_end) {
13092             SV* current = NULL;
13093             RExC_parse = regpatws(pRExC_state, RExC_parse,
13094                                           TRUE); /* means recognize comments */
13095             switch (*RExC_parse) {
13096                 case '?':
13097                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
13098                     /* FALLTHROUGH */
13099                 default:
13100                     break;
13101                 case '\\':
13102                     /* Skip the next byte (which could cause us to end up in
13103                      * the middle of a UTF-8 character, but since none of those
13104                      * are confusable with anything we currently handle in this
13105                      * switch (invariants all), it's safe.  We'll just hit the
13106                      * default: case next time and keep on incrementing until
13107                      * we find one of the invariants we do handle. */
13108                     RExC_parse++;
13109                     break;
13110                 case '[':
13111                 {
13112                     /* If this looks like it is a [:posix:] class, leave the
13113                      * parse pointer at the '[' to fool regclass() into
13114                      * thinking it is part of a '[[:posix:]]'.  That function
13115                      * will use strict checking to force a syntax error if it
13116                      * doesn't work out to a legitimate class */
13117                     bool is_posix_class
13118                                     = could_it_be_a_POSIX_class(pRExC_state);
13119                     if (! is_posix_class) {
13120                         RExC_parse++;
13121                     }
13122
13123                     /* regclass() can only return RESTART_UTF8 if multi-char
13124                        folds are allowed.  */
13125                     if (!regclass(pRExC_state, flagp,depth+1,
13126                                   is_posix_class, /* parse the whole char
13127                                                      class only if not a
13128                                                      posix class */
13129                                   FALSE, /* don't allow multi-char folds */
13130                                   TRUE, /* silence non-portable warnings. */
13131                                   &current))
13132                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13133                               (UV) *flagp);
13134
13135                     /* function call leaves parse pointing to the ']', except
13136                      * if we faked it */
13137                     if (is_posix_class) {
13138                         RExC_parse--;
13139                     }
13140
13141                     SvREFCNT_dec(current);   /* In case it returned something */
13142                     break;
13143                 }
13144
13145                 case ']':
13146                     if (depth--) break;
13147                     RExC_parse++;
13148                     if (RExC_parse < RExC_end
13149                         && *RExC_parse == ')')
13150                     {
13151                         node = reganode(pRExC_state, ANYOF, 0);
13152                         RExC_size += ANYOF_SKIP;
13153                         nextchar(pRExC_state);
13154                         Set_Node_Length(node,
13155                                 RExC_parse - oregcomp_parse + 1); /* MJD */
13156                         return node;
13157                     }
13158                     goto no_close;
13159             }
13160             RExC_parse++;
13161         }
13162
13163         no_close:
13164         FAIL("Syntax error in (?[...])");
13165     }
13166
13167     /* Pass 2 only after this.  Everything in this construct is a
13168      * metacharacter.  Operands begin with either a '\' (for an escape
13169      * sequence), or a '[' for a bracketed character class.  Any other
13170      * character should be an operator, or parenthesis for grouping.  Both
13171      * types of operands are handled by calling regclass() to parse them.  It
13172      * is called with a parameter to indicate to return the computed inversion
13173      * list.  The parsing here is implemented via a stack.  Each entry on the
13174      * stack is a single character representing one of the operators, or the
13175      * '('; or else a pointer to an operand inversion list. */
13176
13177 #define IS_OPERAND(a)  (! SvIOK(a))
13178
13179     /* The stack starts empty.  It is a syntax error if the first thing parsed
13180      * is a binary operator; everything else is pushed on the stack.  When an
13181      * operand is parsed, the top of the stack is examined.  If it is a binary
13182      * operator, the item before it should be an operand, and both are replaced
13183      * by the result of doing that operation on the new operand and the one on
13184      * the stack.   Thus a sequence of binary operands is reduced to a single
13185      * one before the next one is parsed.
13186      *
13187      * A unary operator may immediately follow a binary in the input, for
13188      * example
13189      *      [a] + ! [b]
13190      * When an operand is parsed and the top of the stack is a unary operator,
13191      * the operation is performed, and then the stack is rechecked to see if
13192      * this new operand is part of a binary operation; if so, it is handled as
13193      * above.
13194      *
13195      * A '(' is simply pushed on the stack; it is valid only if the stack is
13196      * empty, or the top element of the stack is an operator or another '('
13197      * (for which the parenthesized expression will become an operand).  By the
13198      * time the corresponding ')' is parsed everything in between should have
13199      * been parsed and evaluated to a single operand (or else is a syntax
13200      * error), and is handled as a regular operand */
13201
13202     sv_2mortal((SV *)(stack = newAV()));
13203
13204     while (RExC_parse < RExC_end) {
13205         I32 top_index = av_tindex(stack);
13206         SV** top_ptr;
13207         SV* current = NULL;
13208
13209         /* Skip white space */
13210         RExC_parse = regpatws(pRExC_state, RExC_parse,
13211                                          TRUE /* means recognize comments */ );
13212         if (RExC_parse >= RExC_end) {
13213             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13214         }
13215         if ((curchar = UCHARAT(RExC_parse)) == ']') {
13216             break;
13217         }
13218
13219         switch (curchar) {
13220
13221             case '?':
13222                 if (av_tindex(stack) >= 0   /* This makes sure that we can
13223                                                safely subtract 1 from
13224                                                RExC_parse in the next clause.
13225                                                If we have something on the
13226                                                stack, we have parsed something
13227                                              */
13228                     && UCHARAT(RExC_parse - 1) == '('
13229                     && RExC_parse < RExC_end)
13230                 {
13231                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13232                      * This happens when we have some thing like
13233                      *
13234                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13235                      *   ...
13236                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13237                      *
13238                      * Here we would be handling the interpolated
13239                      * '$thai_or_lao'.  We handle this by a recursive call to
13240                      * ourselves which returns the inversion list the
13241                      * interpolated expression evaluates to.  We use the flags
13242                      * from the interpolated pattern. */
13243                     U32 save_flags = RExC_flags;
13244                     const char * const save_parse = ++RExC_parse;
13245
13246                     parse_lparen_question_flags(pRExC_state);
13247
13248                     if (RExC_parse == save_parse  /* Makes sure there was at
13249                                                      least one flag (or this
13250                                                      embedding wasn't compiled)
13251                                                    */
13252                         || RExC_parse >= RExC_end - 4
13253                         || UCHARAT(RExC_parse) != ':'
13254                         || UCHARAT(++RExC_parse) != '('
13255                         || UCHARAT(++RExC_parse) != '?'
13256                         || UCHARAT(++RExC_parse) != '[')
13257                     {
13258
13259                         /* In combination with the above, this moves the
13260                          * pointer to the point just after the first erroneous
13261                          * character (or if there are no flags, to where they
13262                          * should have been) */
13263                         if (RExC_parse >= RExC_end - 4) {
13264                             RExC_parse = RExC_end;
13265                         }
13266                         else if (RExC_parse != save_parse) {
13267                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13268                         }
13269                         vFAIL("Expecting '(?flags:(?[...'");
13270                     }
13271                     RExC_parse++;
13272                     (void) handle_regex_sets(pRExC_state, &current, flagp,
13273                                                     depth+1, oregcomp_parse);
13274
13275                     /* Here, 'current' contains the embedded expression's
13276                      * inversion list, and RExC_parse points to the trailing
13277                      * ']'; the next character should be the ')' which will be
13278                      * paired with the '(' that has been put on the stack, so
13279                      * the whole embedded expression reduces to '(operand)' */
13280                     RExC_parse++;
13281
13282                     RExC_flags = save_flags;
13283                     goto handle_operand;
13284                 }
13285                 /* FALLTHROUGH */
13286
13287             default:
13288                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13289                 vFAIL("Unexpected character");
13290
13291             case '\\':
13292                 /* regclass() can only return RESTART_UTF8 if multi-char
13293                    folds are allowed.  */
13294                 if (!regclass(pRExC_state, flagp,depth+1,
13295                               TRUE, /* means parse just the next thing */
13296                               FALSE, /* don't allow multi-char folds */
13297                               FALSE, /* don't silence non-portable warnings.  */
13298                               &current))
13299                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13300                           (UV) *flagp);
13301                 /* regclass() will return with parsing just the \ sequence,
13302                  * leaving the parse pointer at the next thing to parse */
13303                 RExC_parse--;
13304                 goto handle_operand;
13305
13306             case '[':   /* Is a bracketed character class */
13307             {
13308                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13309
13310                 if (! is_posix_class) {
13311                     RExC_parse++;
13312                 }
13313
13314                 /* regclass() can only return RESTART_UTF8 if multi-char
13315                    folds are allowed.  */
13316                 if(!regclass(pRExC_state, flagp,depth+1,
13317                              is_posix_class, /* parse the whole char class
13318                                                 only if not a posix class */
13319                              FALSE, /* don't allow multi-char folds */
13320                              FALSE, /* don't silence non-portable warnings.  */
13321                              &current))
13322                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13323                           (UV) *flagp);
13324                 /* function call leaves parse pointing to the ']', except if we
13325                  * faked it */
13326                 if (is_posix_class) {
13327                     RExC_parse--;
13328                 }
13329
13330                 goto handle_operand;
13331             }
13332
13333             case '&':
13334             case '|':
13335             case '+':
13336             case '-':
13337             case '^':
13338                 if (top_index < 0
13339                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13340                     || ! IS_OPERAND(*top_ptr))
13341                 {
13342                     RExC_parse++;
13343                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13344                 }
13345                 av_push(stack, newSVuv(curchar));
13346                 break;
13347
13348             case '!':
13349                 av_push(stack, newSVuv(curchar));
13350                 break;
13351
13352             case '(':
13353                 if (top_index >= 0) {
13354                     top_ptr = av_fetch(stack, top_index, FALSE);
13355                     assert(top_ptr);
13356                     if (IS_OPERAND(*top_ptr)) {
13357                         RExC_parse++;
13358                         vFAIL("Unexpected '(' with no preceding operator");
13359                     }
13360                 }
13361                 av_push(stack, newSVuv(curchar));
13362                 break;
13363
13364             case ')':
13365             {
13366                 SV* lparen;
13367                 if (top_index < 1
13368                     || ! (current = av_pop(stack))
13369                     || ! IS_OPERAND(current)
13370                     || ! (lparen = av_pop(stack))
13371                     || IS_OPERAND(lparen)
13372                     || SvUV(lparen) != '(')
13373                 {
13374                     SvREFCNT_dec(current);
13375                     RExC_parse++;
13376                     vFAIL("Unexpected ')'");
13377                 }
13378                 top_index -= 2;
13379                 SvREFCNT_dec_NN(lparen);
13380
13381                 /* FALLTHROUGH */
13382             }
13383
13384               handle_operand:
13385
13386                 /* Here, we have an operand to process, in 'current' */
13387
13388                 if (top_index < 0) {    /* Just push if stack is empty */
13389                     av_push(stack, current);
13390                 }
13391                 else {
13392                     SV* top = av_pop(stack);
13393                     SV *prev = NULL;
13394                     char current_operator;
13395
13396                     if (IS_OPERAND(top)) {
13397                         SvREFCNT_dec_NN(top);
13398                         SvREFCNT_dec_NN(current);
13399                         vFAIL("Operand with no preceding operator");
13400                     }
13401                     current_operator = (char) SvUV(top);
13402                     switch (current_operator) {
13403                         case '(':   /* Push the '(' back on followed by the new
13404                                        operand */
13405                             av_push(stack, top);
13406                             av_push(stack, current);
13407                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13408                                                    just after the 'break', so
13409                                                    it doesn't get wrongly freed
13410                                                  */
13411                             break;
13412
13413                         case '!':
13414                             _invlist_invert(current);
13415
13416                             /* Unlike binary operators, the top of the stack,
13417                              * now that this unary one has been popped off, may
13418                              * legally be an operator, and we now have operand
13419                              * for it. */
13420                             top_index--;
13421                             SvREFCNT_dec_NN(top);
13422                             goto handle_operand;
13423
13424                         case '&':
13425                             prev = av_pop(stack);
13426                             _invlist_intersection(prev,
13427                                                    current,
13428                                                    &current);
13429                             av_push(stack, current);
13430                             break;
13431
13432                         case '|':
13433                         case '+':
13434                             prev = av_pop(stack);
13435                             _invlist_union(prev, current, &current);
13436                             av_push(stack, current);
13437                             break;
13438
13439                         case '-':
13440                             prev = av_pop(stack);;
13441                             _invlist_subtract(prev, current, &current);
13442                             av_push(stack, current);
13443                             break;
13444
13445                         case '^':   /* The union minus the intersection */
13446                         {
13447                             SV* i = NULL;
13448                             SV* u = NULL;
13449                             SV* element;
13450
13451                             prev = av_pop(stack);
13452                             _invlist_union(prev, current, &u);
13453                             _invlist_intersection(prev, current, &i);
13454                             /* _invlist_subtract will overwrite current
13455                                 without freeing what it already contains */
13456                             element = current;
13457                             _invlist_subtract(u, i, &current);
13458                             av_push(stack, current);
13459                             SvREFCNT_dec_NN(i);
13460                             SvREFCNT_dec_NN(u);
13461                             SvREFCNT_dec_NN(element);
13462                             break;
13463                         }
13464
13465                         default:
13466                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13467                 }
13468                 SvREFCNT_dec_NN(top);
13469                 SvREFCNT_dec(prev);
13470             }
13471         }
13472
13473         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13474     }
13475
13476     if (av_tindex(stack) < 0   /* Was empty */
13477         || ((final = av_pop(stack)) == NULL)
13478         || ! IS_OPERAND(final)
13479         || av_tindex(stack) >= 0)  /* More left on stack */
13480     {
13481         vFAIL("Incomplete expression within '(?[ ])'");
13482     }
13483
13484     /* Here, 'final' is the resultant inversion list from evaluating the
13485      * expression.  Return it if so requested */
13486     if (return_invlist) {
13487         *return_invlist = final;
13488         return END;
13489     }
13490
13491     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13492      * expecting a string of ranges and individual code points */
13493     invlist_iterinit(final);
13494     result_string = newSVpvs("");
13495     while (invlist_iternext(final, &start, &end)) {
13496         if (start == end) {
13497             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13498         }
13499         else {
13500             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13501                                                      start,          end);
13502         }
13503     }
13504
13505     save_parse = RExC_parse;
13506     RExC_parse = SvPV(result_string, len);
13507     save_end = RExC_end;
13508     RExC_end = RExC_parse + len;
13509
13510     /* We turn off folding around the call, as the class we have constructed
13511      * already has all folding taken into consideration, and we don't want
13512      * regclass() to add to that */
13513     RExC_flags &= ~RXf_PMf_FOLD;
13514     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13515      */
13516     node = regclass(pRExC_state, flagp,depth+1,
13517                     FALSE, /* means parse the whole char class */
13518                     FALSE, /* don't allow multi-char folds */
13519                     TRUE, /* silence non-portable warnings.  The above may very
13520                              well have generated non-portable code points, but
13521                              they're valid on this machine */
13522                     NULL);
13523     if (!node)
13524         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13525                     PTR2UV(flagp));
13526     if (save_fold) {
13527         RExC_flags |= RXf_PMf_FOLD;
13528     }
13529     RExC_parse = save_parse + 1;
13530     RExC_end = save_end;
13531     SvREFCNT_dec_NN(final);
13532     SvREFCNT_dec_NN(result_string);
13533
13534     nextchar(pRExC_state);
13535     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13536     return node;
13537 }
13538 #undef IS_OPERAND
13539
13540 STATIC void
13541 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13542 {
13543     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13544      * innocent-looking character class, like /[ks]/i won't have to go out to
13545      * disk to find the possible matches.
13546      *
13547      * This should be called only for a Latin1-range code points, cp, which is
13548      * known to be involved in a simple fold with other code points above
13549      * Latin1.  It would give false results if /aa has been specified.
13550      * Multi-char folds are outside the scope of this, and must be handled
13551      * specially.
13552      *
13553      * XXX It would be better to generate these via regen, in case a new
13554      * version of the Unicode standard adds new mappings, though that is not
13555      * really likely, and may be caught by the default: case of the switch
13556      * below. */
13557
13558     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13559
13560     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13561
13562     switch (cp) {
13563         case 'k':
13564         case 'K':
13565           *invlist =
13566              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13567             break;
13568         case 's':
13569         case 'S':
13570           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13571             break;
13572         case MICRO_SIGN:
13573           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13574           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13575             break;
13576         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13577         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13578           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13579             break;
13580         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13581           *invlist = add_cp_to_invlist(*invlist,
13582                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13583             break;
13584         case LATIN_SMALL_LETTER_SHARP_S:
13585           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13586             break;
13587         default:
13588             /* Use deprecated warning to increase the chances of this being
13589              * output */
13590             if (PASS2) {
13591                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13592             }
13593             break;
13594     }
13595 }
13596
13597 STATIC AV *
13598 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13599 {
13600     /* This adds the string scalar <multi_string> to the array
13601      * <multi_char_matches>.  <multi_string> is known to have exactly
13602      * <cp_count> code points in it.  This is used when constructing a
13603      * bracketed character class and we find something that needs to match more
13604      * than a single character.
13605      *
13606      * <multi_char_matches> is actually an array of arrays.  Each top-level
13607      * element is an array that contains all the strings known so far that are
13608      * the same length.  And that length (in number of code points) is the same
13609      * as the index of the top-level array.  Hence, the [2] element is an
13610      * array, each element thereof is a string containing TWO code points;
13611      * while element [3] is for strings of THREE characters, and so on.  Since
13612      * this is for multi-char strings there can never be a [0] nor [1] element.
13613      *
13614      * When we rewrite the character class below, we will do so such that the
13615      * longest strings are written first, so that it prefers the longest
13616      * matching strings first.  This is done even if it turns out that any
13617      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
13618      * Christiansen has agreed that this is ok.  This makes the test for the
13619      * ligature 'ffi' come before the test for 'ff', for example */
13620
13621     AV* this_array;
13622     AV** this_array_ptr;
13623
13624     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13625
13626     if (! multi_char_matches) {
13627         multi_char_matches = newAV();
13628     }
13629
13630     if (av_exists(multi_char_matches, cp_count)) {
13631         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13632         this_array = *this_array_ptr;
13633     }
13634     else {
13635         this_array = newAV();
13636         av_store(multi_char_matches, cp_count,
13637                  (SV*) this_array);
13638     }
13639     av_push(this_array, multi_string);
13640
13641     return multi_char_matches;
13642 }
13643
13644 /* The names of properties whose definitions are not known at compile time are
13645  * stored in this SV, after a constant heading.  So if the length has been
13646  * changed since initialization, then there is a run-time definition. */
13647 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13648                                         (SvCUR(listsv) != initial_listsv_len)
13649
13650 STATIC regnode *
13651 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13652                  const bool stop_at_1,  /* Just parse the next thing, don't
13653                                            look for a full character class */
13654                  bool allow_multi_folds,
13655                  const bool silence_non_portable,   /* Don't output warnings
13656                                                        about too large
13657                                                        characters */
13658                  SV** ret_invlist)  /* Return an inversion list, not a node */
13659 {
13660     /* parse a bracketed class specification.  Most of these will produce an
13661      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13662      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13663      * under /i with multi-character folds: it will be rewritten following the
13664      * paradigm of this example, where the <multi-fold>s are characters which
13665      * fold to multiple character sequences:
13666      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13667      * gets effectively rewritten as:
13668      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13669      * reg() gets called (recursively) on the rewritten version, and this
13670      * function will return what it constructs.  (Actually the <multi-fold>s
13671      * aren't physically removed from the [abcdefghi], it's just that they are
13672      * ignored in the recursion by means of a flag:
13673      * <RExC_in_multi_char_class>.)
13674      *
13675      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13676      * characters, with the corresponding bit set if that character is in the
13677      * list.  For characters above this, a range list or swash is used.  There
13678      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13679      * determinable at compile time
13680      *
13681      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13682      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13683      */
13684
13685     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13686     IV range = 0;
13687     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13688     regnode *ret;
13689     STRLEN numlen;
13690     IV namedclass = OOB_NAMEDCLASS;
13691     char *rangebegin = NULL;
13692     bool need_class = 0;
13693     SV *listsv = NULL;
13694     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13695                                       than just initialized.  */
13696     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13697     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13698                                extended beyond the Latin1 range.  These have to
13699                                be kept separate from other code points for much
13700                                of this function because their handling  is
13701                                different under /i, and for most classes under
13702                                /d as well */
13703     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13704                                separate for a while from the non-complemented
13705                                versions because of complications with /d
13706                                matching */
13707     UV element_count = 0;   /* Number of distinct elements in the class.
13708                                Optimizations may be possible if this is tiny */
13709     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13710                                        character; used under /i */
13711     UV n;
13712     char * stop_ptr = RExC_end;    /* where to stop parsing */
13713     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13714                                                    space? */
13715     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13716
13717     /* Unicode properties are stored in a swash; this holds the current one
13718      * being parsed.  If this swash is the only above-latin1 component of the
13719      * character class, an optimization is to pass it directly on to the
13720      * execution engine.  Otherwise, it is set to NULL to indicate that there
13721      * are other things in the class that have to be dealt with at execution
13722      * time */
13723     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13724
13725     /* Set if a component of this character class is user-defined; just passed
13726      * on to the engine */
13727     bool has_user_defined_property = FALSE;
13728
13729     /* inversion list of code points this node matches only when the target
13730      * string is in UTF-8.  (Because is under /d) */
13731     SV* depends_list = NULL;
13732
13733     /* Inversion list of code points this node matches regardless of things
13734      * like locale, folding, utf8ness of the target string */
13735     SV* cp_list = NULL;
13736
13737     /* Like cp_list, but code points on this list need to be checked for things
13738      * that fold to/from them under /i */
13739     SV* cp_foldable_list = NULL;
13740
13741     /* Like cp_list, but code points on this list are valid only when the
13742      * runtime locale is UTF-8 */
13743     SV* only_utf8_locale_list = NULL;
13744
13745 #ifdef EBCDIC
13746     /* In a range, counts how many 0-2 of the ends of it came from literals,
13747      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13748     UV literal_endpoint = 0;
13749
13750     /* Is the range unicode? which means on a platform that isn't 1-1 native
13751      * to Unicode (i.e. non-ASCII), each code point in it should be considered
13752      * to be a Unicode value.  */
13753     bool unicode_range = FALSE;
13754 #endif
13755     bool invert = FALSE;    /* Is this class to be complemented */
13756
13757     bool warn_super = ALWAYS_WARN_SUPER;
13758
13759     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13760         case we need to change the emitted regop to an EXACT. */
13761     const char * orig_parse = RExC_parse;
13762     const SSize_t orig_size = RExC_size;
13763     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13764     GET_RE_DEBUG_FLAGS_DECL;
13765
13766     PERL_ARGS_ASSERT_REGCLASS;
13767 #ifndef DEBUGGING
13768     PERL_UNUSED_ARG(depth);
13769 #endif
13770
13771     DEBUG_PARSE("clas");
13772
13773     /* Assume we are going to generate an ANYOF node. */
13774     ret = reganode(pRExC_state, ANYOF, 0);
13775
13776     if (SIZE_ONLY) {
13777         RExC_size += ANYOF_SKIP;
13778         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13779     }
13780     else {
13781         ANYOF_FLAGS(ret) = 0;
13782
13783         RExC_emit += ANYOF_SKIP;
13784         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13785         initial_listsv_len = SvCUR(listsv);
13786         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13787     }
13788
13789     if (skip_white) {
13790         RExC_parse = regpatws(pRExC_state, RExC_parse,
13791                               FALSE /* means don't recognize comments */ );
13792     }
13793
13794     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13795         RExC_parse++;
13796         invert = TRUE;
13797         allow_multi_folds = FALSE;
13798         RExC_naughty++;
13799         if (skip_white) {
13800             RExC_parse = regpatws(pRExC_state, RExC_parse,
13801                                   FALSE /* means don't recognize comments */ );
13802         }
13803     }
13804
13805     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13806     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13807         const char *s = RExC_parse;
13808         const char  c = *s++;
13809
13810         while (isWORDCHAR(*s))
13811             s++;
13812         if (*s && c == *s && s[1] == ']') {
13813             SAVEFREESV(RExC_rx_sv);
13814             ckWARN3reg(s+2,
13815                        "POSIX syntax [%c %c] belongs inside character classes",
13816                        c, c);
13817             (void)ReREFCNT_inc(RExC_rx_sv);
13818         }
13819     }
13820
13821     /* If the caller wants us to just parse a single element, accomplish this
13822      * by faking the loop ending condition */
13823     if (stop_at_1 && RExC_end > RExC_parse) {
13824         stop_ptr = RExC_parse + 1;
13825     }
13826
13827     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13828     if (UCHARAT(RExC_parse) == ']')
13829         goto charclassloop;
13830
13831     while (1) {
13832         if  (RExC_parse >= stop_ptr) {
13833             break;
13834         }
13835
13836         if (skip_white) {
13837             RExC_parse = regpatws(pRExC_state, RExC_parse,
13838                                   FALSE /* means don't recognize comments */ );
13839         }
13840
13841         if  (UCHARAT(RExC_parse) == ']') {
13842             break;
13843         }
13844
13845     charclassloop:
13846
13847         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13848         save_value = value;
13849         save_prevvalue = prevvalue;
13850
13851         if (!range) {
13852             rangebegin = RExC_parse;
13853             element_count++;
13854         }
13855         if (UTF) {
13856             value = utf8n_to_uvchr((U8*)RExC_parse,
13857                                    RExC_end - RExC_parse,
13858                                    &numlen, UTF8_ALLOW_DEFAULT);
13859             RExC_parse += numlen;
13860         }
13861         else
13862             value = UCHARAT(RExC_parse++);
13863
13864         if (value == '['
13865             && RExC_parse < RExC_end
13866             && POSIXCC(UCHARAT(RExC_parse)))
13867         {
13868             namedclass = regpposixcc(pRExC_state, value, strict);
13869         }
13870         else if (value != '\\') {
13871 #ifdef EBCDIC
13872             literal_endpoint++;
13873 #endif
13874         }
13875         else {
13876             /* Is a backslash; get the code point of the char after it */
13877             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13878                 value = utf8n_to_uvchr((U8*)RExC_parse,
13879                                    RExC_end - RExC_parse,
13880                                    &numlen, UTF8_ALLOW_DEFAULT);
13881                 RExC_parse += numlen;
13882             }
13883             else
13884                 value = UCHARAT(RExC_parse++);
13885
13886             /* Some compilers cannot handle switching on 64-bit integer
13887              * values, therefore value cannot be an UV.  Yes, this will
13888              * be a problem later if we want switch on Unicode.
13889              * A similar issue a little bit later when switching on
13890              * namedclass. --jhi */
13891
13892             /* If the \ is escaping white space when white space is being
13893              * skipped, it means that that white space is wanted literally, and
13894              * is already in 'value'.  Otherwise, need to translate the escape
13895              * into what it signifies. */
13896             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13897
13898             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13899             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13900             case 's':   namedclass = ANYOF_SPACE;       break;
13901             case 'S':   namedclass = ANYOF_NSPACE;      break;
13902             case 'd':   namedclass = ANYOF_DIGIT;       break;
13903             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13904             case 'v':   namedclass = ANYOF_VERTWS;      break;
13905             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13906             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13907             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13908             case 'N':  /* Handle \N{NAME} in class */
13909                 {
13910                     SV *as_text;
13911                     STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13912                                                     flagp, depth, &as_text);
13913                     if (*flagp & RESTART_UTF8)
13914                         FAIL("panic: grok_bslash_N set RESTART_UTF8");
13915                     if (cp_count != 1) {    /* The typical case drops through */
13916                         assert(cp_count != (STRLEN) -1);
13917                         if (cp_count == 0) {
13918                             if (strict) {
13919                                 RExC_parse++;   /* Position after the "}" */
13920                                 vFAIL("Zero length \\N{}");
13921                             }
13922                             else if (PASS2) {
13923                                 ckWARNreg(RExC_parse,
13924                                         "Ignoring zero length \\N{} in character class");
13925                             }
13926                         }
13927                         else { /* cp_count > 1 */
13928                             if (! RExC_in_multi_char_class) {
13929                                 if (invert || range || *RExC_parse == '-') {
13930                                     if (strict) {
13931                                         RExC_parse--;
13932                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13933                                     }
13934                                     else if (PASS2) {
13935                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13936                                     }
13937                                 }
13938                                 else {
13939                                     multi_char_matches
13940                                         = add_multi_match(multi_char_matches,
13941                                                           as_text,
13942                                                           cp_count);
13943                                 }
13944                                 break; /* <value> contains the first code
13945                                           point. Drop out of the switch to
13946                                           process it */
13947                             }
13948                         } /* End of cp_count != 1 */
13949
13950                         /* This element should not be processed further in this
13951                          * class */
13952                         element_count--;
13953                         value = save_value;
13954                         prevvalue = save_prevvalue;
13955                         continue;   /* Back to top of loop to get next char */
13956                     }
13957                     /* Here, is a single code point, and <value> contains it */
13958 #ifdef EBCDIC
13959                     /* We consider named characters to be literal characters,
13960                      * and they are Unicode */
13961                     literal_endpoint++;
13962                     unicode_range = TRUE;
13963 #endif
13964                 }
13965                 break;
13966             case 'p':
13967             case 'P':
13968                 {
13969                 char *e;
13970
13971                 /* We will handle any undefined properties ourselves */
13972                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13973                                        /* And we actually would prefer to get
13974                                         * the straight inversion list of the
13975                                         * swash, since we will be accessing it
13976                                         * anyway, to save a little time */
13977                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13978
13979                 if (RExC_parse >= RExC_end)
13980                     vFAIL2("Empty \\%c{}", (U8)value);
13981                 if (*RExC_parse == '{') {
13982                     const U8 c = (U8)value;
13983                     e = strchr(RExC_parse++, '}');
13984                     if (!e)
13985                         vFAIL2("Missing right brace on \\%c{}", c);
13986                     while (isSPACE(*RExC_parse))
13987                         RExC_parse++;
13988                     if (e == RExC_parse)
13989                         vFAIL2("Empty \\%c{}", c);
13990                     n = e - RExC_parse;
13991                     while (isSPACE(*(RExC_parse + n - 1)))
13992                         n--;
13993                 }
13994                 else {
13995                     e = RExC_parse;
13996                     n = 1;
13997                 }
13998                 if (!SIZE_ONLY) {
13999                     SV* invlist;
14000                     char* name;
14001
14002                     if (UCHARAT(RExC_parse) == '^') {
14003                          RExC_parse++;
14004                          n--;
14005                          /* toggle.  (The rhs xor gets the single bit that
14006                           * differs between P and p; the other xor inverts just
14007                           * that bit) */
14008                          value ^= 'P' ^ 'p';
14009
14010                          while (isSPACE(*RExC_parse)) {
14011                               RExC_parse++;
14012                               n--;
14013                          }
14014                     }
14015                     /* Try to get the definition of the property into
14016                      * <invlist>.  If /i is in effect, the effective property
14017                      * will have its name be <__NAME_i>.  The design is
14018                      * discussed in commit
14019                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14020                     name = savepv(Perl_form(aTHX_
14021                                           "%s%.*s%s\n",
14022                                           (FOLD) ? "__" : "",
14023                                           (int)n,
14024                                           RExC_parse,
14025                                           (FOLD) ? "_i" : ""
14026                                 ));
14027
14028                     /* Look up the property name, and get its swash and
14029                      * inversion list, if the property is found  */
14030                     if (swash) {
14031                         SvREFCNT_dec_NN(swash);
14032                     }
14033                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
14034                                              1, /* binary */
14035                                              0, /* not tr/// */
14036                                              NULL, /* No inversion list */
14037                                              &swash_init_flags
14038                                             );
14039                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14040                         HV* curpkg = (IN_PERL_COMPILETIME)
14041                                       ? PL_curstash
14042                                       : CopSTASH(PL_curcop);
14043                         if (swash) {
14044                             SvREFCNT_dec_NN(swash);
14045                             swash = NULL;
14046                         }
14047
14048                         /* Here didn't find it.  It could be a user-defined
14049                          * property that will be available at run-time.  If we
14050                          * accept only compile-time properties, is an error;
14051                          * otherwise add it to the list for run-time look up */
14052                         if (ret_invlist) {
14053                             RExC_parse = e + 1;
14054                             vFAIL2utf8f(
14055                                 "Property '%"UTF8f"' is unknown",
14056                                 UTF8fARG(UTF, n, name));
14057                         }
14058
14059                         /* If the property name doesn't already have a package
14060                          * name, add the current one to it so that it can be
14061                          * referred to outside it. [perl #121777] */
14062                         if (curpkg && ! instr(name, "::")) {
14063                             char* pkgname = HvNAME(curpkg);
14064                             if (strNE(pkgname, "main")) {
14065                                 char* full_name = Perl_form(aTHX_
14066                                                             "%s::%s",
14067                                                             pkgname,
14068                                                             name);
14069                                 n = strlen(full_name);
14070                                 Safefree(name);
14071                                 name = savepvn(full_name, n);
14072                             }
14073                         }
14074                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14075                                         (value == 'p' ? '+' : '!'),
14076                                         UTF8fARG(UTF, n, name));
14077                         has_user_defined_property = TRUE;
14078
14079                         /* We don't know yet, so have to assume that the
14080                          * property could match something in the Latin1 range,
14081                          * hence something that isn't utf8.  Note that this
14082                          * would cause things in <depends_list> to match
14083                          * inappropriately, except that any \p{}, including
14084                          * this one forces Unicode semantics, which means there
14085                          * is no <depends_list> */
14086                         ANYOF_FLAGS(ret)
14087                                       |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14088                     }
14089                     else {
14090
14091                         /* Here, did get the swash and its inversion list.  If
14092                          * the swash is from a user-defined property, then this
14093                          * whole character class should be regarded as such */
14094                         if (swash_init_flags
14095                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14096                         {
14097                             has_user_defined_property = TRUE;
14098                         }
14099                         else if
14100                             /* We warn on matching an above-Unicode code point
14101                              * if the match would return true, except don't
14102                              * warn for \p{All}, which has exactly one element
14103                              * = 0 */
14104                             (_invlist_contains_cp(invlist, 0x110000)
14105                                 && (! (_invlist_len(invlist) == 1
14106                                        && *invlist_array(invlist) == 0)))
14107                         {
14108                             warn_super = TRUE;
14109                         }
14110
14111
14112                         /* Invert if asking for the complement */
14113                         if (value == 'P') {
14114                             _invlist_union_complement_2nd(properties,
14115                                                           invlist,
14116                                                           &properties);
14117
14118                             /* The swash can't be used as-is, because we've
14119                              * inverted things; delay removing it to here after
14120                              * have copied its invlist above */
14121                             SvREFCNT_dec_NN(swash);
14122                             swash = NULL;
14123                         }
14124                         else {
14125                             _invlist_union(properties, invlist, &properties);
14126                         }
14127                     }
14128                     Safefree(name);
14129                 }
14130                 RExC_parse = e + 1;
14131                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
14132                                                 named */
14133
14134                 /* \p means they want Unicode semantics */
14135                 RExC_uni_semantics = 1;
14136                 }
14137                 break;
14138             case 'n':   value = '\n';                   break;
14139             case 'r':   value = '\r';                   break;
14140             case 't':   value = '\t';                   break;
14141             case 'f':   value = '\f';                   break;
14142             case 'b':   value = '\b';                   break;
14143             case 'e':   value = ESC_NATIVE;             break;
14144             case 'a':   value = '\a';                   break;
14145             case 'o':
14146                 RExC_parse--;   /* function expects to be pointed at the 'o' */
14147                 {
14148                     const char* error_msg;
14149                     bool valid = grok_bslash_o(&RExC_parse,
14150                                                &value,
14151                                                &error_msg,
14152                                                PASS2,   /* warnings only in
14153                                                            pass 2 */
14154                                                strict,
14155                                                silence_non_portable,
14156                                                UTF);
14157                     if (! valid) {
14158                         vFAIL(error_msg);
14159                     }
14160                 }
14161                 if (IN_ENCODING && value < 0x100) {
14162                     goto recode_encoding;
14163                 }
14164                 break;
14165             case 'x':
14166                 RExC_parse--;   /* function expects to be pointed at the 'x' */
14167                 {
14168                     const char* error_msg;
14169                     bool valid = grok_bslash_x(&RExC_parse,
14170                                                &value,
14171                                                &error_msg,
14172                                                PASS2, /* Output warnings */
14173                                                strict,
14174                                                silence_non_portable,
14175                                                UTF);
14176                     if (! valid) {
14177                         vFAIL(error_msg);
14178                     }
14179                 }
14180                 if (IN_ENCODING && value < 0x100)
14181                     goto recode_encoding;
14182                 break;
14183             case 'c':
14184                 value = grok_bslash_c(*RExC_parse++, PASS2);
14185                 break;
14186             case '0': case '1': case '2': case '3': case '4':
14187             case '5': case '6': case '7':
14188                 {
14189                     /* Take 1-3 octal digits */
14190                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14191                     numlen = (strict) ? 4 : 3;
14192                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14193                     RExC_parse += numlen;
14194                     if (numlen != 3) {
14195                         if (strict) {
14196                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14197                             vFAIL("Need exactly 3 octal digits");
14198                         }
14199                         else if (! SIZE_ONLY /* like \08, \178 */
14200                                  && numlen < 3
14201                                  && RExC_parse < RExC_end
14202                                  && isDIGIT(*RExC_parse)
14203                                  && ckWARN(WARN_REGEXP))
14204                         {
14205                             SAVEFREESV(RExC_rx_sv);
14206                             reg_warn_non_literal_string(
14207                                  RExC_parse + 1,
14208                                  form_short_octal_warning(RExC_parse, numlen));
14209                             (void)ReREFCNT_inc(RExC_rx_sv);
14210                         }
14211                     }
14212                     if (IN_ENCODING && value < 0x100)
14213                         goto recode_encoding;
14214                     break;
14215                 }
14216             recode_encoding:
14217                 if (! RExC_override_recoding) {
14218                     SV* enc = _get_encoding();
14219                     value = reg_recode((const char)(U8)value, &enc);
14220                     if (!enc) {
14221                         if (strict) {
14222                             vFAIL("Invalid escape in the specified encoding");
14223                         }
14224                         else if (PASS2) {
14225                             ckWARNreg(RExC_parse,
14226                                   "Invalid escape in the specified encoding");
14227                         }
14228                     }
14229                     break;
14230                 }
14231             default:
14232                 /* Allow \_ to not give an error */
14233                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14234                     if (strict) {
14235                         vFAIL2("Unrecognized escape \\%c in character class",
14236                                (int)value);
14237                     }
14238                     else {
14239                         SAVEFREESV(RExC_rx_sv);
14240                         ckWARN2reg(RExC_parse,
14241                             "Unrecognized escape \\%c in character class passed through",
14242                             (int)value);
14243                         (void)ReREFCNT_inc(RExC_rx_sv);
14244                     }
14245                 }
14246                 break;
14247             }   /* End of switch on char following backslash */
14248         } /* end of handling backslash escape sequences */
14249
14250         /* Here, we have the current token in 'value' */
14251
14252         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14253             U8 classnum;
14254
14255             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
14256              * literal, as is the character that began the false range, i.e.
14257              * the 'a' in the examples */
14258             if (range) {
14259                 if (!SIZE_ONLY) {
14260                     const int w = (RExC_parse >= rangebegin)
14261                                   ? RExC_parse - rangebegin
14262                                   : 0;
14263                     if (strict) {
14264                         vFAIL2utf8f(
14265                             "False [] range \"%"UTF8f"\"",
14266                             UTF8fARG(UTF, w, rangebegin));
14267                     }
14268                     else {
14269                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14270                         ckWARN2reg(RExC_parse,
14271                             "False [] range \"%"UTF8f"\"",
14272                             UTF8fARG(UTF, w, rangebegin));
14273                         (void)ReREFCNT_inc(RExC_rx_sv);
14274                         cp_list = add_cp_to_invlist(cp_list, '-');
14275                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14276                                                              prevvalue);
14277                     }
14278                 }
14279
14280                 range = 0; /* this was not a true range */
14281                 element_count += 2; /* So counts for three values */
14282             }
14283
14284             classnum = namedclass_to_classnum(namedclass);
14285
14286             if (LOC && namedclass < ANYOF_POSIXL_MAX
14287 #ifndef HAS_ISASCII
14288                 && classnum != _CC_ASCII
14289 #endif
14290             ) {
14291                 /* What the Posix classes (like \w, [:space:]) match in locale
14292                  * isn't knowable under locale until actual match time.  Room
14293                  * must be reserved (one time per outer bracketed class) to
14294                  * store such classes.  The space will contain a bit for each
14295                  * named class that is to be matched against.  This isn't
14296                  * needed for \p{} and pseudo-classes, as they are not affected
14297                  * by locale, and hence are dealt with separately */
14298                 if (! need_class) {
14299                     need_class = 1;
14300                     if (SIZE_ONLY) {
14301                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14302                     }
14303                     else {
14304                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14305                     }
14306                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14307                     ANYOF_POSIXL_ZERO(ret);
14308                 }
14309
14310                 /* Coverity thinks it is possible for this to be negative; both
14311                  * jhi and khw think it's not, but be safer */
14312                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14313                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14314
14315                 /* See if it already matches the complement of this POSIX
14316                  * class */
14317                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14318                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14319                                                             ? -1
14320                                                             : 1)))
14321                 {
14322                     posixl_matches_all = TRUE;
14323                     break;  /* No need to continue.  Since it matches both
14324                                e.g., \w and \W, it matches everything, and the
14325                                bracketed class can be optimized into qr/./s */
14326                 }
14327
14328                 /* Add this class to those that should be checked at runtime */
14329                 ANYOF_POSIXL_SET(ret, namedclass);
14330
14331                 /* The above-Latin1 characters are not subject to locale rules.
14332                  * Just add them, in the second pass, to the
14333                  * unconditionally-matched list */
14334                 if (! SIZE_ONLY) {
14335                     SV* scratch_list = NULL;
14336
14337                     /* Get the list of the above-Latin1 code points this
14338                      * matches */
14339                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14340                                           PL_XPosix_ptrs[classnum],
14341
14342                                           /* Odd numbers are complements, like
14343                                            * NDIGIT, NASCII, ... */
14344                                           namedclass % 2 != 0,
14345                                           &scratch_list);
14346                     /* Checking if 'cp_list' is NULL first saves an extra
14347                      * clone.  Its reference count will be decremented at the
14348                      * next union, etc, or if this is the only instance, at the
14349                      * end of the routine */
14350                     if (! cp_list) {
14351                         cp_list = scratch_list;
14352                     }
14353                     else {
14354                         _invlist_union(cp_list, scratch_list, &cp_list);
14355                         SvREFCNT_dec_NN(scratch_list);
14356                     }
14357                     continue;   /* Go get next character */
14358                 }
14359             }
14360             else if (! SIZE_ONLY) {
14361
14362                 /* Here, not in pass1 (in that pass we skip calculating the
14363                  * contents of this class), and is /l, or is a POSIX class for
14364                  * which /l doesn't matter (or is a Unicode property, which is
14365                  * skipped here). */
14366                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
14367                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14368
14369                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
14370                          * nor /l make a difference in what these match,
14371                          * therefore we just add what they match to cp_list. */
14372                         if (classnum != _CC_VERTSPACE) {
14373                             assert(   namedclass == ANYOF_HORIZWS
14374                                    || namedclass == ANYOF_NHORIZWS);
14375
14376                             /* It turns out that \h is just a synonym for
14377                              * XPosixBlank */
14378                             classnum = _CC_BLANK;
14379                         }
14380
14381                         _invlist_union_maybe_complement_2nd(
14382                                 cp_list,
14383                                 PL_XPosix_ptrs[classnum],
14384                                 namedclass % 2 != 0,    /* Complement if odd
14385                                                           (NHORIZWS, NVERTWS)
14386                                                         */
14387                                 &cp_list);
14388                     }
14389                 }
14390                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
14391                            complement and use nposixes */
14392                     SV** posixes_ptr = namedclass % 2 == 0
14393                                        ? &posixes
14394                                        : &nposixes;
14395                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
14396                     _invlist_union_maybe_complement_2nd(
14397                                                      *posixes_ptr,
14398                                                      *source_ptr,
14399                                                      namedclass % 2 != 0,
14400                                                      posixes_ptr);
14401                 }
14402             }
14403         } /* end of namedclass \blah */
14404
14405         if (skip_white) {
14406             RExC_parse = regpatws(pRExC_state, RExC_parse,
14407                                 FALSE /* means don't recognize comments */ );
14408         }
14409
14410         /* If 'range' is set, 'value' is the ending of a range--check its
14411          * validity.  (If value isn't a single code point in the case of a
14412          * range, we should have figured that out above in the code that
14413          * catches false ranges).  Later, we will handle each individual code
14414          * point in the range.  If 'range' isn't set, this could be the
14415          * beginning of a range, so check for that by looking ahead to see if
14416          * the next real character to be processed is the range indicator--the
14417          * minus sign */
14418
14419         if (range) {
14420 #ifdef EBCDIC
14421             /* For unicode ranges, we have to test that the Unicode as opposed
14422              * to the native values are not decreasing.  (Above 255, and there
14423              * is no difference between native and Unicode) */
14424             if (unicode_range && prevvalue < 255 && value < 255) {
14425                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14426                     goto backwards_range;
14427                 }
14428             }
14429             else
14430 #endif
14431             if (prevvalue > value) /* b-a */ {
14432                 int w;
14433 #ifdef EBCDIC
14434               backwards_range:
14435 #endif
14436                 w = RExC_parse - rangebegin;
14437                 vFAIL2utf8f(
14438                     "Invalid [] range \"%"UTF8f"\"",
14439                     UTF8fARG(UTF, w, rangebegin));
14440                 range = 0; /* not a valid range */
14441             }
14442         }
14443         else {
14444             prevvalue = value; /* save the beginning of the potential range */
14445             if (! stop_at_1     /* Can't be a range if parsing just one thing */
14446                 && *RExC_parse == '-')
14447             {
14448                 char* next_char_ptr = RExC_parse + 1;
14449                 if (skip_white) {   /* Get the next real char after the '-' */
14450                     next_char_ptr = regpatws(pRExC_state,
14451                                              RExC_parse + 1,
14452                                              FALSE); /* means don't recognize
14453                                                         comments */
14454                 }
14455
14456                 /* If the '-' is at the end of the class (just before the ']',
14457                  * it is a literal minus; otherwise it is a range */
14458                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14459                     RExC_parse = next_char_ptr;
14460
14461                     /* a bad range like \w-, [:word:]- ? */
14462                     if (namedclass > OOB_NAMEDCLASS) {
14463                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14464                             const int w = RExC_parse >= rangebegin
14465                                           ?  RExC_parse - rangebegin
14466                                           : 0;
14467                             if (strict) {
14468                                 vFAIL4("False [] range \"%*.*s\"",
14469                                     w, w, rangebegin);
14470                             }
14471                             else if (PASS2) {
14472                                 vWARN4(RExC_parse,
14473                                     "False [] range \"%*.*s\"",
14474                                     w, w, rangebegin);
14475                             }
14476                         }
14477                         if (!SIZE_ONLY) {
14478                             cp_list = add_cp_to_invlist(cp_list, '-');
14479                         }
14480                         element_count++;
14481                     } else
14482                         range = 1;      /* yeah, it's a range! */
14483                     continue;   /* but do it the next time */
14484                 }
14485             }
14486         }
14487
14488         if (namedclass > OOB_NAMEDCLASS) {
14489             continue;
14490         }
14491
14492         /* Here, we have a single value this time through the loop, and
14493          * <prevvalue> is the beginning of the range, if any; or <value> if
14494          * not. */
14495
14496         /* non-Latin1 code point implies unicode semantics.  Must be set in
14497          * pass1 so is there for the whole of pass 2 */
14498         if (value > 255) {
14499             RExC_uni_semantics = 1;
14500         }
14501
14502         /* Ready to process either the single value, or the completed range.
14503          * For single-valued non-inverted ranges, we consider the possibility
14504          * of multi-char folds.  (We made a conscious decision to not do this
14505          * for the other cases because it can often lead to non-intuitive
14506          * results.  For example, you have the peculiar case that:
14507          *  "s s" =~ /^[^\xDF]+$/i => Y
14508          *  "ss"  =~ /^[^\xDF]+$/i => N
14509          *
14510          * See [perl #89750] */
14511         if (FOLD && allow_multi_folds && value == prevvalue) {
14512             if (value == LATIN_SMALL_LETTER_SHARP_S
14513                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14514                                                         value)))
14515             {
14516                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14517
14518                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14519                 STRLEN foldlen;
14520
14521                 UV folded = _to_uni_fold_flags(
14522                                 value,
14523                                 foldbuf,
14524                                 &foldlen,
14525                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14526                                                    ? FOLD_FLAGS_NOMIX_ASCII
14527                                                    : 0)
14528                                 );
14529
14530                 /* Here, <folded> should be the first character of the
14531                  * multi-char fold of <value>, with <foldbuf> containing the
14532                  * whole thing.  But, if this fold is not allowed (because of
14533                  * the flags), <fold> will be the same as <value>, and should
14534                  * be processed like any other character, so skip the special
14535                  * handling */
14536                 if (folded != value) {
14537
14538                     /* Skip if we are recursed, currently parsing the class
14539                      * again.  Otherwise add this character to the list of
14540                      * multi-char folds. */
14541                     if (! RExC_in_multi_char_class) {
14542                         STRLEN cp_count = utf8_length(foldbuf,
14543                                                       foldbuf + foldlen);
14544                         SV* multi_fold = sv_2mortal(newSVpvs(""));
14545
14546                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14547
14548                         multi_char_matches
14549                                         = add_multi_match(multi_char_matches,
14550                                                           multi_fold,
14551                                                           cp_count);
14552
14553                     }
14554
14555                     /* This element should not be processed further in this
14556                      * class */
14557                     element_count--;
14558                     value = save_value;
14559                     prevvalue = save_prevvalue;
14560                     continue;
14561                 }
14562             }
14563         }
14564
14565         /* Deal with this element of the class */
14566         if (! SIZE_ONLY) {
14567 #ifndef EBCDIC
14568             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14569                                                      prevvalue, value);
14570 #else
14571             /* On non-ASCII platforms, for ranges that span all of 0..255, and
14572              * ones that don't require special handling, we can just add the
14573              * range like we do for ASCII platforms */
14574             if ((UNLIKELY(prevvalue == 0) && value >= 255)
14575                 || ! (prevvalue < 256
14576                       && (unicode_range
14577                           || (literal_endpoint == 2
14578                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14579                                   || (isUPPER_A(prevvalue)
14580                                       && isUPPER_A(value)))))))
14581             {
14582                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14583                                                          prevvalue, value);
14584             }
14585             else {
14586                 /* Here, requires special handling.  This can be because it is
14587                  * a range whose code points are considered to be Unicode, and
14588                  * so must be individually translated into native, or because
14589                  * its a subrange of 'A-Z' or 'a-z' which each aren't
14590                  * contiguous in EBCDIC, but we have defined them to include
14591                  * only the "expected" upper or lower case ASCII alphabetics.
14592                  * Subranges above 255 are the same in native and Unicode, so
14593                  * can be added as a range */
14594                 U8 start = NATIVE_TO_LATIN1(prevvalue);
14595                 unsigned j;
14596                 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
14597                 for (j = start; j <= end; j++) {
14598                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
14599                 }
14600                 if (value > 255) {
14601                     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14602                                                              256, value);
14603                 }
14604             }
14605 #endif
14606         }
14607
14608         range = 0; /* this range (if it was one) is done now */
14609     } /* End of loop through all the text within the brackets */
14610
14611     /* If anything in the class expands to more than one character, we have to
14612      * deal with them by building up a substitute parse string, and recursively
14613      * calling reg() on it, instead of proceeding */
14614     if (multi_char_matches) {
14615         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14616         I32 cp_count;
14617         STRLEN len;
14618         char *save_end = RExC_end;
14619         char *save_parse = RExC_parse;
14620         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14621                                        a "|" */
14622         I32 reg_flags;
14623
14624         assert(! invert);
14625 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14626            because too confusing */
14627         if (invert) {
14628             sv_catpv(substitute_parse, "(?:");
14629         }
14630 #endif
14631
14632         /* Look at the longest folds first */
14633         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14634
14635             if (av_exists(multi_char_matches, cp_count)) {
14636                 AV** this_array_ptr;
14637                 SV* this_sequence;
14638
14639                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14640                                                  cp_count, FALSE);
14641                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14642                                                                 &PL_sv_undef)
14643                 {
14644                     if (! first_time) {
14645                         sv_catpv(substitute_parse, "|");
14646                     }
14647                     first_time = FALSE;
14648
14649                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14650                 }
14651             }
14652         }
14653
14654         /* If the character class contains anything else besides these
14655          * multi-character folds, have to include it in recursive parsing */
14656         if (element_count) {
14657             sv_catpv(substitute_parse, "|[");
14658             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14659             sv_catpv(substitute_parse, "]");
14660         }
14661
14662         sv_catpv(substitute_parse, ")");
14663 #if 0
14664         if (invert) {
14665             /* This is a way to get the parse to skip forward a whole named
14666              * sequence instead of matching the 2nd character when it fails the
14667              * first */
14668             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14669         }
14670 #endif
14671
14672         RExC_parse = SvPV(substitute_parse, len);
14673         RExC_end = RExC_parse + len;
14674         RExC_in_multi_char_class = 1;
14675         RExC_override_recoding = 1;
14676         RExC_emit = (regnode *)orig_emit;
14677
14678         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14679
14680         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14681
14682         RExC_parse = save_parse;
14683         RExC_end = save_end;
14684         RExC_in_multi_char_class = 0;
14685         RExC_override_recoding = 0;
14686         SvREFCNT_dec_NN(multi_char_matches);
14687         return ret;
14688     }
14689
14690     /* Here, we've gone through the entire class and dealt with multi-char
14691      * folds.  We are now in a position that we can do some checks to see if we
14692      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14693      * Currently we only do two checks:
14694      * 1) is in the unlikely event that the user has specified both, eg. \w and
14695      *    \W under /l, then the class matches everything.  (This optimization
14696      *    is done only to make the optimizer code run later work.)
14697      * 2) if the character class contains only a single element (including a
14698      *    single range), we see if there is an equivalent node for it.
14699      * Other checks are possible */
14700     if (! ret_invlist   /* Can't optimize if returning the constructed
14701                            inversion list */
14702         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14703     {
14704         U8 op = END;
14705         U8 arg = 0;
14706
14707         if (UNLIKELY(posixl_matches_all)) {
14708             op = SANY;
14709         }
14710         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14711                                                    \w or [:digit:] or \p{foo}
14712                                                  */
14713
14714             /* All named classes are mapped into POSIXish nodes, with its FLAG
14715              * argument giving which class it is */
14716             switch ((I32)namedclass) {
14717                 case ANYOF_UNIPROP:
14718                     break;
14719
14720                 /* These don't depend on the charset modifiers.  They always
14721                  * match under /u rules */
14722                 case ANYOF_NHORIZWS:
14723                 case ANYOF_HORIZWS:
14724                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14725                     /* FALLTHROUGH */
14726
14727                 case ANYOF_NVERTWS:
14728                 case ANYOF_VERTWS:
14729                     op = POSIXU;
14730                     goto join_posix;
14731
14732                 /* The actual POSIXish node for all the rest depends on the
14733                  * charset modifier.  The ones in the first set depend only on
14734                  * ASCII or, if available on this platform, locale */
14735                 case ANYOF_ASCII:
14736                 case ANYOF_NASCII:
14737 #ifdef HAS_ISASCII
14738                     op = (LOC) ? POSIXL : POSIXA;
14739 #else
14740                     op = POSIXA;
14741 #endif
14742                     goto join_posix;
14743
14744                 case ANYOF_NCASED:
14745                 case ANYOF_LOWER:
14746                 case ANYOF_NLOWER:
14747                 case ANYOF_UPPER:
14748                 case ANYOF_NUPPER:
14749                     /* under /a could be alpha */
14750                     if (FOLD) {
14751                         if (ASCII_RESTRICTED) {
14752                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14753                         }
14754                         else if (! LOC) {
14755                             break;
14756                         }
14757                     }
14758                     /* FALLTHROUGH */
14759
14760                 /* The rest have more possibilities depending on the charset.
14761                  * We take advantage of the enum ordering of the charset
14762                  * modifiers to get the exact node type, */
14763                 default:
14764                     op = POSIXD + get_regex_charset(RExC_flags);
14765                     if (op > POSIXA) { /* /aa is same as /a */
14766                         op = POSIXA;
14767                     }
14768
14769                 join_posix:
14770                     /* The odd numbered ones are the complements of the
14771                      * next-lower even number one */
14772                     if (namedclass % 2 == 1) {
14773                         invert = ! invert;
14774                         namedclass--;
14775                     }
14776                     arg = namedclass_to_classnum(namedclass);
14777                     break;
14778             }
14779         }
14780         else if (value == prevvalue) {
14781
14782             /* Here, the class consists of just a single code point */
14783
14784             if (invert) {
14785                 if (! LOC && value == '\n') {
14786                     op = REG_ANY; /* Optimize [^\n] */
14787                     *flagp |= HASWIDTH|SIMPLE;
14788                     RExC_naughty++;
14789                 }
14790             }
14791             else if (value < 256 || UTF) {
14792
14793                 /* Optimize a single value into an EXACTish node, but not if it
14794                  * would require converting the pattern to UTF-8. */
14795                 op = compute_EXACTish(pRExC_state);
14796             }
14797         } /* Otherwise is a range */
14798         else if (! LOC) {   /* locale could vary these */
14799             if (prevvalue == '0') {
14800                 if (value == '9') {
14801                     arg = _CC_DIGIT;
14802                     op = POSIXA;
14803                 }
14804             }
14805             else if (prevvalue == 'A') {
14806                 if (value == 'Z'
14807 #ifdef EBCDIC
14808                     && literal_endpoint == 2
14809 #endif
14810                 ) {
14811                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14812                     op = POSIXA;
14813                 }
14814             }
14815             else if (prevvalue == 'a') {
14816                 if (value == 'z'
14817 #ifdef EBCDIC
14818                     && literal_endpoint == 2
14819 #endif
14820                 ) {
14821                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14822                     op = POSIXA;
14823                 }
14824             }
14825         }
14826
14827         /* Here, we have changed <op> away from its initial value iff we found
14828          * an optimization */
14829         if (op != END) {
14830
14831             /* Throw away this ANYOF regnode, and emit the calculated one,
14832              * which should correspond to the beginning, not current, state of
14833              * the parse */
14834             const char * cur_parse = RExC_parse;
14835             RExC_parse = (char *)orig_parse;
14836             if ( SIZE_ONLY) {
14837                 if (! LOC) {
14838
14839                     /* To get locale nodes to not use the full ANYOF size would
14840                      * require moving the code above that writes the portions
14841                      * of it that aren't in other nodes to after this point.
14842                      * e.g.  ANYOF_POSIXL_SET */
14843                     RExC_size = orig_size;
14844                 }
14845             }
14846             else {
14847                 RExC_emit = (regnode *)orig_emit;
14848                 if (PL_regkind[op] == POSIXD) {
14849                     if (op == POSIXL) {
14850                         RExC_contains_locale = 1;
14851                     }
14852                     if (invert) {
14853                         op += NPOSIXD - POSIXD;
14854                     }
14855                 }
14856             }
14857
14858             ret = reg_node(pRExC_state, op);
14859
14860             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14861                 if (! SIZE_ONLY) {
14862                     FLAGS(ret) = arg;
14863                 }
14864                 *flagp |= HASWIDTH|SIMPLE;
14865             }
14866             else if (PL_regkind[op] == EXACT) {
14867                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14868                                            TRUE /* downgradable to EXACT */
14869                                            );
14870             }
14871
14872             RExC_parse = (char *) cur_parse;
14873
14874             SvREFCNT_dec(posixes);
14875             SvREFCNT_dec(nposixes);
14876             SvREFCNT_dec(cp_list);
14877             SvREFCNT_dec(cp_foldable_list);
14878             return ret;
14879         }
14880     }
14881
14882     if (SIZE_ONLY)
14883         return ret;
14884     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14885
14886     /* If folding, we calculate all characters that could fold to or from the
14887      * ones already on the list */
14888     if (cp_foldable_list) {
14889         if (FOLD) {
14890             UV start, end;      /* End points of code point ranges */
14891
14892             SV* fold_intersection = NULL;
14893             SV** use_list;
14894
14895             /* Our calculated list will be for Unicode rules.  For locale
14896              * matching, we have to keep a separate list that is consulted at
14897              * runtime only when the locale indicates Unicode rules.  For
14898              * non-locale, we just use to the general list */
14899             if (LOC) {
14900                 use_list = &only_utf8_locale_list;
14901             }
14902             else {
14903                 use_list = &cp_list;
14904             }
14905
14906             /* Only the characters in this class that participate in folds need
14907              * be checked.  Get the intersection of this class and all the
14908              * possible characters that are foldable.  This can quickly narrow
14909              * down a large class */
14910             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14911                                   &fold_intersection);
14912
14913             /* The folds for all the Latin1 characters are hard-coded into this
14914              * program, but we have to go out to disk to get the others. */
14915             if (invlist_highest(cp_foldable_list) >= 256) {
14916
14917                 /* This is a hash that for a particular fold gives all
14918                  * characters that are involved in it */
14919                 if (! PL_utf8_foldclosures) {
14920                     _load_PL_utf8_foldclosures();
14921                 }
14922             }
14923
14924             /* Now look at the foldable characters in this class individually */
14925             invlist_iterinit(fold_intersection);
14926             while (invlist_iternext(fold_intersection, &start, &end)) {
14927                 UV j;
14928
14929                 /* Look at every character in the range */
14930                 for (j = start; j <= end; j++) {
14931                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14932                     STRLEN foldlen;
14933                     SV** listp;
14934
14935                     if (j < 256) {
14936
14937                         if (IS_IN_SOME_FOLD_L1(j)) {
14938
14939                             /* ASCII is always matched; non-ASCII is matched
14940                              * only under Unicode rules (which could happen
14941                              * under /l if the locale is a UTF-8 one */
14942                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14943                                 *use_list = add_cp_to_invlist(*use_list,
14944                                                             PL_fold_latin1[j]);
14945                             }
14946                             else {
14947                                 depends_list =
14948                                  add_cp_to_invlist(depends_list,
14949                                                    PL_fold_latin1[j]);
14950                             }
14951                         }
14952
14953                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14954                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14955                         {
14956                             add_above_Latin1_folds(pRExC_state,
14957                                                    (U8) j,
14958                                                    use_list);
14959                         }
14960                         continue;
14961                     }
14962
14963                     /* Here is an above Latin1 character.  We don't have the
14964                      * rules hard-coded for it.  First, get its fold.  This is
14965                      * the simple fold, as the multi-character folds have been
14966                      * handled earlier and separated out */
14967                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14968                                                         (ASCII_FOLD_RESTRICTED)
14969                                                         ? FOLD_FLAGS_NOMIX_ASCII
14970                                                         : 0);
14971
14972                     /* Single character fold of above Latin1.  Add everything in
14973                     * its fold closure to the list that this node should match.
14974                     * The fold closures data structure is a hash with the keys
14975                     * being the UTF-8 of every character that is folded to, like
14976                     * 'k', and the values each an array of all code points that
14977                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14978                     * Multi-character folds are not included */
14979                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14980                                         (char *) foldbuf, foldlen, FALSE)))
14981                     {
14982                         AV* list = (AV*) *listp;
14983                         IV k;
14984                         for (k = 0; k <= av_tindex(list); k++) {
14985                             SV** c_p = av_fetch(list, k, FALSE);
14986                             UV c;
14987                             assert(c_p);
14988
14989                             c = SvUV(*c_p);
14990
14991                             /* /aa doesn't allow folds between ASCII and non- */
14992                             if ((ASCII_FOLD_RESTRICTED
14993                                 && (isASCII(c) != isASCII(j))))
14994                             {
14995                                 continue;
14996                             }
14997
14998                             /* Folds under /l which cross the 255/256 boundary
14999                              * are added to a separate list.  (These are valid
15000                              * only when the locale is UTF-8.) */
15001                             if (c < 256 && LOC) {
15002                                 *use_list = add_cp_to_invlist(*use_list, c);
15003                                 continue;
15004                             }
15005
15006                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15007                             {
15008                                 cp_list = add_cp_to_invlist(cp_list, c);
15009                             }
15010                             else {
15011                                 /* Similarly folds involving non-ascii Latin1
15012                                 * characters under /d are added to their list */
15013                                 depends_list = add_cp_to_invlist(depends_list,
15014                                                                  c);
15015                             }
15016                         }
15017                     }
15018                 }
15019             }
15020             SvREFCNT_dec_NN(fold_intersection);
15021         }
15022
15023         /* Now that we have finished adding all the folds, there is no reason
15024          * to keep the foldable list separate */
15025         _invlist_union(cp_list, cp_foldable_list, &cp_list);
15026         SvREFCNT_dec_NN(cp_foldable_list);
15027     }
15028
15029     /* And combine the result (if any) with any inversion list from posix
15030      * classes.  The lists are kept separate up to now because we don't want to
15031      * fold the classes (folding of those is automatically handled by the swash
15032      * fetching code) */
15033     if (posixes || nposixes) {
15034         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15035             /* Under /a and /aa, nothing above ASCII matches these */
15036             _invlist_intersection(posixes,
15037                                   PL_XPosix_ptrs[_CC_ASCII],
15038                                   &posixes);
15039         }
15040         if (nposixes) {
15041             if (DEPENDS_SEMANTICS) {
15042                 /* Under /d, everything in the upper half of the Latin1 range
15043                  * matches these complements */
15044                 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15045             }
15046             else if (AT_LEAST_ASCII_RESTRICTED) {
15047                 /* Under /a and /aa, everything above ASCII matches these
15048                  * complements */
15049                 _invlist_union_complement_2nd(nposixes,
15050                                               PL_XPosix_ptrs[_CC_ASCII],
15051                                               &nposixes);
15052             }
15053             if (posixes) {
15054                 _invlist_union(posixes, nposixes, &posixes);
15055                 SvREFCNT_dec_NN(nposixes);
15056             }
15057             else {
15058                 posixes = nposixes;
15059             }
15060         }
15061         if (! DEPENDS_SEMANTICS) {
15062             if (cp_list) {
15063                 _invlist_union(cp_list, posixes, &cp_list);
15064                 SvREFCNT_dec_NN(posixes);
15065             }
15066             else {
15067                 cp_list = posixes;
15068             }
15069         }
15070         else {
15071             /* Under /d, we put into a separate list the Latin1 things that
15072              * match only when the target string is utf8 */
15073             SV* nonascii_but_latin1_properties = NULL;
15074             _invlist_intersection(posixes, PL_UpperLatin1,
15075                                   &nonascii_but_latin1_properties);
15076             _invlist_subtract(posixes, nonascii_but_latin1_properties,
15077                               &posixes);
15078             if (cp_list) {
15079                 _invlist_union(cp_list, posixes, &cp_list);
15080                 SvREFCNT_dec_NN(posixes);
15081             }
15082             else {
15083                 cp_list = posixes;
15084             }
15085
15086             if (depends_list) {
15087                 _invlist_union(depends_list, nonascii_but_latin1_properties,
15088                                &depends_list);
15089                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15090             }
15091             else {
15092                 depends_list = nonascii_but_latin1_properties;
15093             }
15094         }
15095     }
15096
15097     /* And combine the result (if any) with any inversion list from properties.
15098      * The lists are kept separate up to now so that we can distinguish the two
15099      * in regards to matching above-Unicode.  A run-time warning is generated
15100      * if a Unicode property is matched against a non-Unicode code point. But,
15101      * we allow user-defined properties to match anything, without any warning,
15102      * and we also suppress the warning if there is a portion of the character
15103      * class that isn't a Unicode property, and which matches above Unicode, \W
15104      * or [\x{110000}] for example.
15105      * (Note that in this case, unlike the Posix one above, there is no
15106      * <depends_list>, because having a Unicode property forces Unicode
15107      * semantics */
15108     if (properties) {
15109         if (cp_list) {
15110
15111             /* If it matters to the final outcome, see if a non-property
15112              * component of the class matches above Unicode.  If so, the
15113              * warning gets suppressed.  This is true even if just a single
15114              * such code point is specified, as though not strictly correct if
15115              * another such code point is matched against, the fact that they
15116              * are using above-Unicode code points indicates they should know
15117              * the issues involved */
15118             if (warn_super) {
15119                 warn_super = ! (invert
15120                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15121             }
15122
15123             _invlist_union(properties, cp_list, &cp_list);
15124             SvREFCNT_dec_NN(properties);
15125         }
15126         else {
15127             cp_list = properties;
15128         }
15129
15130         if (warn_super) {
15131             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15132         }
15133     }
15134
15135     /* Here, we have calculated what code points should be in the character
15136      * class.
15137      *
15138      * Now we can see about various optimizations.  Fold calculation (which we
15139      * did above) needs to take place before inversion.  Otherwise /[^k]/i
15140      * would invert to include K, which under /i would match k, which it
15141      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
15142      * folded until runtime */
15143
15144     /* If we didn't do folding, it's because some information isn't available
15145      * until runtime; set the run-time fold flag for these.  (We don't have to
15146      * worry about properties folding, as that is taken care of by the swash
15147      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
15148      * locales, or the class matches at least one 0-255 range code point */
15149     if (LOC && FOLD) {
15150         if (only_utf8_locale_list) {
15151             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15152         }
15153         else if (cp_list) { /* Look to see if there a 0-255 code point is in
15154                                the list */
15155             UV start, end;
15156             invlist_iterinit(cp_list);
15157             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15158                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15159             }
15160             invlist_iterfinish(cp_list);
15161         }
15162     }
15163
15164     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15165      * at compile time.  Besides not inverting folded locale now, we can't
15166      * invert if there are things such as \w, which aren't known until runtime
15167      * */
15168     if (cp_list
15169         && invert
15170         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15171         && ! depends_list
15172         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15173     {
15174         _invlist_invert(cp_list);
15175
15176         /* Any swash can't be used as-is, because we've inverted things */
15177         if (swash) {
15178             SvREFCNT_dec_NN(swash);
15179             swash = NULL;
15180         }
15181
15182         /* Clear the invert flag since have just done it here */
15183         invert = FALSE;
15184     }
15185
15186     if (ret_invlist) {
15187         *ret_invlist = cp_list;
15188         SvREFCNT_dec(swash);
15189
15190         /* Discard the generated node */
15191         if (SIZE_ONLY) {
15192             RExC_size = orig_size;
15193         }
15194         else {
15195             RExC_emit = orig_emit;
15196         }
15197         return orig_emit;
15198     }
15199
15200     /* Some character classes are equivalent to other nodes.  Such nodes take
15201      * up less room and generally fewer operations to execute than ANYOF nodes.
15202      * Above, we checked for and optimized into some such equivalents for
15203      * certain common classes that are easy to test.  Getting to this point in
15204      * the code means that the class didn't get optimized there.  Since this
15205      * code is only executed in Pass 2, it is too late to save space--it has
15206      * been allocated in Pass 1, and currently isn't given back.  But turning
15207      * things into an EXACTish node can allow the optimizer to join it to any
15208      * adjacent such nodes.  And if the class is equivalent to things like /./,
15209      * expensive run-time swashes can be avoided.  Now that we have more
15210      * complete information, we can find things necessarily missed by the
15211      * earlier code.  I (khw) am not sure how much to look for here.  It would
15212      * be easy, but perhaps too slow, to check any candidates against all the
15213      * node types they could possibly match using _invlistEQ(). */
15214
15215     if (cp_list
15216         && ! invert
15217         && ! depends_list
15218         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15219         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15220
15221            /* We don't optimize if we are supposed to make sure all non-Unicode
15222             * code points raise a warning, as only ANYOF nodes have this check.
15223             * */
15224         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15225     {
15226         UV start, end;
15227         U8 op = END;  /* The optimzation node-type */
15228         const char * cur_parse= RExC_parse;
15229
15230         invlist_iterinit(cp_list);
15231         if (! invlist_iternext(cp_list, &start, &end)) {
15232
15233             /* Here, the list is empty.  This happens, for example, when a
15234              * Unicode property is the only thing in the character class, and
15235              * it doesn't match anything.  (perluniprops.pod notes such
15236              * properties) */
15237             op = OPFAIL;
15238             *flagp |= HASWIDTH|SIMPLE;
15239         }
15240         else if (start == end) {    /* The range is a single code point */
15241             if (! invlist_iternext(cp_list, &start, &end)
15242
15243                     /* Don't do this optimization if it would require changing
15244                      * the pattern to UTF-8 */
15245                 && (start < 256 || UTF))
15246             {
15247                 /* Here, the list contains a single code point.  Can optimize
15248                  * into an EXACTish node */
15249
15250                 value = start;
15251
15252                 if (! FOLD) {
15253                     op = EXACT;
15254                 }
15255                 else if (LOC) {
15256
15257                     /* A locale node under folding with one code point can be
15258                      * an EXACTFL, as its fold won't be calculated until
15259                      * runtime */
15260                     op = EXACTFL;
15261                 }
15262                 else {
15263
15264                     /* Here, we are generally folding, but there is only one
15265                      * code point to match.  If we have to, we use an EXACT
15266                      * node, but it would be better for joining with adjacent
15267                      * nodes in the optimization pass if we used the same
15268                      * EXACTFish node that any such are likely to be.  We can
15269                      * do this iff the code point doesn't participate in any
15270                      * folds.  For example, an EXACTF of a colon is the same as
15271                      * an EXACT one, since nothing folds to or from a colon. */
15272                     if (value < 256) {
15273                         if (IS_IN_SOME_FOLD_L1(value)) {
15274                             op = EXACT;
15275                         }
15276                     }
15277                     else {
15278                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15279                             op = EXACT;
15280                         }
15281                     }
15282
15283                     /* If we haven't found the node type, above, it means we
15284                      * can use the prevailing one */
15285                     if (op == END) {
15286                         op = compute_EXACTish(pRExC_state);
15287                     }
15288                 }
15289             }
15290         }
15291         else if (start == 0) {
15292             if (end == UV_MAX) {
15293                 op = SANY;
15294                 *flagp |= HASWIDTH|SIMPLE;
15295                 RExC_naughty++;
15296             }
15297             else if (end == '\n' - 1
15298                     && invlist_iternext(cp_list, &start, &end)
15299                     && start == '\n' + 1 && end == UV_MAX)
15300             {
15301                 op = REG_ANY;
15302                 *flagp |= HASWIDTH|SIMPLE;
15303                 RExC_naughty++;
15304             }
15305         }
15306         invlist_iterfinish(cp_list);
15307
15308         if (op != END) {
15309             RExC_parse = (char *)orig_parse;
15310             RExC_emit = (regnode *)orig_emit;
15311
15312             ret = reg_node(pRExC_state, op);
15313
15314             RExC_parse = (char *)cur_parse;
15315
15316             if (PL_regkind[op] == EXACT) {
15317                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15318                                            TRUE /* downgradable to EXACT */
15319                                           );
15320             }
15321
15322             SvREFCNT_dec_NN(cp_list);
15323             return ret;
15324         }
15325     }
15326
15327     /* Here, <cp_list> contains all the code points we can determine at
15328      * compile time that match under all conditions.  Go through it, and
15329      * for things that belong in the bitmap, put them there, and delete from
15330      * <cp_list>.  While we are at it, see if everything above 255 is in the
15331      * list, and if so, set a flag to speed up execution */
15332
15333     populate_ANYOF_from_invlist(ret, &cp_list);
15334
15335     if (invert) {
15336         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15337     }
15338
15339     /* Here, the bitmap has been populated with all the Latin1 code points that
15340      * always match.  Can now add to the overall list those that match only
15341      * when the target string is UTF-8 (<depends_list>). */
15342     if (depends_list) {
15343         if (cp_list) {
15344             _invlist_union(cp_list, depends_list, &cp_list);
15345             SvREFCNT_dec_NN(depends_list);
15346         }
15347         else {
15348             cp_list = depends_list;
15349         }
15350         ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15351     }
15352
15353     /* If there is a swash and more than one element, we can't use the swash in
15354      * the optimization below. */
15355     if (swash && element_count > 1) {
15356         SvREFCNT_dec_NN(swash);
15357         swash = NULL;
15358     }
15359
15360     /* Note that the optimization of using 'swash' if it is the only thing in
15361      * the class doesn't have us change swash at all, so it can include things
15362      * that are also in the bitmap; otherwise we have purposely deleted that
15363      * duplicate information */
15364     set_ANYOF_arg(pRExC_state, ret, cp_list,
15365                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15366                    ? listsv : NULL,
15367                   only_utf8_locale_list,
15368                   swash, has_user_defined_property);
15369
15370     *flagp |= HASWIDTH|SIMPLE;
15371
15372     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15373         RExC_contains_locale = 1;
15374     }
15375
15376     return ret;
15377 }
15378
15379 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15380
15381 STATIC void
15382 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15383                 regnode* const node,
15384                 SV* const cp_list,
15385                 SV* const runtime_defns,
15386                 SV* const only_utf8_locale_list,
15387                 SV* const swash,
15388                 const bool has_user_defined_property)
15389 {
15390     /* Sets the arg field of an ANYOF-type node 'node', using information about
15391      * the node passed-in.  If there is nothing outside the node's bitmap, the
15392      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
15393      * the count returned by add_data(), having allocated and stored an array,
15394      * av, that that count references, as follows:
15395      *  av[0] stores the character class description in its textual form.
15396      *        This is used later (regexec.c:Perl_regclass_swash()) to
15397      *        initialize the appropriate swash, and is also useful for dumping
15398      *        the regnode.  This is set to &PL_sv_undef if the textual
15399      *        description is not needed at run-time (as happens if the other
15400      *        elements completely define the class)
15401      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15402      *        computed from av[0].  But if no further computation need be done,
15403      *        the swash is stored here now (and av[0] is &PL_sv_undef).
15404      *  av[2] stores the inversion list of code points that match only if the
15405      *        current locale is UTF-8
15406      *  av[3] stores the cp_list inversion list for use in addition or instead
15407      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15408      *        (Otherwise everything needed is already in av[0] and av[1])
15409      *  av[4] is set if any component of the class is from a user-defined
15410      *        property; used only if av[3] exists */
15411
15412     UV n;
15413
15414     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15415
15416     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15417         assert(! (ANYOF_FLAGS(node)
15418                   & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15419                      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15420         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15421     }
15422     else {
15423         AV * const av = newAV();
15424         SV *rv;
15425
15426         assert(ANYOF_FLAGS(node)
15427                & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15428                   |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15429
15430         av_store(av, 0, (runtime_defns)
15431                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15432         if (swash) {
15433             assert(cp_list);
15434             av_store(av, 1, swash);
15435             SvREFCNT_dec_NN(cp_list);
15436         }
15437         else {
15438             av_store(av, 1, &PL_sv_undef);
15439             if (cp_list) {
15440                 av_store(av, 3, cp_list);
15441                 av_store(av, 4, newSVuv(has_user_defined_property));
15442             }
15443         }
15444
15445         if (only_utf8_locale_list) {
15446             av_store(av, 2, only_utf8_locale_list);
15447         }
15448         else {
15449             av_store(av, 2, &PL_sv_undef);
15450         }
15451
15452         rv = newRV_noinc(MUTABLE_SV(av));
15453         n = add_data(pRExC_state, STR_WITH_LEN("s"));
15454         RExC_rxi->data->data[n] = (void*)rv;
15455         ARG_SET(node, n);
15456     }
15457 }
15458
15459 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15460 SV *
15461 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15462                                         const regnode* node,
15463                                         bool doinit,
15464                                         SV** listsvp,
15465                                         SV** only_utf8_locale_ptr,
15466                                         SV*  exclude_list)
15467
15468 {
15469     /* For internal core use only.
15470      * Returns the swash for the input 'node' in the regex 'prog'.
15471      * If <doinit> is 'true', will attempt to create the swash if not already
15472      *    done.
15473      * If <listsvp> is non-null, will return the printable contents of the
15474      *    swash.  This can be used to get debugging information even before the
15475      *    swash exists, by calling this function with 'doinit' set to false, in
15476      *    which case the components that will be used to eventually create the
15477      *    swash are returned  (in a printable form).
15478      * If <exclude_list> is not NULL, it is an inversion list of things to
15479      *    exclude from what's returned in <listsvp>.
15480      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
15481      * that, in spite of this function's name, the swash it returns may include
15482      * the bitmap data as well */
15483
15484     SV *sw  = NULL;
15485     SV *si  = NULL;         /* Input swash initialization string */
15486     SV*  invlist = NULL;
15487
15488     RXi_GET_DECL(prog,progi);
15489     const struct reg_data * const data = prog ? progi->data : NULL;
15490
15491     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15492
15493     assert(ANYOF_FLAGS(node)
15494         & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15495            |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15496
15497     if (data && data->count) {
15498         const U32 n = ARG(node);
15499
15500         if (data->what[n] == 's') {
15501             SV * const rv = MUTABLE_SV(data->data[n]);
15502             AV * const av = MUTABLE_AV(SvRV(rv));
15503             SV **const ary = AvARRAY(av);
15504             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15505
15506             si = *ary;  /* ary[0] = the string to initialize the swash with */
15507
15508             /* Elements 3 and 4 are either both present or both absent. [3] is
15509              * any inversion list generated at compile time; [4] indicates if
15510              * that inversion list has any user-defined properties in it. */
15511             if (av_tindex(av) >= 2) {
15512                 if (only_utf8_locale_ptr
15513                     && ary[2]
15514                     && ary[2] != &PL_sv_undef)
15515                 {
15516                     *only_utf8_locale_ptr = ary[2];
15517                 }
15518                 else {
15519                     assert(only_utf8_locale_ptr);
15520                     *only_utf8_locale_ptr = NULL;
15521                 }
15522
15523                 if (av_tindex(av) >= 3) {
15524                     invlist = ary[3];
15525                     if (SvUV(ary[4])) {
15526                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15527                     }
15528                 }
15529                 else {
15530                     invlist = NULL;
15531                 }
15532             }
15533
15534             /* Element [1] is reserved for the set-up swash.  If already there,
15535              * return it; if not, create it and store it there */
15536             if (ary[1] && SvROK(ary[1])) {
15537                 sw = ary[1];
15538             }
15539             else if (doinit && ((si && si != &PL_sv_undef)
15540                                  || (invlist && invlist != &PL_sv_undef))) {
15541                 assert(si);
15542                 sw = _core_swash_init("utf8", /* the utf8 package */
15543                                       "", /* nameless */
15544                                       si,
15545                                       1, /* binary */
15546                                       0, /* not from tr/// */
15547                                       invlist,
15548                                       &swash_init_flags);
15549                 (void)av_store(av, 1, sw);
15550             }
15551         }
15552     }
15553
15554     /* If requested, return a printable version of what this swash matches */
15555     if (listsvp) {
15556         SV* matches_string = newSVpvs("");
15557
15558         /* The swash should be used, if possible, to get the data, as it
15559          * contains the resolved data.  But this function can be called at
15560          * compile-time, before everything gets resolved, in which case we
15561          * return the currently best available information, which is the string
15562          * that will eventually be used to do that resolving, 'si' */
15563         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15564             && (si && si != &PL_sv_undef))
15565         {
15566             sv_catsv(matches_string, si);
15567         }
15568
15569         /* Add the inversion list to whatever we have.  This may have come from
15570          * the swash, or from an input parameter */
15571         if (invlist) {
15572             if (exclude_list) {
15573                 SV* clone = invlist_clone(invlist);
15574                 _invlist_subtract(clone, exclude_list, &clone);
15575                 sv_catsv(matches_string, _invlist_contents(clone));
15576                 SvREFCNT_dec_NN(clone);
15577             }
15578             else {
15579                 sv_catsv(matches_string, _invlist_contents(invlist));
15580             }
15581         }
15582         *listsvp = matches_string;
15583     }
15584
15585     return sw;
15586 }
15587 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15588
15589 /* reg_skipcomment()
15590
15591    Absorbs an /x style # comment from the input stream,
15592    returning a pointer to the first character beyond the comment, or if the
15593    comment terminates the pattern without anything following it, this returns
15594    one past the final character of the pattern (in other words, RExC_end) and
15595    sets the REG_RUN_ON_COMMENT_SEEN flag.
15596
15597    Note it's the callers responsibility to ensure that we are
15598    actually in /x mode
15599
15600 */
15601
15602 PERL_STATIC_INLINE char*
15603 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15604 {
15605     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15606
15607     assert(*p == '#');
15608
15609     while (p < RExC_end) {
15610         if (*(++p) == '\n') {
15611             return p+1;
15612         }
15613     }
15614
15615     /* we ran off the end of the pattern without ending the comment, so we have
15616      * to add an \n when wrapping */
15617     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15618     return p;
15619 }
15620
15621 /* nextchar()
15622
15623    Advances the parse position, and optionally absorbs
15624    "whitespace" from the inputstream.
15625
15626    Without /x "whitespace" means (?#...) style comments only,
15627    with /x this means (?#...) and # comments and whitespace proper.
15628
15629    Returns the RExC_parse point from BEFORE the scan occurs.
15630
15631    This is the /x friendly way of saying RExC_parse++.
15632 */
15633
15634 STATIC char*
15635 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15636 {
15637     char* const retval = RExC_parse++;
15638
15639     PERL_ARGS_ASSERT_NEXTCHAR;
15640
15641     for (;;) {
15642         if (RExC_end - RExC_parse >= 3
15643             && *RExC_parse == '('
15644             && RExC_parse[1] == '?'
15645             && RExC_parse[2] == '#')
15646         {
15647             while (*RExC_parse != ')') {
15648                 if (RExC_parse == RExC_end)
15649                     FAIL("Sequence (?#... not terminated");
15650                 RExC_parse++;
15651             }
15652             RExC_parse++;
15653             continue;
15654         }
15655         if (RExC_flags & RXf_PMf_EXTENDED) {
15656             char * p = regpatws(pRExC_state, RExC_parse,
15657                                           TRUE); /* means recognize comments */
15658             if (p != RExC_parse) {
15659                 RExC_parse = p;
15660                 continue;
15661             }
15662         }
15663         return retval;
15664     }
15665 }
15666
15667 STATIC regnode *
15668 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15669 {
15670     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15671      * space.  In pass1, it aligns and increments RExC_size; in pass2,
15672      * RExC_emit */
15673
15674     regnode * const ret = RExC_emit;
15675     GET_RE_DEBUG_FLAGS_DECL;
15676
15677     PERL_ARGS_ASSERT_REGNODE_GUTS;
15678
15679     assert(extra_size >= regarglen[op]);
15680
15681     if (SIZE_ONLY) {
15682         SIZE_ALIGN(RExC_size);
15683         RExC_size += 1 + extra_size;
15684         return(ret);
15685     }
15686     if (RExC_emit >= RExC_emit_bound)
15687         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15688                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15689
15690     NODE_ALIGN_FILL(ret);
15691 #ifndef RE_TRACK_PATTERN_OFFSETS
15692     PERL_UNUSED_ARG(name);
15693 #else
15694     if (RExC_offsets) {         /* MJD */
15695         MJD_OFFSET_DEBUG(
15696               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15697               name, __LINE__,
15698               PL_reg_name[op],
15699               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15700                 ? "Overwriting end of array!\n" : "OK",
15701               (UV)(RExC_emit - RExC_emit_start),
15702               (UV)(RExC_parse - RExC_start),
15703               (UV)RExC_offsets[0]));
15704         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15705     }
15706 #endif
15707     return(ret);
15708 }
15709
15710 /*
15711 - reg_node - emit a node
15712 */
15713 STATIC regnode *                        /* Location. */
15714 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15715 {
15716     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15717
15718     PERL_ARGS_ASSERT_REG_NODE;
15719
15720     assert(regarglen[op] == 0);
15721
15722     if (PASS2) {
15723         regnode *ptr = ret;
15724         FILL_ADVANCE_NODE(ptr, op);
15725         RExC_emit = ptr;
15726     }
15727     return(ret);
15728 }
15729
15730 /*
15731 - reganode - emit a node with an argument
15732 */
15733 STATIC regnode *                        /* Location. */
15734 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15735 {
15736     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15737
15738     PERL_ARGS_ASSERT_REGANODE;
15739
15740     assert(regarglen[op] == 1);
15741
15742     if (PASS2) {
15743         regnode *ptr = ret;
15744         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15745         RExC_emit = ptr;
15746     }
15747     return(ret);
15748 }
15749
15750 STATIC regnode *
15751 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15752 {
15753     /* emit a node with U32 and I32 arguments */
15754
15755     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15756
15757     PERL_ARGS_ASSERT_REG2LANODE;
15758
15759     assert(regarglen[op] == 2);
15760
15761     if (PASS2) {
15762         regnode *ptr = ret;
15763         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15764         RExC_emit = ptr;
15765     }
15766     return(ret);
15767 }
15768
15769 /*
15770 - reguni - emit (if appropriate) a Unicode character
15771 */
15772 PERL_STATIC_INLINE STRLEN
15773 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15774 {
15775     PERL_ARGS_ASSERT_REGUNI;
15776
15777     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15778 }
15779
15780 /*
15781 - reginsert - insert an operator in front of already-emitted operand
15782 *
15783 * Means relocating the operand.
15784 */
15785 STATIC void
15786 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15787 {
15788     regnode *src;
15789     regnode *dst;
15790     regnode *place;
15791     const int offset = regarglen[(U8)op];
15792     const int size = NODE_STEP_REGNODE + offset;
15793     GET_RE_DEBUG_FLAGS_DECL;
15794
15795     PERL_ARGS_ASSERT_REGINSERT;
15796     PERL_UNUSED_CONTEXT;
15797     PERL_UNUSED_ARG(depth);
15798 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15799     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15800     if (SIZE_ONLY) {
15801         RExC_size += size;
15802         return;
15803     }
15804
15805     src = RExC_emit;
15806     RExC_emit += size;
15807     dst = RExC_emit;
15808     if (RExC_open_parens) {
15809         int paren;
15810         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15811         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15812             if ( RExC_open_parens[paren] >= opnd ) {
15813                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15814                 RExC_open_parens[paren] += size;
15815             } else {
15816                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15817             }
15818             if ( RExC_close_parens[paren] >= opnd ) {
15819                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15820                 RExC_close_parens[paren] += size;
15821             } else {
15822                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15823             }
15824         }
15825     }
15826
15827     while (src > opnd) {
15828         StructCopy(--src, --dst, regnode);
15829 #ifdef RE_TRACK_PATTERN_OFFSETS
15830         if (RExC_offsets) {     /* MJD 20010112 */
15831             MJD_OFFSET_DEBUG(
15832                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15833                   "reg_insert",
15834                   __LINE__,
15835                   PL_reg_name[op],
15836                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15837                     ? "Overwriting end of array!\n" : "OK",
15838                   (UV)(src - RExC_emit_start),
15839                   (UV)(dst - RExC_emit_start),
15840                   (UV)RExC_offsets[0]));
15841             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15842             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15843         }
15844 #endif
15845     }
15846
15847
15848     place = opnd;               /* Op node, where operand used to be. */
15849 #ifdef RE_TRACK_PATTERN_OFFSETS
15850     if (RExC_offsets) {         /* MJD */
15851         MJD_OFFSET_DEBUG(
15852               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15853               "reginsert",
15854               __LINE__,
15855               PL_reg_name[op],
15856               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15857               ? "Overwriting end of array!\n" : "OK",
15858               (UV)(place - RExC_emit_start),
15859               (UV)(RExC_parse - RExC_start),
15860               (UV)RExC_offsets[0]));
15861         Set_Node_Offset(place, RExC_parse);
15862         Set_Node_Length(place, 1);
15863     }
15864 #endif
15865     src = NEXTOPER(place);
15866     FILL_ADVANCE_NODE(place, op);
15867     Zero(src, offset, regnode);
15868 }
15869
15870 /*
15871 - regtail - set the next-pointer at the end of a node chain of p to val.
15872 - SEE ALSO: regtail_study
15873 */
15874 /* TODO: All three parms should be const */
15875 STATIC void
15876 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15877                 const regnode *val,U32 depth)
15878 {
15879     regnode *scan;
15880     GET_RE_DEBUG_FLAGS_DECL;
15881
15882     PERL_ARGS_ASSERT_REGTAIL;
15883 #ifndef DEBUGGING
15884     PERL_UNUSED_ARG(depth);
15885 #endif
15886
15887     if (SIZE_ONLY)
15888         return;
15889
15890     /* Find last node. */
15891     scan = p;
15892     for (;;) {
15893         regnode * const temp = regnext(scan);
15894         DEBUG_PARSE_r({
15895             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15896             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15897             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15898                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15899                     (temp == NULL ? "->" : ""),
15900                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15901             );
15902         });
15903         if (temp == NULL)
15904             break;
15905         scan = temp;
15906     }
15907
15908     if (reg_off_by_arg[OP(scan)]) {
15909         ARG_SET(scan, val - scan);
15910     }
15911     else {
15912         NEXT_OFF(scan) = val - scan;
15913     }
15914 }
15915
15916 #ifdef DEBUGGING
15917 /*
15918 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15919 - Look for optimizable sequences at the same time.
15920 - currently only looks for EXACT chains.
15921
15922 This is experimental code. The idea is to use this routine to perform
15923 in place optimizations on branches and groups as they are constructed,
15924 with the long term intention of removing optimization from study_chunk so
15925 that it is purely analytical.
15926
15927 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15928 to control which is which.
15929
15930 */
15931 /* TODO: All four parms should be const */
15932
15933 STATIC U8
15934 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15935                       const regnode *val,U32 depth)
15936 {
15937     regnode *scan;
15938     U8 exact = PSEUDO;
15939 #ifdef EXPERIMENTAL_INPLACESCAN
15940     I32 min = 0;
15941 #endif
15942     GET_RE_DEBUG_FLAGS_DECL;
15943
15944     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15945
15946
15947     if (SIZE_ONLY)
15948         return exact;
15949
15950     /* Find last node. */
15951
15952     scan = p;
15953     for (;;) {
15954         regnode * const temp = regnext(scan);
15955 #ifdef EXPERIMENTAL_INPLACESCAN
15956         if (PL_regkind[OP(scan)] == EXACT) {
15957             bool unfolded_multi_char;   /* Unexamined in this routine */
15958             if (join_exact(pRExC_state, scan, &min,
15959                            &unfolded_multi_char, 1, val, depth+1))
15960                 return EXACT;
15961         }
15962 #endif
15963         if ( exact ) {
15964             switch (OP(scan)) {
15965                 case EXACT:
15966                 case EXACTF:
15967                 case EXACTFA_NO_TRIE:
15968                 case EXACTFA:
15969                 case EXACTFU:
15970                 case EXACTFU_SS:
15971                 case EXACTFL:
15972                         if( exact == PSEUDO )
15973                             exact= OP(scan);
15974                         else if ( exact != OP(scan) )
15975                             exact= 0;
15976                 case NOTHING:
15977                     break;
15978                 default:
15979                     exact= 0;
15980             }
15981         }
15982         DEBUG_PARSE_r({
15983             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15984             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15985             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15986                 SvPV_nolen_const(RExC_mysv),
15987                 REG_NODE_NUM(scan),
15988                 PL_reg_name[exact]);
15989         });
15990         if (temp == NULL)
15991             break;
15992         scan = temp;
15993     }
15994     DEBUG_PARSE_r({
15995         DEBUG_PARSE_MSG("");
15996         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
15997         PerlIO_printf(Perl_debug_log,
15998                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15999                       SvPV_nolen_const(RExC_mysv),
16000                       (IV)REG_NODE_NUM(val),
16001                       (IV)(val - scan)
16002         );
16003     });
16004     if (reg_off_by_arg[OP(scan)]) {
16005         ARG_SET(scan, val - scan);
16006     }
16007     else {
16008         NEXT_OFF(scan) = val - scan;
16009     }
16010
16011     return exact;
16012 }
16013 #endif
16014
16015 /*
16016  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16017  */
16018 #ifdef DEBUGGING
16019
16020 static void
16021 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16022 {
16023     int bit;
16024     int set=0;
16025
16026     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16027
16028     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16029         if (flags & (1<<bit)) {
16030             if (!set++ && lead)
16031                 PerlIO_printf(Perl_debug_log, "%s",lead);
16032             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16033         }
16034     }
16035     if (lead)  {
16036         if (set)
16037             PerlIO_printf(Perl_debug_log, "\n");
16038         else
16039             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16040     }
16041 }
16042
16043 static void
16044 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16045 {
16046     int bit;
16047     int set=0;
16048     regex_charset cs;
16049
16050     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16051
16052     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16053         if (flags & (1<<bit)) {
16054             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
16055                 continue;
16056             }
16057             if (!set++ && lead)
16058                 PerlIO_printf(Perl_debug_log, "%s",lead);
16059             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16060         }
16061     }
16062     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16063             if (!set++ && lead) {
16064                 PerlIO_printf(Perl_debug_log, "%s",lead);
16065             }
16066             switch (cs) {
16067                 case REGEX_UNICODE_CHARSET:
16068                     PerlIO_printf(Perl_debug_log, "UNICODE");
16069                     break;
16070                 case REGEX_LOCALE_CHARSET:
16071                     PerlIO_printf(Perl_debug_log, "LOCALE");
16072                     break;
16073                 case REGEX_ASCII_RESTRICTED_CHARSET:
16074                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16075                     break;
16076                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16077                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16078                     break;
16079                 default:
16080                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16081                     break;
16082             }
16083     }
16084     if (lead)  {
16085         if (set)
16086             PerlIO_printf(Perl_debug_log, "\n");
16087         else
16088             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16089     }
16090 }
16091 #endif
16092
16093 void
16094 Perl_regdump(pTHX_ const regexp *r)
16095 {
16096 #ifdef DEBUGGING
16097     SV * const sv = sv_newmortal();
16098     SV *dsv= sv_newmortal();
16099     RXi_GET_DECL(r,ri);
16100     GET_RE_DEBUG_FLAGS_DECL;
16101
16102     PERL_ARGS_ASSERT_REGDUMP;
16103
16104     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16105
16106     /* Header fields of interest. */
16107     if (r->anchored_substr) {
16108         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16109             RE_SV_DUMPLEN(r->anchored_substr), 30);
16110         PerlIO_printf(Perl_debug_log,
16111                       "anchored %s%s at %"IVdf" ",
16112                       s, RE_SV_TAIL(r->anchored_substr),
16113                       (IV)r->anchored_offset);
16114     } else if (r->anchored_utf8) {
16115         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16116             RE_SV_DUMPLEN(r->anchored_utf8), 30);
16117         PerlIO_printf(Perl_debug_log,
16118                       "anchored utf8 %s%s at %"IVdf" ",
16119                       s, RE_SV_TAIL(r->anchored_utf8),
16120                       (IV)r->anchored_offset);
16121     }
16122     if (r->float_substr) {
16123         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16124             RE_SV_DUMPLEN(r->float_substr), 30);
16125         PerlIO_printf(Perl_debug_log,
16126                       "floating %s%s at %"IVdf"..%"UVuf" ",
16127                       s, RE_SV_TAIL(r->float_substr),
16128                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16129     } else if (r->float_utf8) {
16130         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16131             RE_SV_DUMPLEN(r->float_utf8), 30);
16132         PerlIO_printf(Perl_debug_log,
16133                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16134                       s, RE_SV_TAIL(r->float_utf8),
16135                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16136     }
16137     if (r->check_substr || r->check_utf8)
16138         PerlIO_printf(Perl_debug_log,
16139                       (const char *)
16140                       (r->check_substr == r->float_substr
16141                        && r->check_utf8 == r->float_utf8
16142                        ? "(checking floating" : "(checking anchored"));
16143     if (r->intflags & PREGf_NOSCAN)
16144         PerlIO_printf(Perl_debug_log, " noscan");
16145     if (r->extflags & RXf_CHECK_ALL)
16146         PerlIO_printf(Perl_debug_log, " isall");
16147     if (r->check_substr || r->check_utf8)
16148         PerlIO_printf(Perl_debug_log, ") ");
16149
16150     if (ri->regstclass) {
16151         regprop(r, sv, ri->regstclass, NULL, NULL);
16152         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16153     }
16154     if (r->intflags & PREGf_ANCH) {
16155         PerlIO_printf(Perl_debug_log, "anchored");
16156         if (r->intflags & PREGf_ANCH_MBOL)
16157             PerlIO_printf(Perl_debug_log, "(MBOL)");
16158         if (r->intflags & PREGf_ANCH_SBOL)
16159             PerlIO_printf(Perl_debug_log, "(SBOL)");
16160         if (r->intflags & PREGf_ANCH_GPOS)
16161             PerlIO_printf(Perl_debug_log, "(GPOS)");
16162         PerlIO_putc(Perl_debug_log, ' ');
16163     }
16164     if (r->intflags & PREGf_GPOS_SEEN)
16165         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16166     if (r->intflags & PREGf_SKIP)
16167         PerlIO_printf(Perl_debug_log, "plus ");
16168     if (r->intflags & PREGf_IMPLICIT)
16169         PerlIO_printf(Perl_debug_log, "implicit ");
16170     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16171     if (r->extflags & RXf_EVAL_SEEN)
16172         PerlIO_printf(Perl_debug_log, "with eval ");
16173     PerlIO_printf(Perl_debug_log, "\n");
16174     DEBUG_FLAGS_r({
16175         regdump_extflags("r->extflags: ",r->extflags);
16176         regdump_intflags("r->intflags: ",r->intflags);
16177     });
16178 #else
16179     PERL_ARGS_ASSERT_REGDUMP;
16180     PERL_UNUSED_CONTEXT;
16181     PERL_UNUSED_ARG(r);
16182 #endif  /* DEBUGGING */
16183 }
16184
16185 /*
16186 - regprop - printable representation of opcode, with run time support
16187 */
16188
16189 void
16190 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16191 {
16192 #ifdef DEBUGGING
16193     int k;
16194
16195     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16196     static const char * const anyofs[] = {
16197 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16198     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
16199     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
16200     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
16201     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
16202     || _CC_VERTSPACE != 16
16203   #error Need to adjust order of anyofs[]
16204 #endif
16205         "\\w",
16206         "\\W",
16207         "\\d",
16208         "\\D",
16209         "[:alpha:]",
16210         "[:^alpha:]",
16211         "[:lower:]",
16212         "[:^lower:]",
16213         "[:upper:]",
16214         "[:^upper:]",
16215         "[:punct:]",
16216         "[:^punct:]",
16217         "[:print:]",
16218         "[:^print:]",
16219         "[:alnum:]",
16220         "[:^alnum:]",
16221         "[:graph:]",
16222         "[:^graph:]",
16223         "[:cased:]",
16224         "[:^cased:]",
16225         "\\s",
16226         "\\S",
16227         "[:blank:]",
16228         "[:^blank:]",
16229         "[:xdigit:]",
16230         "[:^xdigit:]",
16231         "[:space:]",
16232         "[:^space:]",
16233         "[:cntrl:]",
16234         "[:^cntrl:]",
16235         "[:ascii:]",
16236         "[:^ascii:]",
16237         "\\v",
16238         "\\V"
16239     };
16240     RXi_GET_DECL(prog,progi);
16241     GET_RE_DEBUG_FLAGS_DECL;
16242
16243     PERL_ARGS_ASSERT_REGPROP;
16244
16245     sv_setpvn(sv, "", 0);
16246
16247     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
16248         /* It would be nice to FAIL() here, but this may be called from
16249            regexec.c, and it would be hard to supply pRExC_state. */
16250         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16251                                               (int)OP(o), (int)REGNODE_MAX);
16252     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16253
16254     k = PL_regkind[OP(o)];
16255
16256     if (k == EXACT) {
16257         sv_catpvs(sv, " ");
16258         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16259          * is a crude hack but it may be the best for now since
16260          * we have no flag "this EXACTish node was UTF-8"
16261          * --jhi */
16262         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16263                   PERL_PV_ESCAPE_UNI_DETECT |
16264                   PERL_PV_ESCAPE_NONASCII   |
16265                   PERL_PV_PRETTY_ELLIPSES   |
16266                   PERL_PV_PRETTY_LTGT       |
16267                   PERL_PV_PRETTY_NOCLEAR
16268                   );
16269     } else if (k == TRIE) {
16270         /* print the details of the trie in dumpuntil instead, as
16271          * progi->data isn't available here */
16272         const char op = OP(o);
16273         const U32 n = ARG(o);
16274         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16275                (reg_ac_data *)progi->data->data[n] :
16276                NULL;
16277         const reg_trie_data * const trie
16278             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16279
16280         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16281         DEBUG_TRIE_COMPILE_r(
16282           Perl_sv_catpvf(aTHX_ sv,
16283             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16284             (UV)trie->startstate,
16285             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16286             (UV)trie->wordcount,
16287             (UV)trie->minlen,
16288             (UV)trie->maxlen,
16289             (UV)TRIE_CHARCOUNT(trie),
16290             (UV)trie->uniquecharcount
16291           );
16292         );
16293         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16294             sv_catpvs(sv, "[");
16295             (void) put_charclass_bitmap_innards(sv,
16296                                                 (IS_ANYOF_TRIE(op))
16297                                                  ? ANYOF_BITMAP(o)
16298                                                  : TRIE_BITMAP(trie),
16299                                                 NULL);
16300             sv_catpvs(sv, "]");
16301         }
16302
16303     } else if (k == CURLY) {
16304         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16305             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16306         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16307     }
16308     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
16309         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16310     else if (k == REF || k == OPEN || k == CLOSE
16311              || k == GROUPP || OP(o)==ACCEPT)
16312     {
16313         AV *name_list= NULL;
16314         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
16315         if ( RXp_PAREN_NAMES(prog) ) {
16316             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16317         } else if ( pRExC_state ) {
16318             name_list= RExC_paren_name_list;
16319         }
16320         if (name_list) {
16321             if ( k != REF || (OP(o) < NREF)) {
16322                 SV **name= av_fetch(name_list, ARG(o), 0 );
16323                 if (name)
16324                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16325             }
16326             else {
16327                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16328                 I32 *nums=(I32*)SvPVX(sv_dat);
16329                 SV **name= av_fetch(name_list, nums[0], 0 );
16330                 I32 n;
16331                 if (name) {
16332                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
16333                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16334                                     (n ? "," : ""), (IV)nums[n]);
16335                     }
16336                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16337                 }
16338             }
16339         }
16340         if ( k == REF && reginfo) {
16341             U32 n = ARG(o);  /* which paren pair */
16342             I32 ln = prog->offs[n].start;
16343             if (prog->lastparen < n || ln == -1)
16344                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16345             else if (ln == prog->offs[n].end)
16346                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16347             else {
16348                 const char *s = reginfo->strbeg + ln;
16349                 Perl_sv_catpvf(aTHX_ sv, ": ");
16350                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16351                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16352             }
16353         }
16354     } else if (k == GOSUB) {
16355         AV *name_list= NULL;
16356         if ( RXp_PAREN_NAMES(prog) ) {
16357             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16358         } else if ( pRExC_state ) {
16359             name_list= RExC_paren_name_list;
16360         }
16361
16362         /* Paren and offset */
16363         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16364         if (name_list) {
16365             SV **name= av_fetch(name_list, ARG(o), 0 );
16366             if (name)
16367                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16368         }
16369     }
16370     else if (k == VERB) {
16371         if (!o->flags)
16372             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16373                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16374     } else if (k == LOGICAL)
16375         /* 2: embedded, otherwise 1 */
16376         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16377     else if (k == ANYOF) {
16378         const U8 flags = ANYOF_FLAGS(o);
16379         int do_sep = 0;
16380         SV* bitmap_invlist;  /* Will hold what the bit map contains */
16381
16382
16383         if (flags & ANYOF_LOCALE_FLAGS)
16384             sv_catpvs(sv, "{loc}");
16385         if (flags & ANYOF_LOC_FOLD)
16386             sv_catpvs(sv, "{i}");
16387         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16388         if (flags & ANYOF_INVERT)
16389             sv_catpvs(sv, "^");
16390
16391         /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16392          * */
16393         do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16394                                                             &bitmap_invlist);
16395
16396         /* output any special charclass tests (used entirely under use
16397          * locale) * */
16398         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16399             int i;
16400             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16401                 if (ANYOF_POSIXL_TEST(o,i)) {
16402                     sv_catpv(sv, anyofs[i]);
16403                     do_sep = 1;
16404                 }
16405             }
16406         }
16407
16408         if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16409                       |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16410                       |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16411                       |ANYOF_LOC_FOLD)))
16412         {
16413             if (do_sep) {
16414                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16415                 if (flags & ANYOF_INVERT)
16416                     /*make sure the invert info is in each */
16417                     sv_catpvs(sv, "^");
16418             }
16419
16420             if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16421                 sv_catpvs(sv, "{non-utf8-latin1-all}");
16422             }
16423
16424             /* output information about the unicode matching */
16425             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16426                 sv_catpvs(sv, "{above_bitmap_all}");
16427             else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16428                 SV *lv; /* Set if there is something outside the bit map. */
16429                 bool byte_output = FALSE;   /* If something in the bitmap has
16430                                                been output */
16431                 SV *only_utf8_locale;
16432
16433                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
16434                  * is used to guarantee that nothing in the bitmap gets
16435                  * returned */
16436                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16437                                                     &lv, &only_utf8_locale,
16438                                                     bitmap_invlist);
16439                 if (lv && lv != &PL_sv_undef) {
16440                     char *s = savesvpv(lv);
16441                     char * const origs = s;
16442
16443                     while (*s && *s != '\n')
16444                         s++;
16445
16446                     if (*s == '\n') {
16447                         const char * const t = ++s;
16448
16449                         if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16450                             sv_catpvs(sv, "{outside bitmap}");
16451                         }
16452                         else {
16453                             sv_catpvs(sv, "{utf8}");
16454                         }
16455
16456                         if (byte_output) {
16457                             sv_catpvs(sv, " ");
16458                         }
16459
16460                         while (*s) {
16461                             if (*s == '\n') {
16462
16463                                 /* Truncate very long output */
16464                                 if (s - origs > 256) {
16465                                     Perl_sv_catpvf(aTHX_ sv,
16466                                                 "%.*s...",
16467                                                 (int) (s - origs - 1),
16468                                                 t);
16469                                     goto out_dump;
16470                                 }
16471                                 *s = ' ';
16472                             }
16473                             else if (*s == '\t') {
16474                                 *s = '-';
16475                             }
16476                             s++;
16477                         }
16478                         if (s[-1] == ' ')
16479                             s[-1] = 0;
16480
16481                         sv_catpv(sv, t);
16482                     }
16483
16484                 out_dump:
16485
16486                     Safefree(origs);
16487                     SvREFCNT_dec_NN(lv);
16488                 }
16489
16490                 if ((flags & ANYOF_LOC_FOLD)
16491                      && only_utf8_locale
16492                      && only_utf8_locale != &PL_sv_undef)
16493                 {
16494                     UV start, end;
16495                     int max_entries = 256;
16496
16497                     sv_catpvs(sv, "{utf8 locale}");
16498                     invlist_iterinit(only_utf8_locale);
16499                     while (invlist_iternext(only_utf8_locale,
16500                                             &start, &end)) {
16501                         put_range(sv, start, end, FALSE);
16502                         max_entries --;
16503                         if (max_entries < 0) {
16504                             sv_catpvs(sv, "...");
16505                             break;
16506                         }
16507                     }
16508                     invlist_iterfinish(only_utf8_locale);
16509                 }
16510             }
16511         }
16512         SvREFCNT_dec(bitmap_invlist);
16513
16514
16515         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16516     }
16517     else if (k == POSIXD || k == NPOSIXD) {
16518         U8 index = FLAGS(o) * 2;
16519         if (index < C_ARRAY_LENGTH(anyofs)) {
16520             if (*anyofs[index] != '[')  {
16521                 sv_catpv(sv, "[");
16522             }
16523             sv_catpv(sv, anyofs[index]);
16524             if (*anyofs[index] != '[')  {
16525                 sv_catpv(sv, "]");
16526             }
16527         }
16528         else {
16529             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16530         }
16531     }
16532     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16533         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16534     else if (OP(o) == SBOL)
16535         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16536 #else
16537     PERL_UNUSED_CONTEXT;
16538     PERL_UNUSED_ARG(sv);
16539     PERL_UNUSED_ARG(o);
16540     PERL_UNUSED_ARG(prog);
16541     PERL_UNUSED_ARG(reginfo);
16542     PERL_UNUSED_ARG(pRExC_state);
16543 #endif  /* DEBUGGING */
16544 }
16545
16546
16547
16548 SV *
16549 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16550 {                               /* Assume that RE_INTUIT is set */
16551     struct regexp *const prog = ReANY(r);
16552     GET_RE_DEBUG_FLAGS_DECL;
16553
16554     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16555     PERL_UNUSED_CONTEXT;
16556
16557     DEBUG_COMPILE_r(
16558         {
16559             const char * const s = SvPV_nolen_const(prog->check_substr
16560                       ? prog->check_substr : prog->check_utf8);
16561
16562             if (!PL_colorset) reginitcolors();
16563             PerlIO_printf(Perl_debug_log,
16564                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16565                       PL_colors[4],
16566                       prog->check_substr ? "" : "utf8 ",
16567                       PL_colors[5],PL_colors[0],
16568                       s,
16569                       PL_colors[1],
16570                       (strlen(s) > 60 ? "..." : ""));
16571         } );
16572
16573     return prog->check_substr ? prog->check_substr : prog->check_utf8;
16574 }
16575
16576 /*
16577    pregfree()
16578
16579    handles refcounting and freeing the perl core regexp structure. When
16580    it is necessary to actually free the structure the first thing it
16581    does is call the 'free' method of the regexp_engine associated to
16582    the regexp, allowing the handling of the void *pprivate; member
16583    first. (This routine is not overridable by extensions, which is why
16584    the extensions free is called first.)
16585
16586    See regdupe and regdupe_internal if you change anything here.
16587 */
16588 #ifndef PERL_IN_XSUB_RE
16589 void
16590 Perl_pregfree(pTHX_ REGEXP *r)
16591 {
16592     SvREFCNT_dec(r);
16593 }
16594
16595 void
16596 Perl_pregfree2(pTHX_ REGEXP *rx)
16597 {
16598     struct regexp *const r = ReANY(rx);
16599     GET_RE_DEBUG_FLAGS_DECL;
16600
16601     PERL_ARGS_ASSERT_PREGFREE2;
16602
16603     if (r->mother_re) {
16604         ReREFCNT_dec(r->mother_re);
16605     } else {
16606         CALLREGFREE_PVT(rx); /* free the private data */
16607         SvREFCNT_dec(RXp_PAREN_NAMES(r));
16608         Safefree(r->xpv_len_u.xpvlenu_pv);
16609     }
16610     if (r->substrs) {
16611         SvREFCNT_dec(r->anchored_substr);
16612         SvREFCNT_dec(r->anchored_utf8);
16613         SvREFCNT_dec(r->float_substr);
16614         SvREFCNT_dec(r->float_utf8);
16615         Safefree(r->substrs);
16616     }
16617     RX_MATCH_COPY_FREE(rx);
16618 #ifdef PERL_ANY_COW
16619     SvREFCNT_dec(r->saved_copy);
16620 #endif
16621     Safefree(r->offs);
16622     SvREFCNT_dec(r->qr_anoncv);
16623     rx->sv_u.svu_rx = 0;
16624 }
16625
16626 /*  reg_temp_copy()
16627
16628     This is a hacky workaround to the structural issue of match results
16629     being stored in the regexp structure which is in turn stored in
16630     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16631     could be PL_curpm in multiple contexts, and could require multiple
16632     result sets being associated with the pattern simultaneously, such
16633     as when doing a recursive match with (??{$qr})
16634
16635     The solution is to make a lightweight copy of the regexp structure
16636     when a qr// is returned from the code executed by (??{$qr}) this
16637     lightweight copy doesn't actually own any of its data except for
16638     the starp/end and the actual regexp structure itself.
16639
16640 */
16641
16642
16643 REGEXP *
16644 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16645 {
16646     struct regexp *ret;
16647     struct regexp *const r = ReANY(rx);
16648     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16649
16650     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16651
16652     if (!ret_x)
16653         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16654     else {
16655         SvOK_off((SV *)ret_x);
16656         if (islv) {
16657             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16658                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16659                made both spots point to the same regexp body.) */
16660             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16661             assert(!SvPVX(ret_x));
16662             ret_x->sv_u.svu_rx = temp->sv_any;
16663             temp->sv_any = NULL;
16664             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16665             SvREFCNT_dec_NN(temp);
16666             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16667                ing below will not set it. */
16668             SvCUR_set(ret_x, SvCUR(rx));
16669         }
16670     }
16671     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16672        sv_force_normal(sv) is called.  */
16673     SvFAKE_on(ret_x);
16674     ret = ReANY(ret_x);
16675
16676     SvFLAGS(ret_x) |= SvUTF8(rx);
16677     /* We share the same string buffer as the original regexp, on which we
16678        hold a reference count, incremented when mother_re is set below.
16679        The string pointer is copied here, being part of the regexp struct.
16680      */
16681     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16682            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16683     if (r->offs) {
16684         const I32 npar = r->nparens+1;
16685         Newx(ret->offs, npar, regexp_paren_pair);
16686         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16687     }
16688     if (r->substrs) {
16689         Newx(ret->substrs, 1, struct reg_substr_data);
16690         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16691
16692         SvREFCNT_inc_void(ret->anchored_substr);
16693         SvREFCNT_inc_void(ret->anchored_utf8);
16694         SvREFCNT_inc_void(ret->float_substr);
16695         SvREFCNT_inc_void(ret->float_utf8);
16696
16697         /* check_substr and check_utf8, if non-NULL, point to either their
16698            anchored or float namesakes, and don't hold a second reference.  */
16699     }
16700     RX_MATCH_COPIED_off(ret_x);
16701 #ifdef PERL_ANY_COW
16702     ret->saved_copy = NULL;
16703 #endif
16704     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16705     SvREFCNT_inc_void(ret->qr_anoncv);
16706
16707     return ret_x;
16708 }
16709 #endif
16710
16711 /* regfree_internal()
16712
16713    Free the private data in a regexp. This is overloadable by
16714    extensions. Perl takes care of the regexp structure in pregfree(),
16715    this covers the *pprivate pointer which technically perl doesn't
16716    know about, however of course we have to handle the
16717    regexp_internal structure when no extension is in use.
16718
16719    Note this is called before freeing anything in the regexp
16720    structure.
16721  */
16722
16723 void
16724 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16725 {
16726     struct regexp *const r = ReANY(rx);
16727     RXi_GET_DECL(r,ri);
16728     GET_RE_DEBUG_FLAGS_DECL;
16729
16730     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16731
16732     DEBUG_COMPILE_r({
16733         if (!PL_colorset)
16734             reginitcolors();
16735         {
16736             SV *dsv= sv_newmortal();
16737             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16738                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16739             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16740                 PL_colors[4],PL_colors[5],s);
16741         }
16742     });
16743 #ifdef RE_TRACK_PATTERN_OFFSETS
16744     if (ri->u.offsets)
16745         Safefree(ri->u.offsets);             /* 20010421 MJD */
16746 #endif
16747     if (ri->code_blocks) {
16748         int n;
16749         for (n = 0; n < ri->num_code_blocks; n++)
16750             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16751         Safefree(ri->code_blocks);
16752     }
16753
16754     if (ri->data) {
16755         int n = ri->data->count;
16756
16757         while (--n >= 0) {
16758           /* If you add a ->what type here, update the comment in regcomp.h */
16759             switch (ri->data->what[n]) {
16760             case 'a':
16761             case 'r':
16762             case 's':
16763             case 'S':
16764             case 'u':
16765                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16766                 break;
16767             case 'f':
16768                 Safefree(ri->data->data[n]);
16769                 break;
16770             case 'l':
16771             case 'L':
16772                 break;
16773             case 'T':
16774                 { /* Aho Corasick add-on structure for a trie node.
16775                      Used in stclass optimization only */
16776                     U32 refcount;
16777                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16778 #ifdef USE_ITHREADS
16779                     dVAR;
16780 #endif
16781                     OP_REFCNT_LOCK;
16782                     refcount = --aho->refcount;
16783                     OP_REFCNT_UNLOCK;
16784                     if ( !refcount ) {
16785                         PerlMemShared_free(aho->states);
16786                         PerlMemShared_free(aho->fail);
16787                          /* do this last!!!! */
16788                         PerlMemShared_free(ri->data->data[n]);
16789                         /* we should only ever get called once, so
16790                          * assert as much, and also guard the free
16791                          * which /might/ happen twice. At the least
16792                          * it will make code anlyzers happy and it
16793                          * doesn't cost much. - Yves */
16794                         assert(ri->regstclass);
16795                         if (ri->regstclass) {
16796                             PerlMemShared_free(ri->regstclass);
16797                             ri->regstclass = 0;
16798                         }
16799                     }
16800                 }
16801                 break;
16802             case 't':
16803                 {
16804                     /* trie structure. */
16805                     U32 refcount;
16806                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16807 #ifdef USE_ITHREADS
16808                     dVAR;
16809 #endif
16810                     OP_REFCNT_LOCK;
16811                     refcount = --trie->refcount;
16812                     OP_REFCNT_UNLOCK;
16813                     if ( !refcount ) {
16814                         PerlMemShared_free(trie->charmap);
16815                         PerlMemShared_free(trie->states);
16816                         PerlMemShared_free(trie->trans);
16817                         if (trie->bitmap)
16818                             PerlMemShared_free(trie->bitmap);
16819                         if (trie->jump)
16820                             PerlMemShared_free(trie->jump);
16821                         PerlMemShared_free(trie->wordinfo);
16822                         /* do this last!!!! */
16823                         PerlMemShared_free(ri->data->data[n]);
16824                     }
16825                 }
16826                 break;
16827             default:
16828                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16829                                                     ri->data->what[n]);
16830             }
16831         }
16832         Safefree(ri->data->what);
16833         Safefree(ri->data);
16834     }
16835
16836     Safefree(ri);
16837 }
16838
16839 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16840 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16841 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16842
16843 /*
16844    re_dup - duplicate a regexp.
16845
16846    This routine is expected to clone a given regexp structure. It is only
16847    compiled under USE_ITHREADS.
16848
16849    After all of the core data stored in struct regexp is duplicated
16850    the regexp_engine.dupe method is used to copy any private data
16851    stored in the *pprivate pointer. This allows extensions to handle
16852    any duplication it needs to do.
16853
16854    See pregfree() and regfree_internal() if you change anything here.
16855 */
16856 #if defined(USE_ITHREADS)
16857 #ifndef PERL_IN_XSUB_RE
16858 void
16859 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16860 {
16861     dVAR;
16862     I32 npar;
16863     const struct regexp *r = ReANY(sstr);
16864     struct regexp *ret = ReANY(dstr);
16865
16866     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16867
16868     npar = r->nparens+1;
16869     Newx(ret->offs, npar, regexp_paren_pair);
16870     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16871
16872     if (ret->substrs) {
16873         /* Do it this way to avoid reading from *r after the StructCopy().
16874            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16875            cache, it doesn't matter.  */
16876         const bool anchored = r->check_substr
16877             ? r->check_substr == r->anchored_substr
16878             : r->check_utf8 == r->anchored_utf8;
16879         Newx(ret->substrs, 1, struct reg_substr_data);
16880         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16881
16882         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16883         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16884         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16885         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16886
16887         /* check_substr and check_utf8, if non-NULL, point to either their
16888            anchored or float namesakes, and don't hold a second reference.  */
16889
16890         if (ret->check_substr) {
16891             if (anchored) {
16892                 assert(r->check_utf8 == r->anchored_utf8);
16893                 ret->check_substr = ret->anchored_substr;
16894                 ret->check_utf8 = ret->anchored_utf8;
16895             } else {
16896                 assert(r->check_substr == r->float_substr);
16897                 assert(r->check_utf8 == r->float_utf8);
16898                 ret->check_substr = ret->float_substr;
16899                 ret->check_utf8 = ret->float_utf8;
16900             }
16901         } else if (ret->check_utf8) {
16902             if (anchored) {
16903                 ret->check_utf8 = ret->anchored_utf8;
16904             } else {
16905                 ret->check_utf8 = ret->float_utf8;
16906             }
16907         }
16908     }
16909
16910     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16911     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16912
16913     if (ret->pprivate)
16914         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16915
16916     if (RX_MATCH_COPIED(dstr))
16917         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16918     else
16919         ret->subbeg = NULL;
16920 #ifdef PERL_ANY_COW
16921     ret->saved_copy = NULL;
16922 #endif
16923
16924     /* Whether mother_re be set or no, we need to copy the string.  We
16925        cannot refrain from copying it when the storage points directly to
16926        our mother regexp, because that's
16927                1: a buffer in a different thread
16928                2: something we no longer hold a reference on
16929                so we need to copy it locally.  */
16930     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16931     ret->mother_re   = NULL;
16932 }
16933 #endif /* PERL_IN_XSUB_RE */
16934
16935 /*
16936    regdupe_internal()
16937
16938    This is the internal complement to regdupe() which is used to copy
16939    the structure pointed to by the *pprivate pointer in the regexp.
16940    This is the core version of the extension overridable cloning hook.
16941    The regexp structure being duplicated will be copied by perl prior
16942    to this and will be provided as the regexp *r argument, however
16943    with the /old/ structures pprivate pointer value. Thus this routine
16944    may override any copying normally done by perl.
16945
16946    It returns a pointer to the new regexp_internal structure.
16947 */
16948
16949 void *
16950 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16951 {
16952     dVAR;
16953     struct regexp *const r = ReANY(rx);
16954     regexp_internal *reti;
16955     int len;
16956     RXi_GET_DECL(r,ri);
16957
16958     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16959
16960     len = ProgLen(ri);
16961
16962     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16963           char, regexp_internal);
16964     Copy(ri->program, reti->program, len+1, regnode);
16965
16966     reti->num_code_blocks = ri->num_code_blocks;
16967     if (ri->code_blocks) {
16968         int n;
16969         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16970                 struct reg_code_block);
16971         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16972                 struct reg_code_block);
16973         for (n = 0; n < ri->num_code_blocks; n++)
16974              reti->code_blocks[n].src_regex = (REGEXP*)
16975                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16976     }
16977     else
16978         reti->code_blocks = NULL;
16979
16980     reti->regstclass = NULL;
16981
16982     if (ri->data) {
16983         struct reg_data *d;
16984         const int count = ri->data->count;
16985         int i;
16986
16987         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16988                 char, struct reg_data);
16989         Newx(d->what, count, U8);
16990
16991         d->count = count;
16992         for (i = 0; i < count; i++) {
16993             d->what[i] = ri->data->what[i];
16994             switch (d->what[i]) {
16995                 /* see also regcomp.h and regfree_internal() */
16996             case 'a': /* actually an AV, but the dup function is identical.  */
16997             case 'r':
16998             case 's':
16999             case 'S':
17000             case 'u': /* actually an HV, but the dup function is identical.  */
17001                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17002                 break;
17003             case 'f':
17004                 /* This is cheating. */
17005                 Newx(d->data[i], 1, regnode_ssc);
17006                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17007                 reti->regstclass = (regnode*)d->data[i];
17008                 break;
17009             case 'T':
17010                 /* Trie stclasses are readonly and can thus be shared
17011                  * without duplication. We free the stclass in pregfree
17012                  * when the corresponding reg_ac_data struct is freed.
17013                  */
17014                 reti->regstclass= ri->regstclass;
17015                 /* FALLTHROUGH */
17016             case 't':
17017                 OP_REFCNT_LOCK;
17018                 ((reg_trie_data*)ri->data->data[i])->refcount++;
17019                 OP_REFCNT_UNLOCK;
17020                 /* FALLTHROUGH */
17021             case 'l':
17022             case 'L':
17023                 d->data[i] = ri->data->data[i];
17024                 break;
17025             default:
17026                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17027                                                            ri->data->what[i]);
17028             }
17029         }
17030
17031         reti->data = d;
17032     }
17033     else
17034         reti->data = NULL;
17035
17036     reti->name_list_idx = ri->name_list_idx;
17037
17038 #ifdef RE_TRACK_PATTERN_OFFSETS
17039     if (ri->u.offsets) {
17040         Newx(reti->u.offsets, 2*len+1, U32);
17041         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17042     }
17043 #else
17044     SetProgLen(reti,len);
17045 #endif
17046
17047     return (void*)reti;
17048 }
17049
17050 #endif    /* USE_ITHREADS */
17051
17052 #ifndef PERL_IN_XSUB_RE
17053
17054 /*
17055  - regnext - dig the "next" pointer out of a node
17056  */
17057 regnode *
17058 Perl_regnext(pTHX_ regnode *p)
17059 {
17060     I32 offset;
17061
17062     if (!p)
17063         return(NULL);
17064
17065     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
17066         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17067                                                 (int)OP(p), (int)REGNODE_MAX);
17068     }
17069
17070     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17071     if (offset == 0)
17072         return(NULL);
17073
17074     return(p+offset);
17075 }
17076 #endif
17077
17078 STATIC void
17079 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17080 {
17081     va_list args;
17082     STRLEN l1 = strlen(pat1);
17083     STRLEN l2 = strlen(pat2);
17084     char buf[512];
17085     SV *msv;
17086     const char *message;
17087
17088     PERL_ARGS_ASSERT_RE_CROAK2;
17089
17090     if (l1 > 510)
17091         l1 = 510;
17092     if (l1 + l2 > 510)
17093         l2 = 510 - l1;
17094     Copy(pat1, buf, l1 , char);
17095     Copy(pat2, buf + l1, l2 , char);
17096     buf[l1 + l2] = '\n';
17097     buf[l1 + l2 + 1] = '\0';
17098     va_start(args, pat2);
17099     msv = vmess(buf, &args);
17100     va_end(args);
17101     message = SvPV_const(msv,l1);
17102     if (l1 > 512)
17103         l1 = 512;
17104     Copy(message, buf, l1 , char);
17105     /* l1-1 to avoid \n */
17106     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17107 }
17108
17109 #ifdef DEBUGGING
17110 /* Certain characters are output as a sequence with the first being a
17111  * backslash. */
17112 #define isBACKSLASHED_PUNCT(c)                                              \
17113                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17114
17115 STATIC void
17116 S_put_code_point(pTHX_ SV *sv, UV c)
17117 {
17118     PERL_ARGS_ASSERT_PUT_CODE_POINT;
17119
17120     if (c > 255) {
17121         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17122     }
17123     else if (isPRINT(c)) {
17124         const char string = (char) c;
17125         if (isBACKSLASHED_PUNCT(c))
17126             sv_catpvs(sv, "\\");
17127         sv_catpvn(sv, &string, 1);
17128     }
17129     else {
17130         const char * const mnemonic = cntrl_to_mnemonic((char) c);
17131         if (mnemonic) {
17132             Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17133         }
17134         else {
17135             Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17136         }
17137     }
17138 }
17139
17140 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17141
17142 STATIC void
17143 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17144 {
17145     /* Appends to 'sv' a displayable version of the range of code points from
17146      * 'start' to 'end'.  It assumes that only ASCII printables are displayable
17147      * as-is (though some of these will be escaped by put_code_point()). */
17148
17149     const unsigned int min_range_count = 3;
17150
17151     assert(start <= end);
17152
17153     PERL_ARGS_ASSERT_PUT_RANGE;
17154
17155     while (start <= end) {
17156         UV this_end;
17157         const char * format;
17158
17159         if (end - start < min_range_count) {
17160
17161             /* Individual chars in short ranges */
17162             for (; start <= end; start++) {
17163                 put_code_point(sv, start);
17164             }
17165             break;
17166         }
17167
17168         /* If permitted by the input options, and there is a possibility that
17169          * this range contains a printable literal, look to see if there is
17170          * one.  */
17171         if (allow_literals && start <= MAX_PRINT_A) {
17172
17173             /* If the range begin isn't an ASCII printable, effectively split
17174              * the range into two parts:
17175              *  1) the portion before the first such printable,
17176              *  2) the rest
17177              * and output them separately. */
17178             if (! isPRINT_A(start)) {
17179                 UV temp_end = start + 1;
17180
17181                 /* There is no point looking beyond the final possible
17182                  * printable, in MAX_PRINT_A */
17183                 UV max = MIN(end, MAX_PRINT_A);
17184
17185                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17186                     temp_end++;
17187                 }
17188
17189                 /* Here, temp_end points to one beyond the first printable if
17190                  * found, or to one beyond 'max' if not.  If none found, make
17191                  * sure that we use the entire range */
17192                 if (temp_end > MAX_PRINT_A) {
17193                     temp_end = end + 1;
17194                 }
17195
17196                 /* Output the first part of the split range, the part that
17197                  * doesn't have printables, with no looking for literals
17198                  * (otherwise we would infinitely recurse) */
17199                 put_range(sv, start, temp_end - 1, FALSE);
17200
17201                 /* The 2nd part of the range (if any) starts here. */
17202                 start = temp_end;
17203
17204                 /* We continue instead of dropping down because even if the 2nd
17205                  * part is non-empty, it could be so short that we want to
17206                  * output it specially, as tested for at the top of this loop.
17207                  * */
17208                 continue;
17209             }
17210
17211             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
17212              * output a sub-range of just the digits or letters, then process
17213              * the remaining portion as usual. */
17214             if (isALPHANUMERIC_A(start)) {
17215                 UV mask = (isDIGIT_A(start))
17216                            ? _CC_DIGIT
17217                              : isUPPER_A(start)
17218                                ? _CC_UPPER
17219                                : _CC_LOWER;
17220                 UV temp_end = start + 1;
17221
17222                 /* Find the end of the sub-range that includes just the
17223                  * characters in the same class as the first character in it */
17224                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17225                     temp_end++;
17226                 }
17227                 temp_end--;
17228
17229                 /* For short ranges, don't duplicate the code above to output
17230                  * them; just call recursively */
17231                 if (temp_end - start < min_range_count) {
17232                     put_range(sv, start, temp_end, FALSE);
17233                 }
17234                 else {  /* Output as a range */
17235                     put_code_point(sv, start);
17236                     sv_catpvs(sv, "-");
17237                     put_code_point(sv, temp_end);
17238                 }
17239                 start = temp_end + 1;
17240                 continue;
17241             }
17242
17243             /* We output any other printables as individual characters */
17244             if (isPUNCT_A(start) || isSPACE_A(start)) {
17245                 while (start <= end && (isPUNCT_A(start)
17246                                         || isSPACE_A(start)))
17247                 {
17248                     put_code_point(sv, start);
17249                     start++;
17250                 }
17251                 continue;
17252             }
17253         } /* End of looking for literals */
17254
17255         /* Here is not to output as a literal.  Some control characters have
17256          * mnemonic names.  Split off any of those at the beginning and end of
17257          * the range to print mnemonically.  It isn't possible for many of
17258          * these to be in a row, so this won't overwhelm with output */
17259         while (isMNEMONIC_CNTRL(start) && start <= end) {
17260             put_code_point(sv, start);
17261             start++;
17262         }
17263         if (start < end && isMNEMONIC_CNTRL(end)) {
17264
17265             /* Here, the final character in the range has a mnemonic name.
17266              * Work backwards from the end to find the final non-mnemonic */
17267             UV temp_end = end - 1;
17268             while (isMNEMONIC_CNTRL(temp_end)) {
17269                 temp_end--;
17270             }
17271
17272             /* And separately output the range that doesn't have mnemonics */
17273             put_range(sv, start, temp_end, FALSE);
17274
17275             /* Then output the mnemonic trailing controls */
17276             start = temp_end + 1;
17277             while (start <= end) {
17278                 put_code_point(sv, start);
17279                 start++;
17280             }
17281             break;
17282         }
17283
17284         /* As a final resort, output the range or subrange as hex. */
17285
17286         this_end = (end < NUM_ANYOF_CODE_POINTS)
17287                     ? end
17288                     : NUM_ANYOF_CODE_POINTS - 1;
17289         format = (this_end < 256)
17290                  ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17291                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17292         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17293         break;
17294     }
17295 }
17296
17297 STATIC bool
17298 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17299 {
17300     /* Appends to 'sv' a displayable version of the innards of the bracketed
17301      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
17302      * output anything, and bitmap_invlist, if not NULL, will point to an
17303      * inversion list of what is in the bit map */
17304
17305     int i;
17306     UV start, end;
17307     unsigned int punct_count = 0;
17308     SV* invlist = NULL;
17309     SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
17310     bool allow_literals = TRUE;
17311
17312     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17313
17314     invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17315
17316     /* Worst case is exactly every-other code point is in the list */
17317     *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17318
17319     /* Convert the bit map to an inversion list, keeping track of how many
17320      * ASCII puncts are set, including an extra amount for the backslashed
17321      * ones.  */
17322     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17323         if (BITMAP_TEST(bitmap, i)) {
17324             *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17325             if (isPUNCT_A(i)) {
17326                 punct_count++;
17327                 if isBACKSLASHED_PUNCT(i) {
17328                     punct_count++;
17329                 }
17330             }
17331         }
17332     }
17333
17334     /* Nothing to output */
17335     if (_invlist_len(*invlist_ptr) == 0) {
17336         SvREFCNT_dec(invlist);
17337         return FALSE;
17338     }
17339
17340     /* Generally, it is more readable if printable characters are output as
17341      * literals, but if a range (nearly) spans all of them, it's best to output
17342      * it as a single range.  This code will use a single range if all but 2
17343      * printables are in it */
17344     invlist_iterinit(*invlist_ptr);
17345     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17346
17347         /* If range starts beyond final printable, it doesn't have any in it */
17348         if (start > MAX_PRINT_A) {
17349             break;
17350         }
17351
17352         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
17353          * all but two, the range must start and end no later than 2 from
17354          * either end */
17355         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17356             if (end > MAX_PRINT_A) {
17357                 end = MAX_PRINT_A;
17358             }
17359             if (start < ' ') {
17360                 start = ' ';
17361             }
17362             if (end - start >= MAX_PRINT_A - ' ' - 2) {
17363                 allow_literals = FALSE;
17364             }
17365             break;
17366         }
17367     }
17368     invlist_iterfinish(*invlist_ptr);
17369
17370     /* The legibility of the output depends mostly on how many punctuation
17371      * characters are output.  There are 32 possible ASCII ones, and some have
17372      * an additional backslash, bringing it to currently 36, so if any more
17373      * than 18 are to be output, we can instead output it as its complement,
17374      * yielding fewer puncts, and making it more legible.  But give some weight
17375      * to the fact that outputting it as a complement is less legible than a
17376      * straight output, so don't complement unless we are somewhat over the 18
17377      * mark */
17378     if (allow_literals && punct_count > 22) {
17379         sv_catpvs(sv, "^");
17380
17381         /* Add everything remaining to the list, so when we invert it just
17382          * below, it will be excluded */
17383         _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17384         _invlist_invert(*invlist_ptr);
17385     }
17386
17387     /* Here we have figured things out.  Output each range */
17388     invlist_iterinit(*invlist_ptr);
17389     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17390         if (start >= NUM_ANYOF_CODE_POINTS) {
17391             break;
17392         }
17393         put_range(sv, start, end, allow_literals);
17394     }
17395     invlist_iterfinish(*invlist_ptr);
17396
17397     return TRUE;
17398 }
17399
17400 #define CLEAR_OPTSTART \
17401     if (optstart) STMT_START {                                               \
17402         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
17403                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17404         optstart=NULL;                                                       \
17405     } STMT_END
17406
17407 #define DUMPUNTIL(b,e)                                                       \
17408                     CLEAR_OPTSTART;                                          \
17409                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17410
17411 STATIC const regnode *
17412 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17413             const regnode *last, const regnode *plast,
17414             SV* sv, I32 indent, U32 depth)
17415 {
17416     U8 op = PSEUDO;     /* Arbitrary non-END op. */
17417     const regnode *next;
17418     const regnode *optstart= NULL;
17419
17420     RXi_GET_DECL(r,ri);
17421     GET_RE_DEBUG_FLAGS_DECL;
17422
17423     PERL_ARGS_ASSERT_DUMPUNTIL;
17424
17425 #ifdef DEBUG_DUMPUNTIL
17426     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17427         last ? last-start : 0,plast ? plast-start : 0);
17428 #endif
17429
17430     if (plast && plast < last)
17431         last= plast;
17432
17433     while (PL_regkind[op] != END && (!last || node < last)) {
17434         assert(node);
17435         /* While that wasn't END last time... */
17436         NODE_ALIGN(node);
17437         op = OP(node);
17438         if (op == CLOSE || op == WHILEM)
17439             indent--;
17440         next = regnext((regnode *)node);
17441
17442         /* Where, what. */
17443         if (OP(node) == OPTIMIZED) {
17444             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17445                 optstart = node;
17446             else
17447                 goto after_print;
17448         } else
17449             CLEAR_OPTSTART;
17450
17451         regprop(r, sv, node, NULL, NULL);
17452         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17453                       (int)(2*indent + 1), "", SvPVX_const(sv));
17454
17455         if (OP(node) != OPTIMIZED) {
17456             if (next == NULL)           /* Next ptr. */
17457                 PerlIO_printf(Perl_debug_log, " (0)");
17458             else if (PL_regkind[(U8)op] == BRANCH
17459                      && PL_regkind[OP(next)] != BRANCH )
17460                 PerlIO_printf(Perl_debug_log, " (FAIL)");
17461             else
17462                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17463             (void)PerlIO_putc(Perl_debug_log, '\n');
17464         }
17465
17466       after_print:
17467         if (PL_regkind[(U8)op] == BRANCHJ) {
17468             assert(next);
17469             {
17470                 const regnode *nnode = (OP(next) == LONGJMP
17471                                        ? regnext((regnode *)next)
17472                                        : next);
17473                 if (last && nnode > last)
17474                     nnode = last;
17475                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17476             }
17477         }
17478         else if (PL_regkind[(U8)op] == BRANCH) {
17479             assert(next);
17480             DUMPUNTIL(NEXTOPER(node), next);
17481         }
17482         else if ( PL_regkind[(U8)op]  == TRIE ) {
17483             const regnode *this_trie = node;
17484             const char op = OP(node);
17485             const U32 n = ARG(node);
17486             const reg_ac_data * const ac = op>=AHOCORASICK ?
17487                (reg_ac_data *)ri->data->data[n] :
17488                NULL;
17489             const reg_trie_data * const trie =
17490                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17491 #ifdef DEBUGGING
17492             AV *const trie_words
17493                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17494 #endif
17495             const regnode *nextbranch= NULL;
17496             I32 word_idx;
17497             sv_setpvs(sv, "");
17498             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17499                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17500
17501                 PerlIO_printf(Perl_debug_log, "%*s%s ",
17502                    (int)(2*(indent+3)), "",
17503                     elem_ptr
17504                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17505                                 SvCUR(*elem_ptr), 60,
17506                                 PL_colors[0], PL_colors[1],
17507                                 (SvUTF8(*elem_ptr)
17508                                  ? PERL_PV_ESCAPE_UNI
17509                                  : 0)
17510                                 | PERL_PV_PRETTY_ELLIPSES
17511                                 | PERL_PV_PRETTY_LTGT
17512                             )
17513                     : "???"
17514                 );
17515                 if (trie->jump) {
17516                     U16 dist= trie->jump[word_idx+1];
17517                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17518                                (UV)((dist ? this_trie + dist : next) - start));
17519                     if (dist) {
17520                         if (!nextbranch)
17521                             nextbranch= this_trie + trie->jump[0];
17522                         DUMPUNTIL(this_trie + dist, nextbranch);
17523                     }
17524                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17525                         nextbranch= regnext((regnode *)nextbranch);
17526                 } else {
17527                     PerlIO_printf(Perl_debug_log, "\n");
17528                 }
17529             }
17530             if (last && next > last)
17531                 node= last;
17532             else
17533                 node= next;
17534         }
17535         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
17536             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17537                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17538         }
17539         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17540             assert(next);
17541             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17542         }
17543         else if ( op == PLUS || op == STAR) {
17544             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17545         }
17546         else if (PL_regkind[(U8)op] == ANYOF) {
17547             /* arglen 1 + class block */
17548             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17549                           ? ANYOF_POSIXL_SKIP
17550                           : ANYOF_SKIP);
17551             node = NEXTOPER(node);
17552         }
17553         else if (PL_regkind[(U8)op] == EXACT) {
17554             /* Literal string, where present. */
17555             node += NODE_SZ_STR(node) - 1;
17556             node = NEXTOPER(node);
17557         }
17558         else {
17559             node = NEXTOPER(node);
17560             node += regarglen[(U8)op];
17561         }
17562         if (op == CURLYX || op == OPEN)
17563             indent++;
17564     }
17565     CLEAR_OPTSTART;
17566 #ifdef DEBUG_DUMPUNTIL
17567     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17568 #endif
17569     return node;
17570 }
17571
17572 #endif  /* DEBUGGING */
17573
17574 /*
17575  * Local variables:
17576  * c-indentation-style: bsd
17577  * c-basic-offset: 4
17578  * indent-tabs-mode: nil
17579  * End:
17580  *
17581  * ex: set ts=8 sts=4 sw=4 et:
17582  */