regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102
 103 struct RExC_state_t {
 104     U32         flags;                  /* RXf_* are we folding, multilining? */
 105     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 106     char        *precomp;               /* uncompiled string. */
 107     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 108     regexp      *rx;                    /* perl core regexp structure */
 109     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 110     char        *start;                 /* Start of input for compile */
 111     char        *end;                   /* End of input for compile */
 112     char        *parse;                 /* Input-scan pointer. */
 113     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 114     regnode     *emit_start;            /* Start of emitted-code area */
 115     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 116     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 117                                            implies compiling, so don't emit */
 118     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 119                                            large enough for the largest
 120                                            non-EXACTish node, so can use it as
 121                                            scratch in pass1 */
 122     I32         naughty;                /* How bad is this pattern? */
 123     I32         sawback;                /* Did we see \1, ...? */
 124     U32         seen;
 125     SSize_t     size;                   /* Code size. */
 126     I32                npar;                        /* Capture buffer count, (OPEN) plus one. ("par" 0 is the whole pattern)*/
 127     I32         nestroot;               /* root parens we are in - used by accept */
 128     I32         extralen;
 129     I32         seen_zerolen;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved through */
 145     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 146     I32         in_lookbehind;
 147     I32         contains_locale;
 148     I32         contains_i;
 149     I32         override_recoding;
 150     I32         in_multi_char_class;
 151     struct reg_code_block *code_blocks; /* positions of literal (?{})
 152                                             within pattern */
 153     int         num_code_blocks;        /* size of code_blocks[] */
 154     int         code_index;             /* next code_blocks[] slot */
 155 #if ADD_TO_REGEXEC
 156     char        *starttry;              /* -Dr: where regtry was called. */
 157 #define RExC_starttry   (pRExC_state->starttry)
 158 #endif
 159     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 160 #ifdef DEBUGGING
 161     const char  *lastparse;
 162     I32         lastnum;
 163     AV          *paren_name_list;       /* idx -> name */
 164 #define RExC_lastparse  (pRExC_state->lastparse)
 165 #define RExC_lastnum    (pRExC_state->lastnum)
 166 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 167 #endif
 168 };
 169
 170 #define RExC_flags      (pRExC_state->flags)
 171 #define RExC_pm_flags   (pRExC_state->pm_flags)
 172 #define RExC_precomp    (pRExC_state->precomp)
 173 #define RExC_rx_sv      (pRExC_state->rx_sv)
 174 #define RExC_rx         (pRExC_state->rx)
 175 #define RExC_rxi        (pRExC_state->rxi)
 176 #define RExC_start      (pRExC_state->start)
 177 #define RExC_end        (pRExC_state->end)
 178 #define RExC_parse      (pRExC_state->parse)
 179 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 180 #ifdef RE_TRACK_PATTERN_OFFSETS
 181 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 182 #endif
 183 #define RExC_emit       (pRExC_state->emit)
 184 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 185 #define RExC_emit_start (pRExC_state->emit_start)
 186 #define RExC_emit_bound (pRExC_state->emit_bound)
 187 #define RExC_naughty    (pRExC_state->naughty)
 188 #define RExC_sawback    (pRExC_state->sawback)
 189 #define RExC_seen       (pRExC_state->seen)
 190 #define RExC_size       (pRExC_state->size)
 191 #define RExC_npar       (pRExC_state->npar)
 192 #define RExC_nestroot   (pRExC_state->nestroot)
 193 #define RExC_extralen   (pRExC_state->extralen)
 194 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 195 #define RExC_utf8       (pRExC_state->utf8)
 196 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 197 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 198 #define RExC_open_parens        (pRExC_state->open_parens)
 199 #define RExC_close_parens       (pRExC_state->close_parens)
 200 #define RExC_opend      (pRExC_state->opend)
 201 #define RExC_paren_names        (pRExC_state->paren_names)
 202 #define RExC_recurse    (pRExC_state->recurse)
 203 #define RExC_recurse_count      (pRExC_state->recurse_count)
 204 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 205 #define RExC_study_chunk_recursed_bytes        (pRExC_state->study_chunk_recursed_bytes)
 206 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 207 #define RExC_contains_locale    (pRExC_state->contains_locale)
 208 #define RExC_contains_i (pRExC_state->contains_i)
 209 #define RExC_override_recoding (pRExC_state->override_recoding)
 210 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 211
 212
 213 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 214 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 215         ((*s) == '{' && regcurly(s, FALSE)))
 216
 217 /*
 218  * Flags to be passed up and down.
 219  */
 220 #define WORST           0       /* Worst case. */
 221 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 222
 223 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 224  * character.  (There needs to be a case: in the switch statement in regexec.c
 225  * for any node marked SIMPLE.)  Note that this is not the same thing as
 226  * REGNODE_SIMPLE */
 227 #define SIMPLE          0x02
 228 #define SPSTART         0x04    /* Starts with * or + */
 229 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 230 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 231 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 232
 233 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 234
 235 /* whether trie related optimizations are enabled */
 236 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 237 #define TRIE_STUDY_OPT
 238 #define FULL_TRIE_STUDY
 239 #define TRIE_STCLASS
 240 #endif
 241
 242
 243
 244 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 245 #define PBITVAL(paren) (1 << ((paren) & 7))
 246 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 247 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 248 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 249
 250 #define REQUIRE_UTF8    STMT_START {                                       \
 251                                      if (!UTF) {                           \
 252                                          *flagp = RESTART_UTF8;            \
 253                                          return NULL;                      \
 254                                      }                                     \
 255                         } STMT_END
 256
 257 /* This converts the named class defined in regcomp.h to its equivalent class
 258  * number defined in handy.h. */
 259 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 260 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 261
 262 #define _invlist_union_complement_2nd(a, b, output) \
 263                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 264 #define _invlist_intersection_complement_2nd(a, b, output) \
 265                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 266
 267 /* About scan_data_t.
 268
 269   During optimisation we recurse through the regexp program performing
 270   various inplace (keyhole style) optimisations. In addition study_chunk
 271   and scan_commit populate this data structure with information about
 272   what strings MUST appear in the pattern. We look for the longest
 273   string that must appear at a fixed location, and we look for the
 274   longest string that may appear at a floating location. So for instance
 275   in the pattern:
 276
 277     /FOO[xX]A.*B[xX]BAR/
 278
 279   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 280   strings (because they follow a .* construct). study_chunk will identify
 281   both FOO and BAR as being the longest fixed and floating strings respectively.
 282
 283   The strings can be composites, for instance
 284
 285      /(f)(o)(o)/
 286
 287   will result in a composite fixed substring 'foo'.
 288
 289   For each string some basic information is maintained:
 290
 291   - offset or min_offset
 292     This is the position the string must appear at, or not before.
 293     It also implicitly (when combined with minlenp) tells us how many
 294     characters must match before the string we are searching for.
 295     Likewise when combined with minlenp and the length of the string it
 296     tells us how many characters must appear after the string we have
 297     found.
 298
 299   - max_offset
 300     Only used for floating strings. This is the rightmost point that
 301     the string can appear at. If set to SSize_t_MAX it indicates that the
 302     string can occur infinitely far to the right.
 303
 304   - minlenp
 305     A pointer to the minimum number of characters of the pattern that the
 306     string was found inside. This is important as in the case of positive
 307     lookahead or positive lookbehind we can have multiple patterns
 308     involved. Consider
 309
 310     /(?=FOO).*F/
 311
 312     The minimum length of the pattern overall is 3, the minimum length
 313     of the lookahead part is 3, but the minimum length of the part that
 314     will actually match is 1. So 'FOO's minimum length is 3, but the
 315     minimum length for the F is 1. This is important as the minimum length
 316     is used to determine offsets in front of and behind the string being
 317     looked for.  Since strings can be composites this is the length of the
 318     pattern at the time it was committed with a scan_commit. Note that
 319     the length is calculated by study_chunk, so that the minimum lengths
 320     are not known until the full pattern has been compiled, thus the
 321     pointer to the value.
 322
 323   - lookbehind
 324
 325     In the case of lookbehind the string being searched for can be
 326     offset past the start point of the final matching string.
 327     If this value was just blithely removed from the min_offset it would
 328     invalidate some of the calculations for how many chars must match
 329     before or after (as they are derived from min_offset and minlen and
 330     the length of the string being searched for).
 331     When the final pattern is compiled and the data is moved from the
 332     scan_data_t structure into the regexp structure the information
 333     about lookbehind is factored in, with the information that would
 334     have been lost precalculated in the end_shift field for the
 335     associated string.
 336
 337   The fields pos_min and pos_delta are used to store the minimum offset
 338   and the delta to the maximum offset at the current point in the pattern.
 339
 340 */
 341
 342 typedef struct scan_data_t {
 343     /*I32 len_min;      unused */
 344     /*I32 len_delta;    unused */
 345     SSize_t pos_min;
 346     SSize_t pos_delta;
 347     SV *last_found;
 348     SSize_t last_end;       /* min value, <0 unless valid. */
 349     SSize_t last_start_min;
 350     SSize_t last_start_max;
 351     SV **longest;           /* Either &l_fixed, or &l_float. */
 352     SV *longest_fixed;      /* longest fixed string found in pattern */
 353     SSize_t offset_fixed;   /* offset where it starts */
 354     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 355     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 356     SV *longest_float;      /* longest floating string found in pattern */
 357     SSize_t offset_float_min; /* earliest point in string it can appear */
 358     SSize_t offset_float_max; /* latest point in string it can appear */
 359     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 360     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 361     I32 flags;
 362     I32 whilem_c;
 363     SSize_t *last_closep;
 364     regnode_ssc *start_class;
 365 } scan_data_t;
 366
 367 /* The below is perhaps overboard, but this allows us to save a test at the
 368  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 369  * and 'a' differ by a single bit; the same with the upper and lower case of
 370  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 371  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 372  * then inverts it to form a mask, with just a single 0, in the bit position
 373  * where the upper- and lowercase differ.  XXX There are about 40 other
 374  * instances in the Perl core where this micro-optimization could be used.
 375  * Should decide if maintenance cost is worse, before changing those
 376  *
 377  * Returns a boolean as to whether or not 'v' is either a lowercase or
 378  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 379  * compile-time constant, the generated code is better than some optimizing
 380  * compilers figure out, amounting to a mask and test.  The results are
 381  * meaningless if 'c' is not one of [A-Za-z] */
 382 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 383                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 384
 385 /*
 386  * Forward declarations for pregcomp()'s friends.
 387  */
 388
 389 static const scan_data_t zero_scan_data =
 390   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 391
 392 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 393 #define SF_BEFORE_SEOL          0x0001
 394 #define SF_BEFORE_MEOL          0x0002
 395 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 396 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 397
 398 #define SF_FIX_SHIFT_EOL        (+2)
 399 #define SF_FL_SHIFT_EOL         (+4)
 400
 401 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 402 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 403
 404 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 405 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 406 #define SF_IS_INF               0x0040
 407 #define SF_HAS_PAR              0x0080
 408 #define SF_IN_PAR               0x0100
 409 #define SF_HAS_EVAL             0x0200
 410 #define SCF_DO_SUBSTR           0x0400
 411 #define SCF_DO_STCLASS_AND      0x0800
 412 #define SCF_DO_STCLASS_OR       0x1000
 413 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 414 #define SCF_WHILEM_VISITED_POS  0x2000
 415
 416 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 417 #define SCF_SEEN_ACCEPT         0x8000
 418 #define SCF_TRIE_DOING_RESTUDY 0x10000
 419
 420 #define UTF cBOOL(RExC_utf8)
 421
 422 /* The enums for all these are ordered so things work out correctly */
 423 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 424 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 425 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 426 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 427 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 428 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 429 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 430
 431 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 432
 433 #define OOB_NAMEDCLASS          -1
 434
 435 /* There is no code point that is out-of-bounds, so this is problematic.  But
 436  * its only current use is to initialize a variable that is always set before
 437  * looked at. */
 438 #define OOB_UNICODE             0xDEADBEEF
 439
 440 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 441 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 442
 443
 444 /* length of regex to show in messages that don't mark a position within */
 445 #define RegexLengthToShowInErrorMessages 127
 446
 447 /*
 448  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 449  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 450  * op/pragma/warn/regcomp.
 451  */
 452 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 453 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 454
 455 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 456
 457 #define REPORT_LOCATION_ARGS(offset)            \
 458                 UTF8fARG(UTF, offset, RExC_precomp), \
 459                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 460
 461 /*
 462  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 463  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 464  * "...".
 465  */
 466 #define _FAIL(code) STMT_START {                                        \
 467     const char *ellipses = "";                                          \
 468     IV len = RExC_end - RExC_precomp;                                   \
 469                                                                         \
 470     if (!SIZE_ONLY)                                                     \
 471         SAVEFREESV(RExC_rx_sv);                                         \
 472     if (len > RegexLengthToShowInErrorMessages) {                       \
 473         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 474         len = RegexLengthToShowInErrorMessages - 10;                    \
 475         ellipses = "...";                                               \
 476     }                                                                   \
 477     code;                                                               \
 478 } STMT_END
 479
 480 #define FAIL(msg) _FAIL(                            \
 481     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 482             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 483
 484 #define FAIL2(msg,arg) _FAIL(                       \
 485     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 486             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 487
 488 /*
 489  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 490  */
 491 #define Simple_vFAIL(m) STMT_START {                                    \
 492     const IV offset = RExC_parse - RExC_precomp;                        \
 493     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 494             m, REPORT_LOCATION_ARGS(offset));   \
 495 } STMT_END
 496
 497 /*
 498  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 499  */
 500 #define vFAIL(m) STMT_START {                           \
 501     if (!SIZE_ONLY)                                     \
 502         SAVEFREESV(RExC_rx_sv);                         \
 503     Simple_vFAIL(m);                                    \
 504 } STMT_END
 505
 506 /*
 507  * Like Simple_vFAIL(), but accepts two arguments.
 508  */
 509 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 510     const IV offset = RExC_parse - RExC_precomp;                        \
 511     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 512                       REPORT_LOCATION_ARGS(offset));    \
 513 } STMT_END
 514
 515 /*
 516  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 517  */
 518 #define vFAIL2(m,a1) STMT_START {                       \
 519     if (!SIZE_ONLY)                                     \
 520         SAVEFREESV(RExC_rx_sv);                         \
 521     Simple_vFAIL2(m, a1);                               \
 522 } STMT_END
 523
 524
 525 /*
 526  * Like Simple_vFAIL(), but accepts three arguments.
 527  */
 528 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 529     const IV offset = RExC_parse - RExC_precomp;                \
 530     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 531             REPORT_LOCATION_ARGS(offset));      \
 532 } STMT_END
 533
 534 /*
 535  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 536  */
 537 #define vFAIL3(m,a1,a2) STMT_START {                    \
 538     if (!SIZE_ONLY)                                     \
 539         SAVEFREESV(RExC_rx_sv);                         \
 540     Simple_vFAIL3(m, a1, a2);                           \
 541 } STMT_END
 542
 543 /*
 544  * Like Simple_vFAIL(), but accepts four arguments.
 545  */
 546 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 547     const IV offset = RExC_parse - RExC_precomp;                \
 548     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 549             REPORT_LOCATION_ARGS(offset));      \
 550 } STMT_END
 551
 552 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 553     if (!SIZE_ONLY)                                     \
 554         SAVEFREESV(RExC_rx_sv);                         \
 555     Simple_vFAIL4(m, a1, a2, a3);                       \
 556 } STMT_END
 557
 558 /* A specialized version of vFAIL2 that works with UTF8f */
 559 #define vFAIL2utf8f(m, a1) STMT_START { \
 560     const IV offset = RExC_parse - RExC_precomp;   \
 561     if (!SIZE_ONLY)                                \
 562         SAVEFREESV(RExC_rx_sv);                    \
 563     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 564             REPORT_LOCATION_ARGS(offset));         \
 565 } STMT_END
 566
 567
 568 /* m is not necessarily a "literal string", in this macro */
 569 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 570     const IV offset = loc - RExC_precomp;                               \
 571     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 572             m, REPORT_LOCATION_ARGS(offset));       \
 573 } STMT_END
 574
 575 #define ckWARNreg(loc,m) STMT_START {                                   \
 576     const IV offset = loc - RExC_precomp;                               \
 577     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 578             REPORT_LOCATION_ARGS(offset));              \
 579 } STMT_END
 580
 581 #define vWARN_dep(loc, m) STMT_START {                                  \
 582     const IV offset = loc - RExC_precomp;                               \
 583     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 584             REPORT_LOCATION_ARGS(offset));              \
 585 } STMT_END
 586
 587 #define ckWARNdep(loc,m) STMT_START {                                   \
 588     const IV offset = loc - RExC_precomp;                               \
 589     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 590             m REPORT_LOCATION,                                          \
 591             REPORT_LOCATION_ARGS(offset));              \
 592 } STMT_END
 593
 594 #define ckWARNregdep(loc,m) STMT_START {                                \
 595     const IV offset = loc - RExC_precomp;                               \
 596     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 597             m REPORT_LOCATION,                                          \
 598             REPORT_LOCATION_ARGS(offset));              \
 599 } STMT_END
 600
 601 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 602     const IV offset = loc - RExC_precomp;                               \
 603     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 604             m REPORT_LOCATION,                                          \
 605             a1, REPORT_LOCATION_ARGS(offset));  \
 606 } STMT_END
 607
 608 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 609     const IV offset = loc - RExC_precomp;                               \
 610     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 611             a1, REPORT_LOCATION_ARGS(offset));  \
 612 } STMT_END
 613
 614 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 615     const IV offset = loc - RExC_precomp;                               \
 616     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 617             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 618 } STMT_END
 619
 620 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 621     const IV offset = loc - RExC_precomp;                               \
 622     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 623             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 624 } STMT_END
 625
 626 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 627     const IV offset = loc - RExC_precomp;                               \
 628     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 629             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 630 } STMT_END
 631
 632 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 633     const IV offset = loc - RExC_precomp;                               \
 634     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 635             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 636 } STMT_END
 637
 638 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 639     const IV offset = loc - RExC_precomp;                               \
 640     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 641             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 642 } STMT_END
 643
 644
 645 /* Allow for side effects in s */
 646 #define REGC(c,s) STMT_START {                  \
 647     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 648 } STMT_END
 649
 650 /* Macros for recording node offsets.   20001227 mjd@plover.com
 651  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 652  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 653  * Element 0 holds the number n.
 654  * Position is 1 indexed.
 655  */
 656 #ifndef RE_TRACK_PATTERN_OFFSETS
 657 #define Set_Node_Offset_To_R(node,byte)
 658 #define Set_Node_Offset(node,byte)
 659 #define Set_Cur_Node_Offset
 660 #define Set_Node_Length_To_R(node,len)
 661 #define Set_Node_Length(node,len)
 662 #define Set_Node_Cur_Length(node,start)
 663 #define Node_Offset(n)
 664 #define Node_Length(n)
 665 #define Set_Node_Offset_Length(node,offset,len)
 666 #define ProgLen(ri) ri->u.proglen
 667 #define SetProgLen(ri,x) ri->u.proglen = x
 668 #else
 669 #define ProgLen(ri) ri->u.offsets[0]
 670 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 671 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 672     if (! SIZE_ONLY) {                                                  \
 673         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 674                     __LINE__, (int)(node), (int)(byte)));               \
 675         if((node) < 0) {                                                \
 676             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 677         } else {                                                        \
 678             RExC_offsets[2*(node)-1] = (byte);                          \
 679         }                                                               \
 680     }                                                                   \
 681 } STMT_END
 682
 683 #define Set_Node_Offset(node,byte) \
 684     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 685 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 686
 687 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 688     if (! SIZE_ONLY) {                                                  \
 689         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 690                 __LINE__, (int)(node), (int)(len)));                    \
 691         if((node) < 0) {                                                \
 692             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 693         } else {                                                        \
 694             RExC_offsets[2*(node)] = (len);                             \
 695         }                                                               \
 696     }                                                                   \
 697 } STMT_END
 698
 699 #define Set_Node_Length(node,len) \
 700     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 701 #define Set_Node_Cur_Length(node, start)                \
 702     Set_Node_Length(node, RExC_parse - start)
 703
 704 /* Get offsets and lengths */
 705 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 706 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 707
 708 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 709     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 710     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 711 } STMT_END
 712 #endif
 713
 714 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 715 #define EXPERIMENTAL_INPLACESCAN
 716 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 717
 718 #define DEBUG_RExC_seen() \
 719         DEBUG_OPTIMISE_MORE_r({                                                     \
 720             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                            \
 721                                                                                     \
 722             if (RExC_seen & REG_SEEN_ZERO_LEN)                                      \
 723                 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN ");                 \
 724                                                                                     \
 725             if (RExC_seen & REG_SEEN_LOOKBEHIND)                                    \
 726                 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND ");               \
 727                                                                                     \
 728             if (RExC_seen & REG_SEEN_GPOS)                                          \
 729                 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS ");                     \
 730                                                                                     \
 731             if (RExC_seen & REG_SEEN_CANY)                                            \
 732                 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY ");                     \
 733                                                                                     \
 734             if (RExC_seen & REG_SEEN_RECURSE)                                       \
 735                 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE ");                  \
 736                                                                                     \
 737             if (RExC_seen & REG_TOP_LEVEL_BRANCHES)                                 \
 738                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES ");            \
 739                                                                                     \
 740             if (RExC_seen & REG_SEEN_VERBARG)                                       \
 741                 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG ");                  \
 742                                                                                     \
 743             if (RExC_seen & REG_SEEN_CUTGROUP)                                      \
 744                 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP ");                 \
 745                                                                                     \
 746             if (RExC_seen & REG_SEEN_RUN_ON_COMMENT)                                \
 747                 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT ");           \
 748                                                                                     \
 749             if (RExC_seen & REG_SEEN_EXACTF_SHARP_S)                                \
 750                 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S ");           \
 751                                                                                     \
 752             if (RExC_seen & REG_SEEN_GOSTART)                                       \
 753                 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART ");                  \
 754                                                                                     \
 755             PerlIO_printf(Perl_debug_log,"\n");                                     \
 756         });
 757
 758 #define DEBUG_STUDYDATA(str,data,depth)                              \
 759 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 760     PerlIO_printf(Perl_debug_log,                                    \
 761         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 762         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 763         (int)(depth)*2, "",                                          \
 764         (IV)((data)->pos_min),                                       \
 765         (IV)((data)->pos_delta),                                     \
 766         (UV)((data)->flags),                                         \
 767         (IV)((data)->whilem_c),                                      \
 768         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 769         is_inf ? "INF " : ""                                         \
 770     );                                                               \
 771     if ((data)->last_found)                                          \
 772         PerlIO_printf(Perl_debug_log,                                \
 773             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 774             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 775             SvPVX_const((data)->last_found),                         \
 776             (IV)((data)->last_end),                                  \
 777             (IV)((data)->last_start_min),                            \
 778             (IV)((data)->last_start_max),                            \
 779             ((data)->longest &&                                      \
 780              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 781             SvPVX_const((data)->longest_fixed),                      \
 782             (IV)((data)->offset_fixed),                              \
 783             ((data)->longest &&                                      \
 784              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 785             SvPVX_const((data)->longest_float),                      \
 786             (IV)((data)->offset_float_min),                          \
 787             (IV)((data)->offset_float_max)                           \
 788         );                                                           \
 789     PerlIO_printf(Perl_debug_log,"\n");                              \
 790 });
 791
 792 /* Mark that we cannot extend a found fixed substring at this point.
 793    Update the longest found anchored substring and the longest found
 794    floating substrings if needed. */
 795
 796 STATIC void
 797 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 798                     SSize_t *minlenp, int is_inf)
 799 {
 800     const STRLEN l = CHR_SVLEN(data->last_found);
 801     const STRLEN old_l = CHR_SVLEN(*data->longest);
 802     GET_RE_DEBUG_FLAGS_DECL;
 803
 804     PERL_ARGS_ASSERT_SCAN_COMMIT;
 805
 806     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 807         SvSetMagicSV(*data->longest, data->last_found);
 808         if (*data->longest == data->longest_fixed) {
 809             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 810             if (data->flags & SF_BEFORE_EOL)
 811                 data->flags
 812                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 813             else
 814                 data->flags &= ~SF_FIX_BEFORE_EOL;
 815             data->minlen_fixed=minlenp;
 816             data->lookbehind_fixed=0;
 817         }
 818         else { /* *data->longest == data->longest_float */
 819             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 820             data->offset_float_max = (l
 821                                       ? data->last_start_max
 822                                       : (data->pos_delta == SSize_t_MAX
 823                                          ? SSize_t_MAX
 824                                          : data->pos_min + data->pos_delta));
 825             if (is_inf
 826                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 827                 data->offset_float_max = SSize_t_MAX;
 828             if (data->flags & SF_BEFORE_EOL)
 829                 data->flags
 830                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 831             else
 832                 data->flags &= ~SF_FL_BEFORE_EOL;
 833             data->minlen_float=minlenp;
 834             data->lookbehind_float=0;
 835         }
 836     }
 837     SvCUR_set(data->last_found, 0);
 838     {
 839         SV * const sv = data->last_found;
 840         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 841             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 842             if (mg)
 843                 mg->mg_len = 0;
 844         }
 845     }
 846     data->last_end = -1;
 847     data->flags &= ~SF_BEFORE_EOL;
 848     DEBUG_STUDYDATA("commit: ",data,0);
 849 }
 850
 851 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 852  * list that describes which code points it matches */
 853
 854 STATIC void
 855 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 856 {
 857     /* Set the SSC 'ssc' to match an empty string or any code point */
 858
 859     PERL_ARGS_ASSERT_SSC_ANYTHING;
 860
 861     assert(OP(ssc) == ANYOF_SYNTHETIC);
 862
 863     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 864     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 865     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 866 }
 867
 868 STATIC int
 869 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 870 {
 871     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 872      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 873      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 874      * in any way, so there's no point in using it */
 875
 876     UV start, end;
 877     bool ret;
 878
 879     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 880
 881     assert(OP(ssc) == ANYOF_SYNTHETIC);
 882
 883     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 884         return FALSE;
 885     }
 886
 887     /* See if the list consists solely of the range 0 - Infinity */
 888     invlist_iterinit(ssc->invlist);
 889     ret = invlist_iternext(ssc->invlist, &start, &end)
 890           && start == 0
 891           && end == UV_MAX;
 892
 893     invlist_iterfinish(ssc->invlist);
 894
 895     if (ret) {
 896         return TRUE;
 897     }
 898
 899     /* If e.g., both \w and \W are set, matches everything */
 900     if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
 901         int i;
 902         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 903             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 904                 return TRUE;
 905             }
 906         }
 907     }
 908
 909     return FALSE;
 910 }
 911
 912 STATIC void
 913 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 914 {
 915     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 916      * string, any code point, or any posix class under locale */
 917
 918     PERL_ARGS_ASSERT_SSC_INIT;
 919
 920     Zero(ssc, 1, regnode_ssc);
 921     OP(ssc) = ANYOF_SYNTHETIC;
 922     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 923     ssc_anything(ssc);
 924
 925     /* If any portion of the regex is to operate under locale rules,
 926      * initialization includes it.  The reason this isn't done for all regexes
 927      * is that the optimizer was written under the assumption that locale was
 928      * all-or-nothing.  Given the complexity and lack of documentation in the
 929      * optimizer, and that there are inadequate test cases for locale, many
 930      * parts of it may not work properly, it is safest to avoid locale unless
 931      * necessary. */
 932     if (RExC_contains_locale) {
 933         ANYOF_POSIXL_SETALL(ssc);
 934         ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
 935         if (RExC_contains_i) {
 936             ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
 937         }
 938     }
 939     else {
 940         ANYOF_POSIXL_ZERO(ssc);
 941     }
 942 }
 943
 944 STATIC int
 945 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 946                               const regnode_ssc *ssc)
 947 {
 948     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 949      * to the list of code points matched, and locale posix classes; hence does
 950      * not check its flags) */
 951
 952     UV start, end;
 953     bool ret;
 954
 955     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 956
 957     assert(OP(ssc) == ANYOF_SYNTHETIC);
 958
 959     invlist_iterinit(ssc->invlist);
 960     ret = invlist_iternext(ssc->invlist, &start, &end)
 961           && start == 0
 962           && end == UV_MAX;
 963
 964     invlist_iterfinish(ssc->invlist);
 965
 966     if (! ret) {
 967         return FALSE;
 968     }
 969
 970     if (RExC_contains_locale) {
 971         if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
 972             || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
 973             || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
 974         {
 975             return FALSE;
 976         }
 977         if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
 978             return FALSE;
 979         }
 980     }
 981
 982     return TRUE;
 983 }
 984
 985 STATIC SV*
 986 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
 987                                   const regnode_charclass_posixl* const node)
 988 {
 989     /* Returns a mortal inversion list defining which code points are matched
 990      * by 'node', which is of type ANYOF.  Handles complementing the result if
 991      * appropriate.  If some code points aren't knowable at this time, the
 992      * returned list must, and will, contain every possible code point. */
 993
 994     SV* invlist = sv_2mortal(_new_invlist(0));
 995     unsigned int i;
 996     const U32 n = ARG(node);
 997
 998     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
 999
1000     /* Look at the data structure created by S_set_ANYOF_arg() */
1001     if (n != ANYOF_NONBITMAP_EMPTY) {
1002         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1003         AV * const av = MUTABLE_AV(SvRV(rv));
1004         SV **const ary = AvARRAY(av);
1005         assert(RExC_rxi->data->what[n] == 's');
1006
1007         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1008             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1009         }
1010         else if (ary[0] && ary[0] != &PL_sv_undef) {
1011
1012             /* Here, no compile-time swash, and there are things that won't be
1013              * known until runtime -- we have to assume it could be anything */
1014             return _add_range_to_invlist(invlist, 0, UV_MAX);
1015         }
1016         else {
1017
1018             /* Here no compile-time swash, and no run-time only data.  Use the
1019              * node's inversion list */
1020             invlist = sv_2mortal(invlist_clone(ary[2]));
1021         }
1022     }
1023
1024     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1025      * inversion list for the others, but if there are code points that should
1026      * match only conditionally on the target string being UTF-8, those are
1027      * placed in the inversion list, and not the bitmap.  Since there are
1028      * circumstances under which they could match, they are included in the
1029      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1030      * here, so that when we invert below, the end result actually does include
1031      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1032      * before we add the unconditionally matched code points */
1033     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1034         _invlist_intersection_complement_2nd(invlist,
1035                                              PL_UpperLatin1,
1036                                              &invlist);
1037     }
1038
1039     /* Add in the points from the bit map */
1040     for (i = 0; i < 256; i++) {
1041         if (ANYOF_BITMAP_TEST(node, i)) {
1042             invlist = add_cp_to_invlist(invlist, i);
1043         }
1044     }
1045
1046     /* If this can match all upper Latin1 code points, have to add them
1047      * as well */
1048     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1049         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1050     }
1051
1052     /* Similarly for these */
1053     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1054         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1055     }
1056
1057     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1058         _invlist_invert(invlist);
1059     }
1060
1061     return invlist;
1062 }
1063
1064 /* These two functions currently do the exact same thing */
1065 #define ssc_init_zero           ssc_init
1066
1067 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1068 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1069
1070 STATIC void
1071 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1072 {
1073     /* Take the flags 'and_with' and accumulate them anded into the flags for
1074      * the SSC 'ssc'.  The non-SSC related flags in 'and_with' are ignored.
1075      * The flags 'and_with' should not come from another SSC (otherwise the
1076      * EMPTY_STRING flag won't work) */
1077
1078     const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
1079
1080     PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1081
1082     /* Use just the SSC-related flags from 'and_with' */
1083     ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
1084     ANYOF_FLAGS(ssc) |= ssc_only_flags;
1085 }
1086
1087 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1088  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1089  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1090
1091 STATIC void
1092 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1093                 const regnode_ssc *and_with)
1094 {
1095     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1096      * another SSC or a regular ANYOF class.  Can create false positives. */
1097
1098     SV* anded_cp_list;
1099     U8  anded_flags;
1100
1101     PERL_ARGS_ASSERT_SSC_AND;
1102
1103     assert(OP(ssc) == ANYOF_SYNTHETIC);
1104
1105     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1106      * the code point inversion list and just the relevant flags */
1107     if (OP(and_with) == ANYOF_SYNTHETIC) {
1108         anded_cp_list = and_with->invlist;
1109         anded_flags = ANYOF_FLAGS(and_with);
1110     }
1111     else {
1112         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1113                                         (regnode_charclass_posixl*) and_with);
1114         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
1115     }
1116
1117     ANYOF_FLAGS(ssc) &= anded_flags;
1118
1119     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1120      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1121      * 'and_with' may be inverted.  When not inverted, we have the situation of
1122      * computing:
1123      *  (C1 | P1) & (C2 | P2)
1124      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1125      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1126      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1127      *                    <=  ((C1 & C2) | P1 | P2)
1128      * Alternatively, the last few steps could be:
1129      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1130      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1131      *                    <=  (C1 | C2 | (P1 & P2))
1132      * We favor the second approach if either P1 or P2 is non-empty.  This is
1133      * because these components are a barrier to doing optimizations, as what
1134      * they match cannot be known until the moment of matching as they are
1135      * dependent on the current locale, 'AND"ing them likely will reduce or
1136      * eliminate them.
1137      * But we can do better if we know that C1,P1 are in their initial state (a
1138      * frequent occurrence), each matching everything:
1139      *  (<everything>) & (C2 | P2) =  C2 | P2
1140      * Similarly, if C2,P2 are in their initial state (again a frequent
1141      * occurrence), the result is a no-op
1142      *  (C1 | P1) & (<everything>) =  C1 | P1
1143      *
1144      * Inverted, we have
1145      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1146      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1147      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1148      * */
1149
1150     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1151         && OP(and_with) != ANYOF_SYNTHETIC)
1152     {
1153         unsigned int i;
1154
1155         ssc_intersection(ssc,
1156                          anded_cp_list,
1157                          FALSE /* Has already been inverted */
1158                          );
1159
1160         /* If either P1 or P2 is empty, the intersection will be also; can skip
1161          * the loop */
1162         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1163             ANYOF_POSIXL_ZERO(ssc);
1164         }
1165         else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1166
1167             /* Note that the Posix class component P from 'and_with' actually
1168              * looks like:
1169              *      P = Pa | Pb | ... | Pn
1170              * where each component is one posix class, such as in [\w\s].
1171              * Thus
1172              *      ~P = ~(Pa | Pb | ... | Pn)
1173              *         = ~Pa & ~Pb & ... & ~Pn
1174              *        <= ~Pa | ~Pb | ... | ~Pn
1175              * The last is something we can easily calculate, but unfortunately
1176              * is likely to have many false positives.  We could do better
1177              * in some (but certainly not all) instances if two classes in
1178              * P have known relationships.  For example
1179              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1180              * So
1181              *      :lower: & :print: = :lower:
1182              * And similarly for classes that must be disjoint.  For example,
1183              * since \s and \w can have no elements in common based on rules in
1184              * the POSIX standard,
1185              *      \w & ^\S = nothing
1186              * Unfortunately, some vendor locales do not meet the Posix
1187              * standard, in particular almost everything by Microsoft.
1188              * The loop below just changes e.g., \w into \W and vice versa */
1189
1190             regnode_charclass_posixl temp;
1191             int add = 1;    /* To calculate the index of the complement */
1192
1193             ANYOF_POSIXL_ZERO(&temp);
1194             for (i = 0; i < ANYOF_MAX; i++) {
1195                 assert(i % 2 != 0
1196                        || ! ANYOF_POSIXL_TEST(and_with, i)
1197                        || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1198
1199                 if (ANYOF_POSIXL_TEST(and_with, i)) {
1200                     ANYOF_POSIXL_SET(&temp, i + add);
1201                 }
1202                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1203             }
1204             ANYOF_POSIXL_AND(&temp, ssc);
1205
1206         } /* else ssc already has no posixes */
1207     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1208          in its initial state */
1209     else if (OP(and_with) != ANYOF_SYNTHETIC
1210              || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1211     {
1212         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1213          * copy it over 'ssc' */
1214         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1215             if (OP(and_with) == ANYOF_SYNTHETIC) {
1216                 StructCopy(and_with, ssc, regnode_ssc);
1217             }
1218             else {
1219                 ssc->invlist = anded_cp_list;
1220                 ANYOF_POSIXL_ZERO(ssc);
1221                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1222                     ANYOF_POSIXL_OR(and_with, ssc);
1223                 }
1224             }
1225         }
1226         else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1227                     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1228         {
1229             /* One or the other of P1, P2 is non-empty. */
1230             ANYOF_POSIXL_AND(and_with, ssc);
1231             ssc_union(ssc, anded_cp_list, FALSE);
1232         }
1233         else { /* P1 = P2 = empty */
1234             ssc_intersection(ssc, anded_cp_list, FALSE);
1235         }
1236     }
1237 }
1238
1239 STATIC void
1240 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1241                const regnode_ssc *or_with)
1242 {
1243     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1244      * another SSC or a regular ANYOF class.  Can create false positives if
1245      * 'or_with' is to be inverted. */
1246
1247     SV* ored_cp_list;
1248     U8 ored_flags;
1249
1250     PERL_ARGS_ASSERT_SSC_OR;
1251
1252     assert(OP(ssc) == ANYOF_SYNTHETIC);
1253
1254     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1255      * the code point inversion list and just the relevant flags */
1256     if (OP(or_with) == ANYOF_SYNTHETIC) {
1257         ored_cp_list = or_with->invlist;
1258         ored_flags = ANYOF_FLAGS(or_with);
1259     }
1260     else {
1261         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1262                                         (regnode_charclass_posixl*) or_with);
1263         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
1264     }
1265
1266     ANYOF_FLAGS(ssc) |= ored_flags;
1267
1268     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1269      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1270      * 'or_with' may be inverted.  When not inverted, we have the simple
1271      * situation of computing:
1272      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1273      * If P1|P2 yields a situation with both a class and its complement are
1274      * set, like having both \w and \W, this matches all code points, and we
1275      * can delete these from the P component of the ssc going forward.  XXX We
1276      * might be able to delete all the P components, but I (khw) am not certain
1277      * about this, and it is better to be safe.
1278      *
1279      * Inverted, we have
1280      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1281      *                         <=  (C1 | P1) | ~C2
1282      *                         <=  (C1 | ~C2) | P1
1283      * (which results in actually simpler code than the non-inverted case)
1284      * */
1285
1286     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1287         && OP(or_with) != ANYOF_SYNTHETIC)
1288     {
1289         /* We ignore P2, leaving P1 going forward */
1290     }
1291     else {  /* Not inverted */
1292         ANYOF_POSIXL_OR(or_with, ssc);
1293         if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1294             unsigned int i;
1295             for (i = 0; i < ANYOF_MAX; i += 2) {
1296                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1297                 {
1298                     ssc_match_all_cp(ssc);
1299                     ANYOF_POSIXL_CLEAR(ssc, i);
1300                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1301                     if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1302                         ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1303                     }
1304                 }
1305             }
1306         }
1307     }
1308
1309     ssc_union(ssc,
1310               ored_cp_list,
1311               FALSE /* Already has been inverted */
1312               );
1313 }
1314
1315 PERL_STATIC_INLINE void
1316 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1317 {
1318     PERL_ARGS_ASSERT_SSC_UNION;
1319
1320     assert(OP(ssc) == ANYOF_SYNTHETIC);
1321
1322     _invlist_union_maybe_complement_2nd(ssc->invlist,
1323                                         invlist,
1324                                         invert2nd,
1325                                         &ssc->invlist);
1326 }
1327
1328 PERL_STATIC_INLINE void
1329 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1330                          SV* const invlist,
1331                          const bool invert2nd)
1332 {
1333     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1334
1335     assert(OP(ssc) == ANYOF_SYNTHETIC);
1336
1337     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1338                                                invlist,
1339                                                invert2nd,
1340                                                &ssc->invlist);
1341 }
1342
1343 PERL_STATIC_INLINE void
1344 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1345 {
1346     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1347
1348     assert(OP(ssc) == ANYOF_SYNTHETIC);
1349
1350     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1351 }
1352
1353 PERL_STATIC_INLINE void
1354 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1355 {
1356     /* AND just the single code point 'cp' into the SSC 'ssc' */
1357
1358     SV* cp_list = _new_invlist(2);
1359
1360     PERL_ARGS_ASSERT_SSC_CP_AND;
1361
1362     assert(OP(ssc) == ANYOF_SYNTHETIC);
1363
1364     cp_list = add_cp_to_invlist(cp_list, cp);
1365     ssc_intersection(ssc, cp_list,
1366                      FALSE /* Not inverted */
1367                      );
1368     SvREFCNT_dec_NN(cp_list);
1369 }
1370
1371 PERL_STATIC_INLINE void
1372 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1373 {
1374     /* Set the SSC 'ssc' to not match any locale things */
1375
1376     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1377
1378     assert(OP(ssc) == ANYOF_SYNTHETIC);
1379
1380     ANYOF_POSIXL_ZERO(ssc);
1381     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1382 }
1383
1384 STATIC void
1385 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1386 {
1387     /* The inversion list in the SSC is marked mortal; now we need a more
1388      * permanent copy, which is stored the same way that is done in a regular
1389      * ANYOF node, with the first 256 code points in a bit map */
1390
1391     SV* invlist = invlist_clone(ssc->invlist);
1392
1393     PERL_ARGS_ASSERT_SSC_FINALIZE;
1394
1395     assert(OP(ssc) == ANYOF_SYNTHETIC);
1396
1397     /* The code in this file assumes that all but these flags aren't relevant
1398      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1399      * time we reach here */
1400     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
1401
1402     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1403
1404     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1405
1406     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1407 }
1408
1409 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1410 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1411 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1412 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1413
1414
1415 #ifdef DEBUGGING
1416 /*
1417    dump_trie(trie,widecharmap,revcharmap)
1418    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1419    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1420
1421    These routines dump out a trie in a somewhat readable format.
1422    The _interim_ variants are used for debugging the interim
1423    tables that are used to generate the final compressed
1424    representation which is what dump_trie expects.
1425
1426    Part of the reason for their existence is to provide a form
1427    of documentation as to how the different representations function.
1428
1429 */
1430
1431 /*
1432   Dumps the final compressed table form of the trie to Perl_debug_log.
1433   Used for debugging make_trie().
1434 */
1435
1436 STATIC void
1437 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1438             AV *revcharmap, U32 depth)
1439 {
1440     U32 state;
1441     SV *sv=sv_newmortal();
1442     int colwidth= widecharmap ? 6 : 4;
1443     U16 word;
1444     GET_RE_DEBUG_FLAGS_DECL;
1445
1446     PERL_ARGS_ASSERT_DUMP_TRIE;
1447
1448     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1449         (int)depth * 2 + 2,"",
1450         "Match","Base","Ofs" );
1451
1452     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1453         SV ** const tmp = av_fetch( revcharmap, state, 0);
1454         if ( tmp ) {
1455             PerlIO_printf( Perl_debug_log, "%*s",
1456                 colwidth,
1457                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1458                             PL_colors[0], PL_colors[1],
1459                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1460                             PERL_PV_ESCAPE_FIRSTCHAR
1461                 )
1462             );
1463         }
1464     }
1465     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1466         (int)depth * 2 + 2,"");
1467
1468     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1469         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1470     PerlIO_printf( Perl_debug_log, "\n");
1471
1472     for( state = 1 ; state < trie->statecount ; state++ ) {
1473         const U32 base = trie->states[ state ].trans.base;
1474
1475         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1476
1477         if ( trie->states[ state ].wordnum ) {
1478             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1479         } else {
1480             PerlIO_printf( Perl_debug_log, "%6s", "" );
1481         }
1482
1483         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1484
1485         if ( base ) {
1486             U32 ofs = 0;
1487
1488             while( ( base + ofs  < trie->uniquecharcount ) ||
1489                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1490                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1491                     ofs++;
1492
1493             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1494
1495             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1496                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1497                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1498                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1499                 {
1500                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1501                     colwidth,
1502                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1503                 } else {
1504                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1505                 }
1506             }
1507
1508             PerlIO_printf( Perl_debug_log, "]");
1509
1510         }
1511         PerlIO_printf( Perl_debug_log, "\n" );
1512     }
1513     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1514     for (word=1; word <= trie->wordcount; word++) {
1515         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1516             (int)word, (int)(trie->wordinfo[word].prev),
1517             (int)(trie->wordinfo[word].len));
1518     }
1519     PerlIO_printf(Perl_debug_log, "\n" );
1520 }
1521 /*
1522   Dumps a fully constructed but uncompressed trie in list form.
1523   List tries normally only are used for construction when the number of
1524   possible chars (trie->uniquecharcount) is very high.
1525   Used for debugging make_trie().
1526 */
1527 STATIC void
1528 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1529                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1530                          U32 depth)
1531 {
1532     U32 state;
1533     SV *sv=sv_newmortal();
1534     int colwidth= widecharmap ? 6 : 4;
1535     GET_RE_DEBUG_FLAGS_DECL;
1536
1537     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1538
1539     /* print out the table precompression.  */
1540     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1541         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1542         "------:-----+-----------------\n" );
1543
1544     for( state=1 ; state < next_alloc ; state ++ ) {
1545         U16 charid;
1546
1547         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1548             (int)depth * 2 + 2,"", (UV)state  );
1549         if ( ! trie->states[ state ].wordnum ) {
1550             PerlIO_printf( Perl_debug_log, "%5s| ","");
1551         } else {
1552             PerlIO_printf( Perl_debug_log, "W%4x| ",
1553                 trie->states[ state ].wordnum
1554             );
1555         }
1556         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1557             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1558             if ( tmp ) {
1559                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1560                     colwidth,
1561                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1562                             PL_colors[0], PL_colors[1],
1563                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1564                             PERL_PV_ESCAPE_FIRSTCHAR
1565                     ) ,
1566                     TRIE_LIST_ITEM(state,charid).forid,
1567                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1568                 );
1569                 if (!(charid % 10))
1570                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1571                         (int)((depth * 2) + 14), "");
1572             }
1573         }
1574         PerlIO_printf( Perl_debug_log, "\n");
1575     }
1576 }
1577
1578 /*
1579   Dumps a fully constructed but uncompressed trie in table form.
1580   This is the normal DFA style state transition table, with a few
1581   twists to facilitate compression later.
1582   Used for debugging make_trie().
1583 */
1584 STATIC void
1585 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1586                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1587                           U32 depth)
1588 {
1589     U32 state;
1590     U16 charid;
1591     SV *sv=sv_newmortal();
1592     int colwidth= widecharmap ? 6 : 4;
1593     GET_RE_DEBUG_FLAGS_DECL;
1594
1595     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1596
1597     /*
1598        print out the table precompression so that we can do a visual check
1599        that they are identical.
1600      */
1601
1602     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1603
1604     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1605         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1606         if ( tmp ) {
1607             PerlIO_printf( Perl_debug_log, "%*s",
1608                 colwidth,
1609                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1610                             PL_colors[0], PL_colors[1],
1611                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1612                             PERL_PV_ESCAPE_FIRSTCHAR
1613                 )
1614             );
1615         }
1616     }
1617
1618     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1619
1620     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1621         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1622     }
1623
1624     PerlIO_printf( Perl_debug_log, "\n" );
1625
1626     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1627
1628         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1629             (int)depth * 2 + 2,"",
1630             (UV)TRIE_NODENUM( state ) );
1631
1632         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1633             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1634             if (v)
1635                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1636             else
1637                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1638         }
1639         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1640             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1641         } else {
1642             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1643             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1644         }
1645     }
1646 }
1647
1648 #endif
1649
1650
1651 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1652   startbranch: the first branch in the whole branch sequence
1653   first      : start branch of sequence of branch-exact nodes.
1654                May be the same as startbranch
1655   last       : Thing following the last branch.
1656                May be the same as tail.
1657   tail       : item following the branch sequence
1658   count      : words in the sequence
1659   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1660   depth      : indent depth
1661
1662 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1663
1664 A trie is an N'ary tree where the branches are determined by digital
1665 decomposition of the key. IE, at the root node you look up the 1st character and
1666 follow that branch repeat until you find the end of the branches. Nodes can be
1667 marked as "accepting" meaning they represent a complete word. Eg:
1668
1669   /he|she|his|hers/
1670
1671 would convert into the following structure. Numbers represent states, letters
1672 following numbers represent valid transitions on the letter from that state, if
1673 the number is in square brackets it represents an accepting state, otherwise it
1674 will be in parenthesis.
1675
1676       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1677       |    |
1678       |   (2)
1679       |    |
1680      (1)   +-i->(6)-+-s->[7]
1681       |
1682       +-s->(3)-+-h->(4)-+-e->[5]
1683
1684       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1685
1686 This shows that when matching against the string 'hers' we will begin at state 1
1687 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1688 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1689 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1690 single traverse. We store a mapping from accepting to state to which word was
1691 matched, and then when we have multiple possibilities we try to complete the
1692 rest of the regex in the order in which they occured in the alternation.
1693
1694 The only prior NFA like behaviour that would be changed by the TRIE support is
1695 the silent ignoring of duplicate alternations which are of the form:
1696
1697  / (DUPE|DUPE) X? (?{ ... }) Y /x
1698
1699 Thus EVAL blocks following a trie may be called a different number of times with
1700 and without the optimisation. With the optimisations dupes will be silently
1701 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1702 the following demonstrates:
1703
1704  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1705
1706 which prints out 'word' three times, but
1707
1708  'words'=~/(word|word|word)(?{ print $1 })S/
1709
1710 which doesnt print it out at all. This is due to other optimisations kicking in.
1711
1712 Example of what happens on a structural level:
1713
1714 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1715
1716    1: CURLYM[1] {1,32767}(18)
1717    5:   BRANCH(8)
1718    6:     EXACT <ac>(16)
1719    8:   BRANCH(11)
1720    9:     EXACT <ad>(16)
1721   11:   BRANCH(14)
1722   12:     EXACT <ab>(16)
1723   16:   SUCCEED(0)
1724   17:   NOTHING(18)
1725   18: END(0)
1726
1727 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1728 and should turn into:
1729
1730    1: CURLYM[1] {1,32767}(18)
1731    5:   TRIE(16)
1732         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1733           <ac>
1734           <ad>
1735           <ab>
1736   16:   SUCCEED(0)
1737   17:   NOTHING(18)
1738   18: END(0)
1739
1740 Cases where tail != last would be like /(?foo|bar)baz/:
1741
1742    1: BRANCH(4)
1743    2:   EXACT <foo>(8)
1744    4: BRANCH(7)
1745    5:   EXACT <bar>(8)
1746    7: TAIL(8)
1747    8: EXACT <baz>(10)
1748   10: END(0)
1749
1750 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1751 and would end up looking like:
1752
1753     1: TRIE(8)
1754       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1755         <foo>
1756         <bar>
1757    7: TAIL(8)
1758    8: EXACT <baz>(10)
1759   10: END(0)
1760
1761     d = uvchr_to_utf8_flags(d, uv, 0);
1762
1763 is the recommended Unicode-aware way of saying
1764
1765     *(d++) = uv;
1766 */
1767
1768 #define TRIE_STORE_REVCHAR(val)                                            \
1769     STMT_START {                                                           \
1770         if (UTF) {                                                         \
1771             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1772             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1773             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1774             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1775             SvPOK_on(zlopp);                                               \
1776             SvUTF8_on(zlopp);                                              \
1777             av_push(revcharmap, zlopp);                                    \
1778         } else {                                                           \
1779             char ooooff = (char)val;                                           \
1780             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1781         }                                                                  \
1782         } STMT_END
1783
1784 /* This gets the next character from the input, folding it if not already
1785  * folded. */
1786 #define TRIE_READ_CHAR STMT_START {                                           \
1787     wordlen++;                                                                \
1788     if ( UTF ) {                                                              \
1789         /* if it is UTF then it is either already folded, or does not need    \
1790          * folding */                                                         \
1791         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1792     }                                                                         \
1793     else if (folder == PL_fold_latin1) {                                      \
1794         /* This folder implies Unicode rules, which in the range expressible  \
1795          *  by not UTF is the lower case, with the two exceptions, one of     \
1796          *  which should have been taken care of before calling this */       \
1797         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1798         uvc = toLOWER_L1(*uc);                                                \
1799         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1800         len = 1;                                                              \
1801     } else {                                                                  \
1802         /* raw data, will be folded later if needed */                        \
1803         uvc = (U32)*uc;                                                       \
1804         len = 1;                                                              \
1805     }                                                                         \
1806 } STMT_END
1807
1808
1809
1810 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1811     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1812         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1813         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1814     }                                                           \
1815     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1816     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1817     TRIE_LIST_CUR( state )++;                                   \
1818 } STMT_END
1819
1820 #define TRIE_LIST_NEW(state) STMT_START {                       \
1821     Newxz( trie->states[ state ].trans.list,               \
1822         4, reg_trie_trans_le );                                 \
1823      TRIE_LIST_CUR( state ) = 1;                                \
1824      TRIE_LIST_LEN( state ) = 4;                                \
1825 } STMT_END
1826
1827 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1828     U16 dupe= trie->states[ state ].wordnum;                    \
1829     regnode * const noper_next = regnext( noper );              \
1830                                                                 \
1831     DEBUG_r({                                                   \
1832         /* store the word for dumping */                        \
1833         SV* tmp;                                                \
1834         if (OP(noper) != NOTHING)                               \
1835             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1836         else                                                    \
1837             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1838         av_push( trie_words, tmp );                             \
1839     });                                                         \
1840                                                                 \
1841     curword++;                                                  \
1842     trie->wordinfo[curword].prev   = 0;                         \
1843     trie->wordinfo[curword].len    = wordlen;                   \
1844     trie->wordinfo[curword].accept = state;                     \
1845                                                                 \
1846     if ( noper_next < tail ) {                                  \
1847         if (!trie->jump)                                        \
1848             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1849         trie->jump[curword] = (U16)(noper_next - convert);      \
1850         if (!jumper)                                            \
1851             jumper = noper_next;                                \
1852         if (!nextbranch)                                        \
1853             nextbranch= regnext(cur);                           \
1854     }                                                           \
1855                                                                 \
1856     if ( dupe ) {                                               \
1857         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1858         /* chain, so that when the bits of chain are later    */\
1859         /* linked together, the dups appear in the chain      */\
1860         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1861         trie->wordinfo[dupe].prev = curword;                    \
1862     } else {                                                    \
1863         /* we haven't inserted this word yet.                */ \
1864         trie->states[ state ].wordnum = curword;                \
1865     }                                                           \
1866 } STMT_END
1867
1868
1869 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1870      ( ( base + charid >=  ucharcount                                   \
1871          && base + charid < ubound                                      \
1872          && state == trie->trans[ base - ucharcount + charid ].check    \
1873          && trie->trans[ base - ucharcount + charid ].next )            \
1874            ? trie->trans[ base - ucharcount + charid ].next             \
1875            : ( state==1 ? special : 0 )                                 \
1876       )
1877
1878 #define MADE_TRIE       1
1879 #define MADE_JUMP_TRIE  2
1880 #define MADE_EXACT_TRIE 4
1881
1882 STATIC I32
1883 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1884 {
1885     dVAR;
1886     /* first pass, loop through and scan words */
1887     reg_trie_data *trie;
1888     HV *widecharmap = NULL;
1889     AV *revcharmap = newAV();
1890     regnode *cur;
1891     STRLEN len = 0;
1892     UV uvc = 0;
1893     U16 curword = 0;
1894     U32 next_alloc = 0;
1895     regnode *jumper = NULL;
1896     regnode *nextbranch = NULL;
1897     regnode *convert = NULL;
1898     U32 *prev_states; /* temp array mapping each state to previous one */
1899     /* we just use folder as a flag in utf8 */
1900     const U8 * folder = NULL;
1901
1902 #ifdef DEBUGGING
1903     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1904     AV *trie_words = NULL;
1905     /* along with revcharmap, this only used during construction but both are
1906      * useful during debugging so we store them in the struct when debugging.
1907      */
1908 #else
1909     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1910     STRLEN trie_charcount=0;
1911 #endif
1912     SV *re_trie_maxbuff;
1913     GET_RE_DEBUG_FLAGS_DECL;
1914
1915     PERL_ARGS_ASSERT_MAKE_TRIE;
1916 #ifndef DEBUGGING
1917     PERL_UNUSED_ARG(depth);
1918 #endif
1919
1920     switch (flags) {
1921         case EXACT: break;
1922         case EXACTFA:
1923         case EXACTFU_SS:
1924         case EXACTFU: folder = PL_fold_latin1; break;
1925         case EXACTF:  folder = PL_fold; break;
1926         case EXACTFL: folder = PL_fold_locale; break;
1927         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1928     }
1929
1930     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1931     trie->refcount = 1;
1932     trie->startstate = 1;
1933     trie->wordcount = word_count;
1934     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1935     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1936     if (flags == EXACT)
1937         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1938     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1939                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1940
1941     DEBUG_r({
1942         trie_words = newAV();
1943     });
1944
1945     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1946     if (!SvIOK(re_trie_maxbuff)) {
1947         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1948     }
1949     DEBUG_TRIE_COMPILE_r({
1950                 PerlIO_printf( Perl_debug_log,
1951                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1952                   (int)depth * 2 + 2, "",
1953                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1954                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1955                   (int)depth);
1956     });
1957
1958    /* Find the node we are going to overwrite */
1959     if ( first == startbranch && OP( last ) != BRANCH ) {
1960         /* whole branch chain */
1961         convert = first;
1962     } else {
1963         /* branch sub-chain */
1964         convert = NEXTOPER( first );
1965     }
1966
1967     /*  -- First loop and Setup --
1968
1969        We first traverse the branches and scan each word to determine if it
1970        contains widechars, and how many unique chars there are, this is
1971        important as we have to build a table with at least as many columns as we
1972        have unique chars.
1973
1974        We use an array of integers to represent the character codes 0..255
1975        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1976        native representation of the character value as the key and IV's for the
1977        coded index.
1978
1979        *TODO* If we keep track of how many times each character is used we can
1980        remap the columns so that the table compression later on is more
1981        efficient in terms of memory by ensuring the most common value is in the
1982        middle and the least common are on the outside.  IMO this would be better
1983        than a most to least common mapping as theres a decent chance the most
1984        common letter will share a node with the least common, meaning the node
1985        will not be compressible. With a middle is most common approach the worst
1986        case is when we have the least common nodes twice.
1987
1988      */
1989
1990     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1991         regnode *noper = NEXTOPER( cur );
1992         const U8 *uc = (U8*)STRING( noper );
1993         const U8 *e  = uc + STR_LEN( noper );
1994         STRLEN foldlen = 0;
1995         U32 wordlen      = 0;         /* required init */
1996         STRLEN minbytes = 0;
1997         STRLEN maxbytes = 0;
1998         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1999
2000         if (OP(noper) == NOTHING) {
2001             regnode *noper_next= regnext(noper);
2002             if (noper_next != tail && OP(noper_next) == flags) {
2003                 noper = noper_next;
2004                 uc= (U8*)STRING(noper);
2005                 e= uc + STR_LEN(noper);
2006                 trie->minlen= STR_LEN(noper);
2007             } else {
2008                 trie->minlen= 0;
2009                 continue;
2010             }
2011         }
2012
2013         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2014             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2015                                           regardless of encoding */
2016             if (OP( noper ) == EXACTFU_SS) {
2017                 /* false positives are ok, so just set this */
2018                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2019             }
2020         }
2021         for ( ; uc < e ; uc += len ) {
2022             TRIE_CHARCOUNT(trie)++;
2023             TRIE_READ_CHAR;
2024
2025             /* Acummulate to the current values, the range in the number of
2026              * bytes that this character could match.  The max is presumed to
2027              * be the same as the folded input (which TRIE_READ_CHAR returns),
2028              * except that when this is not in UTF-8, it could be matched
2029              * against a string which is UTF-8, and the variant characters
2030              * could be 2 bytes instead of the 1 here.  Likewise, for the
2031              * minimum number of bytes when not folded.  When folding, the min
2032              * is assumed to be 1 byte could fold to match the single character
2033              * here, or in the case of a multi-char fold, 1 byte can fold to
2034              * the whole sequence.  'foldlen' is used to denote whether we are
2035              * in such a sequence, skipping the min setting if so.  XXX TODO
2036              * Use the exact list of what folds to each character, from
2037              * PL_utf8_foldclosures */
2038             if (UTF) {
2039                 maxbytes += UTF8SKIP(uc);
2040                 if (! folder) {
2041                     /* A non-UTF-8 string could be 1 byte to match our 2 */
2042                     minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2043                                 ? 1
2044                                 : UTF8SKIP(uc);
2045                 }
2046                 else {
2047                     if (foldlen) {
2048                         foldlen -= UTF8SKIP(uc);
2049                     }
2050                     else {
2051                         foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2052                         minbytes++;
2053                     }
2054                 }
2055             }
2056             else {
2057                 maxbytes += (UNI_IS_INVARIANT(*uc))
2058                              ? 1
2059                              : 2;
2060                 if (! folder) {
2061                     minbytes++;
2062                 }
2063                 else {
2064                     if (foldlen) {
2065                         foldlen--;
2066                     }
2067                     else {
2068                         foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2069                         minbytes++;
2070                     }
2071                 }
2072             }
2073             if ( uvc < 256 ) {
2074                 if ( folder ) {
2075                     U8 folded= folder[ (U8) uvc ];
2076                     if ( !trie->charmap[ folded ] ) {
2077                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2078                         TRIE_STORE_REVCHAR( folded );
2079                     }
2080                 }
2081                 if ( !trie->charmap[ uvc ] ) {
2082                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2083                     TRIE_STORE_REVCHAR( uvc );
2084                 }
2085                 if ( set_bit ) {
2086                     /* store the codepoint in the bitmap, and its folded
2087                      * equivalent. */
2088                     TRIE_BITMAP_SET(trie, uvc);
2089
2090                     /* store the folded codepoint */
2091                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2092
2093                     if ( !UTF ) {
2094                         /* store first byte of utf8 representation of
2095                            variant codepoints */
2096                         if (! UVCHR_IS_INVARIANT(uvc)) {
2097                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2098                         }
2099                     }
2100                     set_bit = 0; /* We've done our bit :-) */
2101                 }
2102             } else {
2103                 SV** svpp;
2104                 if ( !widecharmap )
2105                     widecharmap = newHV();
2106
2107                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2108
2109                 if ( !svpp )
2110                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2111
2112                 if ( !SvTRUE( *svpp ) ) {
2113                     sv_setiv( *svpp, ++trie->uniquecharcount );
2114                     TRIE_STORE_REVCHAR(uvc);
2115                 }
2116             }
2117         }
2118         if( cur == first ) {
2119             trie->minlen = minbytes;
2120             trie->maxlen = maxbytes;
2121         } else if (minbytes < trie->minlen) {
2122             trie->minlen = minbytes;
2123         } else if (maxbytes > trie->maxlen) {
2124             trie->maxlen = maxbytes;
2125         }
2126     } /* end first pass */
2127     DEBUG_TRIE_COMPILE_r(
2128         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2129                 (int)depth * 2 + 2,"",
2130                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2131                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2132                 (int)trie->minlen, (int)trie->maxlen )
2133     );
2134
2135     /*
2136         We now know what we are dealing with in terms of unique chars and
2137         string sizes so we can calculate how much memory a naive
2138         representation using a flat table  will take. If it's over a reasonable
2139         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2140         conservative but potentially much slower representation using an array
2141         of lists.
2142
2143         At the end we convert both representations into the same compressed
2144         form that will be used in regexec.c for matching with. The latter
2145         is a form that cannot be used to construct with but has memory
2146         properties similar to the list form and access properties similar
2147         to the table form making it both suitable for fast searches and
2148         small enough that its feasable to store for the duration of a program.
2149
2150         See the comment in the code where the compressed table is produced
2151         inplace from the flat tabe representation for an explanation of how
2152         the compression works.
2153
2154     */
2155
2156
2157     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2158     prev_states[1] = 0;
2159
2160     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
2161         /*
2162             Second Pass -- Array Of Lists Representation
2163
2164             Each state will be represented by a list of charid:state records
2165             (reg_trie_trans_le) the first such element holds the CUR and LEN
2166             points of the allocated array. (See defines above).
2167
2168             We build the initial structure using the lists, and then convert
2169             it into the compressed table form which allows faster lookups
2170             (but cant be modified once converted).
2171         */
2172
2173         STRLEN transcount = 1;
2174
2175         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2176             "%*sCompiling trie using list compiler\n",
2177             (int)depth * 2 + 2, ""));
2178
2179         trie->states = (reg_trie_state *)
2180             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2181                                   sizeof(reg_trie_state) );
2182         TRIE_LIST_NEW(1);
2183         next_alloc = 2;
2184
2185         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2186
2187             regnode *noper   = NEXTOPER( cur );
2188             U8 *uc           = (U8*)STRING( noper );
2189             const U8 *e      = uc + STR_LEN( noper );
2190             U32 state        = 1;         /* required init */
2191             U16 charid       = 0;         /* sanity init */
2192             U32 wordlen      = 0;         /* required init */
2193
2194             if (OP(noper) == NOTHING) {
2195                 regnode *noper_next= regnext(noper);
2196                 if (noper_next != tail && OP(noper_next) == flags) {
2197                     noper = noper_next;
2198                     uc= (U8*)STRING(noper);
2199                     e= uc + STR_LEN(noper);
2200                 }
2201             }
2202
2203             if (OP(noper) != NOTHING) {
2204                 for ( ; uc < e ; uc += len ) {
2205
2206                     TRIE_READ_CHAR;
2207
2208                     if ( uvc < 256 ) {
2209                         charid = trie->charmap[ uvc ];
2210                     } else {
2211                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2212                         if ( !svpp ) {
2213                             charid = 0;
2214                         } else {
2215                             charid=(U16)SvIV( *svpp );
2216                         }
2217                     }
2218                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2219                     if ( charid ) {
2220
2221                         U16 check;
2222                         U32 newstate = 0;
2223
2224                         charid--;
2225                         if ( !trie->states[ state ].trans.list ) {
2226                             TRIE_LIST_NEW( state );
2227                         }
2228                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2229                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2230                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2231                                 break;
2232                             }
2233                         }
2234                         if ( ! newstate ) {
2235                             newstate = next_alloc++;
2236                             prev_states[newstate] = state;
2237                             TRIE_LIST_PUSH( state, charid, newstate );
2238                             transcount++;
2239                         }
2240                         state = newstate;
2241                     } else {
2242                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2243                     }
2244                 }
2245             }
2246             TRIE_HANDLE_WORD(state);
2247
2248         } /* end second pass */
2249
2250         /* next alloc is the NEXT state to be allocated */
2251         trie->statecount = next_alloc;
2252         trie->states = (reg_trie_state *)
2253             PerlMemShared_realloc( trie->states,
2254                                    next_alloc
2255                                    * sizeof(reg_trie_state) );
2256
2257         /* and now dump it out before we compress it */
2258         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2259                                                          revcharmap, next_alloc,
2260                                                          depth+1)
2261         );
2262
2263         trie->trans = (reg_trie_trans *)
2264             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2265         {
2266             U32 state;
2267             U32 tp = 0;
2268             U32 zp = 0;
2269
2270
2271             for( state=1 ; state < next_alloc ; state ++ ) {
2272                 U32 base=0;
2273
2274                 /*
2275                 DEBUG_TRIE_COMPILE_MORE_r(
2276                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2277                 );
2278                 */
2279
2280                 if (trie->states[state].trans.list) {
2281                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2282                     U16 maxid=minid;
2283                     U16 idx;
2284
2285                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2286                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2287                         if ( forid < minid ) {
2288                             minid=forid;
2289                         } else if ( forid > maxid ) {
2290                             maxid=forid;
2291                         }
2292                     }
2293                     if ( transcount < tp + maxid - minid + 1) {
2294                         transcount *= 2;
2295                         trie->trans = (reg_trie_trans *)
2296                             PerlMemShared_realloc( trie->trans,
2297                                                      transcount
2298                                                      * sizeof(reg_trie_trans) );
2299                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2300                     }
2301                     base = trie->uniquecharcount + tp - minid;
2302                     if ( maxid == minid ) {
2303                         U32 set = 0;
2304                         for ( ; zp < tp ; zp++ ) {
2305                             if ( ! trie->trans[ zp ].next ) {
2306                                 base = trie->uniquecharcount + zp - minid;
2307                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2308                                 trie->trans[ zp ].check = state;
2309                                 set = 1;
2310                                 break;
2311                             }
2312                         }
2313                         if ( !set ) {
2314                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2315                             trie->trans[ tp ].check = state;
2316                             tp++;
2317                             zp = tp;
2318                         }
2319                     } else {
2320                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2321                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2322                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2323                             trie->trans[ tid ].check = state;
2324                         }
2325                         tp += ( maxid - minid + 1 );
2326                     }
2327                     Safefree(trie->states[ state ].trans.list);
2328                 }
2329                 /*
2330                 DEBUG_TRIE_COMPILE_MORE_r(
2331                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2332                 );
2333                 */
2334                 trie->states[ state ].trans.base=base;
2335             }
2336             trie->lasttrans = tp + 1;
2337         }
2338     } else {
2339         /*
2340            Second Pass -- Flat Table Representation.
2341
2342            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2343            each.  We know that we will need Charcount+1 trans at most to store
2344            the data (one row per char at worst case) So we preallocate both
2345            structures assuming worst case.
2346
2347            We then construct the trie using only the .next slots of the entry
2348            structs.
2349
2350            We use the .check field of the first entry of the node temporarily
2351            to make compression both faster and easier by keeping track of how
2352            many non zero fields are in the node.
2353
2354            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2355            transition.
2356
2357            There are two terms at use here: state as a TRIE_NODEIDX() which is
2358            a number representing the first entry of the node, and state as a
2359            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2360            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2361            if there are 2 entrys per node. eg:
2362
2363              A B       A B
2364           1. 2 4    1. 3 7
2365           2. 0 3    3. 0 5
2366           3. 0 0    5. 0 0
2367           4. 0 0    7. 0 0
2368
2369            The table is internally in the right hand, idx form. However as we
2370            also have to deal with the states array which is indexed by nodenum
2371            we have to use TRIE_NODENUM() to convert.
2372
2373         */
2374         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2375             "%*sCompiling trie using table compiler\n",
2376             (int)depth * 2 + 2, ""));
2377
2378         trie->trans = (reg_trie_trans *)
2379             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2380                                   * trie->uniquecharcount + 1,
2381                                   sizeof(reg_trie_trans) );
2382         trie->states = (reg_trie_state *)
2383             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2384                                   sizeof(reg_trie_state) );
2385         next_alloc = trie->uniquecharcount + 1;
2386
2387
2388         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2389
2390             regnode *noper   = NEXTOPER( cur );
2391             const U8 *uc     = (U8*)STRING( noper );
2392             const U8 *e      = uc + STR_LEN( noper );
2393
2394             U32 state        = 1;         /* required init */
2395
2396             U16 charid       = 0;         /* sanity init */
2397             U32 accept_state = 0;         /* sanity init */
2398
2399             U32 wordlen      = 0;         /* required init */
2400
2401             if (OP(noper) == NOTHING) {
2402                 regnode *noper_next= regnext(noper);
2403                 if (noper_next != tail && OP(noper_next) == flags) {
2404                     noper = noper_next;
2405                     uc= (U8*)STRING(noper);
2406                     e= uc + STR_LEN(noper);
2407                 }
2408             }
2409
2410             if ( OP(noper) != NOTHING ) {
2411                 for ( ; uc < e ; uc += len ) {
2412
2413                     TRIE_READ_CHAR;
2414
2415                     if ( uvc < 256 ) {
2416                         charid = trie->charmap[ uvc ];
2417                     } else {
2418                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2419                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2420                     }
2421                     if ( charid ) {
2422                         charid--;
2423                         if ( !trie->trans[ state + charid ].next ) {
2424                             trie->trans[ state + charid ].next = next_alloc;
2425                             trie->trans[ state ].check++;
2426                             prev_states[TRIE_NODENUM(next_alloc)]
2427                                     = TRIE_NODENUM(state);
2428                             next_alloc += trie->uniquecharcount;
2429                         }
2430                         state = trie->trans[ state + charid ].next;
2431                     } else {
2432                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2433                     }
2434                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2435                 }
2436             }
2437             accept_state = TRIE_NODENUM( state );
2438             TRIE_HANDLE_WORD(accept_state);
2439
2440         } /* end second pass */
2441
2442         /* and now dump it out before we compress it */
2443         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2444                                                           revcharmap,
2445                                                           next_alloc, depth+1));
2446
2447         {
2448         /*
2449            * Inplace compress the table.*
2450
2451            For sparse data sets the table constructed by the trie algorithm will
2452            be mostly 0/FAIL transitions or to put it another way mostly empty.
2453            (Note that leaf nodes will not contain any transitions.)
2454
2455            This algorithm compresses the tables by eliminating most such
2456            transitions, at the cost of a modest bit of extra work during lookup:
2457
2458            - Each states[] entry contains a .base field which indicates the
2459            index in the state[] array wheres its transition data is stored.
2460
2461            - If .base is 0 there are no valid transitions from that node.
2462
2463            - If .base is nonzero then charid is added to it to find an entry in
2464            the trans array.
2465
2466            -If trans[states[state].base+charid].check!=state then the
2467            transition is taken to be a 0/Fail transition. Thus if there are fail
2468            transitions at the front of the node then the .base offset will point
2469            somewhere inside the previous nodes data (or maybe even into a node
2470            even earlier), but the .check field determines if the transition is
2471            valid.
2472
2473            XXX - wrong maybe?
2474            The following process inplace converts the table to the compressed
2475            table: We first do not compress the root node 1,and mark all its
2476            .check pointers as 1 and set its .base pointer as 1 as well. This
2477            allows us to do a DFA construction from the compressed table later,
2478            and ensures that any .base pointers we calculate later are greater
2479            than 0.
2480
2481            - We set 'pos' to indicate the first entry of the second node.
2482
2483            - We then iterate over the columns of the node, finding the first and
2484            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2485            and set the .check pointers accordingly, and advance pos
2486            appropriately and repreat for the next node. Note that when we copy
2487            the next pointers we have to convert them from the original
2488            NODEIDX form to NODENUM form as the former is not valid post
2489            compression.
2490
2491            - If a node has no transitions used we mark its base as 0 and do not
2492            advance the pos pointer.
2493
2494            - If a node only has one transition we use a second pointer into the
2495            structure to fill in allocated fail transitions from other states.
2496            This pointer is independent of the main pointer and scans forward
2497            looking for null transitions that are allocated to a state. When it
2498            finds one it writes the single transition into the "hole".  If the
2499            pointer doesnt find one the single transition is appended as normal.
2500
2501            - Once compressed we can Renew/realloc the structures to release the
2502            excess space.
2503
2504            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2505            specifically Fig 3.47 and the associated pseudocode.
2506
2507            demq
2508         */
2509         const U32 laststate = TRIE_NODENUM( next_alloc );
2510         U32 state, charid;
2511         U32 pos = 0, zp=0;
2512         trie->statecount = laststate;
2513
2514         for ( state = 1 ; state < laststate ; state++ ) {
2515             U8 flag = 0;
2516             const U32 stateidx = TRIE_NODEIDX( state );
2517             const U32 o_used = trie->trans[ stateidx ].check;
2518             U32 used = trie->trans[ stateidx ].check;
2519             trie->trans[ stateidx ].check = 0;
2520
2521             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2522                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2523                     if ( trie->trans[ stateidx + charid ].next ) {
2524                         if (o_used == 1) {
2525                             for ( ; zp < pos ; zp++ ) {
2526                                 if ( ! trie->trans[ zp ].next ) {
2527                                     break;
2528                                 }
2529                             }
2530                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2531                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2532                             trie->trans[ zp ].check = state;
2533                             if ( ++zp > pos ) pos = zp;
2534                             break;
2535                         }
2536                         used--;
2537                     }
2538                     if ( !flag ) {
2539                         flag = 1;
2540                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2541                     }
2542                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2543                     trie->trans[ pos ].check = state;
2544                     pos++;
2545                 }
2546             }
2547         }
2548         trie->lasttrans = pos + 1;
2549         trie->states = (reg_trie_state *)
2550             PerlMemShared_realloc( trie->states, laststate
2551                                    * sizeof(reg_trie_state) );
2552         DEBUG_TRIE_COMPILE_MORE_r(
2553                 PerlIO_printf( Perl_debug_log,
2554                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2555                     (int)depth * 2 + 2,"",
2556                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2557                     (IV)next_alloc,
2558                     (IV)pos,
2559                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2560             );
2561
2562         } /* end table compress */
2563     }
2564     DEBUG_TRIE_COMPILE_MORE_r(
2565             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2566                 (int)depth * 2 + 2, "",
2567                 (UV)trie->statecount,
2568                 (UV)trie->lasttrans)
2569     );
2570     /* resize the trans array to remove unused space */
2571     trie->trans = (reg_trie_trans *)
2572         PerlMemShared_realloc( trie->trans, trie->lasttrans
2573                                * sizeof(reg_trie_trans) );
2574
2575     {   /* Modify the program and insert the new TRIE node */
2576         U8 nodetype =(U8)(flags & 0xFF);
2577         char *str=NULL;
2578
2579 #ifdef DEBUGGING
2580         regnode *optimize = NULL;
2581 #ifdef RE_TRACK_PATTERN_OFFSETS
2582
2583         U32 mjd_offset = 0;
2584         U32 mjd_nodelen = 0;
2585 #endif /* RE_TRACK_PATTERN_OFFSETS */
2586 #endif /* DEBUGGING */
2587         /*
2588            This means we convert either the first branch or the first Exact,
2589            depending on whether the thing following (in 'last') is a branch
2590            or not and whther first is the startbranch (ie is it a sub part of
2591            the alternation or is it the whole thing.)
2592            Assuming its a sub part we convert the EXACT otherwise we convert
2593            the whole branch sequence, including the first.
2594          */
2595         /* Find the node we are going to overwrite */
2596         if ( first != startbranch || OP( last ) == BRANCH ) {
2597             /* branch sub-chain */
2598             NEXT_OFF( first ) = (U16)(last - first);
2599 #ifdef RE_TRACK_PATTERN_OFFSETS
2600             DEBUG_r({
2601                 mjd_offset= Node_Offset((convert));
2602                 mjd_nodelen= Node_Length((convert));
2603             });
2604 #endif
2605             /* whole branch chain */
2606         }
2607 #ifdef RE_TRACK_PATTERN_OFFSETS
2608         else {
2609             DEBUG_r({
2610                 const  regnode *nop = NEXTOPER( convert );
2611                 mjd_offset= Node_Offset((nop));
2612                 mjd_nodelen= Node_Length((nop));
2613             });
2614         }
2615         DEBUG_OPTIMISE_r(
2616             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2617                 (int)depth * 2 + 2, "",
2618                 (UV)mjd_offset, (UV)mjd_nodelen)
2619         );
2620 #endif
2621         /* But first we check to see if there is a common prefix we can
2622            split out as an EXACT and put in front of the TRIE node.  */
2623         trie->startstate= 1;
2624         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2625             U32 state;
2626             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2627                 U32 ofs = 0;
2628                 I32 idx = -1;
2629                 U32 count = 0;
2630                 const U32 base = trie->states[ state ].trans.base;
2631
2632                 if ( trie->states[state].wordnum )
2633                         count = 1;
2634
2635                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2636                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2637                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2638                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2639                     {
2640                         if ( ++count > 1 ) {
2641                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2642                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2643                             if ( state == 1 ) break;
2644                             if ( count == 2 ) {
2645                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2646                                 DEBUG_OPTIMISE_r(
2647                                     PerlIO_printf(Perl_debug_log,
2648                                         "%*sNew Start State=%"UVuf" Class: [",
2649                                         (int)depth * 2 + 2, "",
2650                                         (UV)state));
2651                                 if (idx >= 0) {
2652                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2653                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2654
2655                                     TRIE_BITMAP_SET(trie,*ch);
2656                                     if ( folder )
2657                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2658                                     DEBUG_OPTIMISE_r(
2659                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2660                                     );
2661                                 }
2662                             }
2663                             TRIE_BITMAP_SET(trie,*ch);
2664                             if ( folder )
2665                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2666                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2667                         }
2668                         idx = ofs;
2669                     }
2670                 }
2671                 if ( count == 1 ) {
2672                     SV **tmp = av_fetch( revcharmap, idx, 0);
2673                     STRLEN len;
2674                     char *ch = SvPV( *tmp, len );
2675                     DEBUG_OPTIMISE_r({
2676                         SV *sv=sv_newmortal();
2677                         PerlIO_printf( Perl_debug_log,
2678                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2679                             (int)depth * 2 + 2, "",
2680                             (UV)state, (UV)idx,
2681                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2682                                 PL_colors[0], PL_colors[1],
2683                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2684                                 PERL_PV_ESCAPE_FIRSTCHAR
2685                             )
2686                         );
2687                     });
2688                     if ( state==1 ) {
2689                         OP( convert ) = nodetype;
2690                         str=STRING(convert);
2691                         STR_LEN(convert)=0;
2692                     }
2693                     STR_LEN(convert) += len;
2694                     while (len--)
2695                         *str++ = *ch++;
2696                 } else {
2697 #ifdef DEBUGGING
2698                     if (state>1)
2699                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2700 #endif
2701                     break;
2702                 }
2703             }
2704             trie->prefixlen = (state-1);
2705             if (str) {
2706                 regnode *n = convert+NODE_SZ_STR(convert);
2707                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2708                 trie->startstate = state;
2709                 trie->minlen -= (state - 1);
2710                 trie->maxlen -= (state - 1);
2711 #ifdef DEBUGGING
2712                /* At least the UNICOS C compiler choked on this
2713                 * being argument to DEBUG_r(), so let's just have
2714                 * it right here. */
2715                if (
2716 #ifdef PERL_EXT_RE_BUILD
2717                    1
2718 #else
2719                    DEBUG_r_TEST
2720 #endif
2721                    ) {
2722                    regnode *fix = convert;
2723                    U32 word = trie->wordcount;
2724                    mjd_nodelen++;
2725                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2726                    while( ++fix < n ) {
2727                        Set_Node_Offset_Length(fix, 0, 0);
2728                    }
2729                    while (word--) {
2730                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2731                        if (tmp) {
2732                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2733                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2734                            else
2735                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2736                        }
2737                    }
2738                }
2739 #endif
2740                 if (trie->maxlen) {
2741                     convert = n;
2742                 } else {
2743                     NEXT_OFF(convert) = (U16)(tail - convert);
2744                     DEBUG_r(optimize= n);
2745                 }
2746             }
2747         }
2748         if (!jumper)
2749             jumper = last;
2750         if ( trie->maxlen ) {
2751             NEXT_OFF( convert ) = (U16)(tail - convert);
2752             ARG_SET( convert, data_slot );
2753             /* Store the offset to the first unabsorbed branch in
2754                jump[0], which is otherwise unused by the jump logic.
2755                We use this when dumping a trie and during optimisation. */
2756             if (trie->jump)
2757                 trie->jump[0] = (U16)(nextbranch - convert);
2758
2759             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2760              *   and there is a bitmap
2761              *   and the first "jump target" node we found leaves enough room
2762              * then convert the TRIE node into a TRIEC node, with the bitmap
2763              * embedded inline in the opcode - this is hypothetically faster.
2764              */
2765             if ( !trie->states[trie->startstate].wordnum
2766                  && trie->bitmap
2767                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2768             {
2769                 OP( convert ) = TRIEC;
2770                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2771                 PerlMemShared_free(trie->bitmap);
2772                 trie->bitmap= NULL;
2773             } else
2774                 OP( convert ) = TRIE;
2775
2776             /* store the type in the flags */
2777             convert->flags = nodetype;
2778             DEBUG_r({
2779             optimize = convert
2780                       + NODE_STEP_REGNODE
2781                       + regarglen[ OP( convert ) ];
2782             });
2783             /* XXX We really should free up the resource in trie now,
2784                    as we won't use them - (which resources?) dmq */
2785         }
2786         /* needed for dumping*/
2787         DEBUG_r(if (optimize) {
2788             regnode *opt = convert;
2789
2790             while ( ++opt < optimize) {
2791                 Set_Node_Offset_Length(opt,0,0);
2792             }
2793             /*
2794                 Try to clean up some of the debris left after the
2795                 optimisation.
2796              */
2797             while( optimize < jumper ) {
2798                 mjd_nodelen += Node_Length((optimize));
2799                 OP( optimize ) = OPTIMIZED;
2800                 Set_Node_Offset_Length(optimize,0,0);
2801                 optimize++;
2802             }
2803             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2804         });
2805     } /* end node insert */
2806
2807     /*  Finish populating the prev field of the wordinfo array.  Walk back
2808      *  from each accept state until we find another accept state, and if
2809      *  so, point the first word's .prev field at the second word. If the
2810      *  second already has a .prev field set, stop now. This will be the
2811      *  case either if we've already processed that word's accept state,
2812      *  or that state had multiple words, and the overspill words were
2813      *  already linked up earlier.
2814      */
2815     {
2816         U16 word;
2817         U32 state;
2818         U16 prev;
2819
2820         for (word=1; word <= trie->wordcount; word++) {
2821             prev = 0;
2822             if (trie->wordinfo[word].prev)
2823                 continue;
2824             state = trie->wordinfo[word].accept;
2825             while (state) {
2826                 state = prev_states[state];
2827                 if (!state)
2828                     break;
2829                 prev = trie->states[state].wordnum;
2830                 if (prev)
2831                     break;
2832             }
2833             trie->wordinfo[word].prev = prev;
2834         }
2835         Safefree(prev_states);
2836     }
2837
2838
2839     /* and now dump out the compressed format */
2840     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2841
2842     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2843 #ifdef DEBUGGING
2844     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2845     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2846 #else
2847     SvREFCNT_dec_NN(revcharmap);
2848 #endif
2849     return trie->jump
2850            ? MADE_JUMP_TRIE
2851            : trie->startstate>1
2852              ? MADE_EXACT_TRIE
2853              : MADE_TRIE;
2854 }
2855
2856 STATIC void
2857 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2858 {
2859 /* The Trie is constructed and compressed now so we can build a fail array if
2860  * it's needed
2861
2862    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2863    3.32 in the
2864    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2865    Ullman 1985/88
2866    ISBN 0-201-10088-6
2867
2868    We find the fail state for each state in the trie, this state is the longest
2869    proper suffix of the current state's 'word' that is also a proper prefix of
2870    another word in our trie. State 1 represents the word '' and is thus the
2871    default fail state. This allows the DFA not to have to restart after its
2872    tried and failed a word at a given point, it simply continues as though it
2873    had been matching the other word in the first place.
2874    Consider
2875       'abcdgu'=~/abcdefg|cdgu/
2876    When we get to 'd' we are still matching the first word, we would encounter
2877    'g' which would fail, which would bring us to the state representing 'd' in
2878    the second word where we would try 'g' and succeed, proceeding to match
2879    'cdgu'.
2880  */
2881  /* add a fail transition */
2882     const U32 trie_offset = ARG(source);
2883     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2884     U32 *q;
2885     const U32 ucharcount = trie->uniquecharcount;
2886     const U32 numstates = trie->statecount;
2887     const U32 ubound = trie->lasttrans + ucharcount;
2888     U32 q_read = 0;
2889     U32 q_write = 0;
2890     U32 charid;
2891     U32 base = trie->states[ 1 ].trans.base;
2892     U32 *fail;
2893     reg_ac_data *aho;
2894     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
2895     GET_RE_DEBUG_FLAGS_DECL;
2896
2897     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2898 #ifndef DEBUGGING
2899     PERL_UNUSED_ARG(depth);
2900 #endif
2901
2902
2903     ARG_SET( stclass, data_slot );
2904     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2905     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2906     aho->trie=trie_offset;
2907     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2908     Copy( trie->states, aho->states, numstates, reg_trie_state );
2909     Newxz( q, numstates, U32);
2910     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2911     aho->refcount = 1;
2912     fail = aho->fail;
2913     /* initialize fail[0..1] to be 1 so that we always have
2914        a valid final fail state */
2915     fail[ 0 ] = fail[ 1 ] = 1;
2916
2917     for ( charid = 0; charid < ucharcount ; charid++ ) {
2918         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2919         if ( newstate ) {
2920             q[ q_write ] = newstate;
2921             /* set to point at the root */
2922             fail[ q[ q_write++ ] ]=1;
2923         }
2924     }
2925     while ( q_read < q_write) {
2926         const U32 cur = q[ q_read++ % numstates ];
2927         base = trie->states[ cur ].trans.base;
2928
2929         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2930             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2931             if (ch_state) {
2932                 U32 fail_state = cur;
2933                 U32 fail_base;
2934                 do {
2935                     fail_state = fail[ fail_state ];
2936                     fail_base = aho->states[ fail_state ].trans.base;
2937                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2938
2939                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2940                 fail[ ch_state ] = fail_state;
2941                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2942                 {
2943                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2944                 }
2945                 q[ q_write++ % numstates] = ch_state;
2946             }
2947         }
2948     }
2949     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2950        when we fail in state 1, this allows us to use the
2951        charclass scan to find a valid start char. This is based on the principle
2952        that theres a good chance the string being searched contains lots of stuff
2953        that cant be a start char.
2954      */
2955     fail[ 0 ] = fail[ 1 ] = 0;
2956     DEBUG_TRIE_COMPILE_r({
2957         PerlIO_printf(Perl_debug_log,
2958                       "%*sStclass Failtable (%"UVuf" states): 0",
2959                       (int)(depth * 2), "", (UV)numstates
2960         );
2961         for( q_read=1; q_read<numstates; q_read++ ) {
2962             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2963         }
2964         PerlIO_printf(Perl_debug_log, "\n");
2965     });
2966     Safefree(q);
2967     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2968 }
2969
2970
2971 #define DEBUG_PEEP(str,scan,depth) \
2972     DEBUG_OPTIMISE_r({if (scan){ \
2973        SV * const mysv=sv_newmortal(); \
2974        regnode *Next = regnext(scan); \
2975        regprop(RExC_rx, mysv, scan); \
2976        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2977        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2978        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2979    }});
2980
2981
2982 /* The below joins as many adjacent EXACTish nodes as possible into a single
2983  * one.  The regop may be changed if the node(s) contain certain sequences that
2984  * require special handling.  The joining is only done if:
2985  * 1) there is room in the current conglomerated node to entirely contain the
2986  *    next one.
2987  * 2) they are the exact same node type
2988  *
2989  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2990  * these get optimized out
2991  *
2992  * If a node is to match under /i (folded), the number of characters it matches
2993  * can be different than its character length if it contains a multi-character
2994  * fold.  *min_subtract is set to the total delta of the input nodes.
2995  *
2996  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2997  * and contains LATIN SMALL LETTER SHARP S
2998  *
2999  * This is as good a place as any to discuss the design of handling these
3000  * multi-character fold sequences.  It's been wrong in Perl for a very long
3001  * time.  There are three code points in Unicode whose multi-character folds
3002  * were long ago discovered to mess things up.  The previous designs for
3003  * dealing with these involved assigning a special node for them.  This
3004  * approach doesn't work, as evidenced by this example:
3005  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3006  * Both these fold to "sss", but if the pattern is parsed to create a node that
3007  * would match just the \xDF, it won't be able to handle the case where a
3008  * successful match would have to cross the node's boundary.  The new approach
3009  * that hopefully generally solves the problem generates an EXACTFU_SS node
3010  * that is "sss".
3011  *
3012  * It turns out that there are problems with all multi-character folds, and not
3013  * just these three.  Now the code is general, for all such cases.  The
3014  * approach taken is:
3015  * 1)   This routine examines each EXACTFish node that could contain multi-
3016  *      character fold sequences.  It returns in *min_subtract how much to
3017  *      subtract from the the actual length of the string to get a real minimum
3018  *      match length; it is 0 if there are no multi-char folds.  This delta is
3019  *      used by the caller to adjust the min length of the match, and the delta
3020  *      between min and max, so that the optimizer doesn't reject these
3021  *      possibilities based on size constraints.
3022  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3023  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3024  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3025  *      there is a possible fold length change.  That means that a regular
3026  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3027  *      with length changes, and so can be processed faster.  regexec.c takes
3028  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3029  *      pre-folded by regcomp.c.  This saves effort in regex matching.
3030  *      However, the pre-folding isn't done for non-UTF8 patterns because the
3031  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3032  *      down by forcing the pattern into UTF8 unless necessary.  Also what
3033  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
3034  *      possibilities for the non-UTF8 patterns are quite simple, except for
3035  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3036  *      members of a fold-pair, and arrays are set up for all of them so that
3037  *      the other member of the pair can be found quickly.  Code elsewhere in
3038  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3039  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3040  *      described in the next item.
3041  * 3)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
3042  *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3043  *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
3044  *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
3045  *      character in the pattern corresponds to at most a single character in
3046  *      the target string.  (And I do mean character, and not byte here, unlike
3047  *      other parts of the documentation that have never been updated to
3048  *      account for multibyte Unicode.)  sharp s in EXACTF nodes can match the
3049  *      two character string 'ss'; in EXACTFA nodes it can match
3050  *      "\x{17F}\x{17F}".  These violate the assumption, and they are the only
3051  *      instances where it is violated.  I'm reluctant to try to change the
3052  *      assumption, as the code involved is impenetrable to me (khw), so
3053  *      instead the code here punts.  This routine examines (when the pattern
3054  *      isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3055  *      boolean indicating whether or not the node contains a sharp s.  When it
3056  *      is true, the caller sets a flag that later causes the optimizer in this
3057  *      file to not set values for the floating and fixed string lengths, and
3058  *      thus avoids the optimizer code in regexec.c that makes the invalid
3059  *      assumption.  Thus, there is no optimization based on string lengths for
3060  *      non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3061  *      (The reason the assumption is wrong only in these two cases is that all
3062  *      other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3063  *      other folds to their expanded versions.  We can't prefold sharp s to
3064  *      'ss' in EXACTF nodes because we don't know at compile time if it
3065  *      actually matches 'ss' or not.  It will match iff the target string is
3066  *      in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3067  *      EXACTFA and EXACTFL where it never does.  In an EXACTFA node in a UTF-8
3068  *      pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3069  *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3070  *      require the pattern to be forced into UTF-8, the overhead of which we
3071  *      want to avoid.)
3072  *
3073  *      Similarly, the code that generates tries doesn't currently handle
3074  *      not-already-folded multi-char folds, and it looks like a pain to change
3075  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3076  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3077  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3078  *      using /iaa matching will be doing so almost entirely with ASCII
3079  *      strings, so this should rarely be encountered in practice */
3080
3081 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
3082     if (PL_regkind[OP(scan)] == EXACT) \
3083         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
3084
3085 STATIC U32
3086 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
3087     /* Merge several consecutive EXACTish nodes into one. */
3088     regnode *n = regnext(scan);
3089     U32 stringok = 1;
3090     regnode *next = scan + NODE_SZ_STR(scan);
3091     U32 merged = 0;
3092     U32 stopnow = 0;
3093 #ifdef DEBUGGING
3094     regnode *stop = scan;
3095     GET_RE_DEBUG_FLAGS_DECL;
3096 #else
3097     PERL_UNUSED_ARG(depth);
3098 #endif
3099
3100     PERL_ARGS_ASSERT_JOIN_EXACT;
3101 #ifndef EXPERIMENTAL_INPLACESCAN
3102     PERL_UNUSED_ARG(flags);
3103     PERL_UNUSED_ARG(val);
3104 #endif
3105     DEBUG_PEEP("join",scan,depth);
3106
3107     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3108      * EXACT ones that are mergeable to the current one. */
3109     while (n
3110            && (PL_regkind[OP(n)] == NOTHING
3111                || (stringok && OP(n) == OP(scan)))
3112            && NEXT_OFF(n)
3113            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3114     {
3115
3116         if (OP(n) == TAIL || n > next)
3117             stringok = 0;
3118         if (PL_regkind[OP(n)] == NOTHING) {
3119             DEBUG_PEEP("skip:",n,depth);
3120             NEXT_OFF(scan) += NEXT_OFF(n);
3121             next = n + NODE_STEP_REGNODE;
3122 #ifdef DEBUGGING
3123             if (stringok)
3124                 stop = n;
3125 #endif
3126             n = regnext(n);
3127         }
3128         else if (stringok) {
3129             const unsigned int oldl = STR_LEN(scan);
3130             regnode * const nnext = regnext(n);
3131
3132             /* XXX I (khw) kind of doubt that this works on platforms where
3133              * U8_MAX is above 255 because of lots of other assumptions */
3134             /* Don't join if the sum can't fit into a single node */
3135             if (oldl + STR_LEN(n) > U8_MAX)
3136                 break;
3137
3138             DEBUG_PEEP("merg",n,depth);
3139             merged++;
3140
3141             NEXT_OFF(scan) += NEXT_OFF(n);
3142             STR_LEN(scan) += STR_LEN(n);
3143             next = n + NODE_SZ_STR(n);
3144             /* Now we can overwrite *n : */
3145             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3146 #ifdef DEBUGGING
3147             stop = next - 1;
3148 #endif
3149             n = nnext;
3150             if (stopnow) break;
3151         }
3152
3153 #ifdef EXPERIMENTAL_INPLACESCAN
3154         if (flags && !NEXT_OFF(n)) {
3155             DEBUG_PEEP("atch", val, depth);
3156             if (reg_off_by_arg[OP(n)]) {
3157                 ARG_SET(n, val - n);
3158             }
3159             else {
3160                 NEXT_OFF(n) = val - n;
3161             }
3162             stopnow = 1;
3163         }
3164 #endif
3165     }
3166
3167     *min_subtract = 0;
3168     *has_exactf_sharp_s = FALSE;
3169
3170     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3171      * can now analyze for sequences of problematic code points.  (Prior to
3172      * this final joining, sequences could have been split over boundaries, and
3173      * hence missed).  The sequences only happen in folding, hence for any
3174      * non-EXACT EXACTish node */
3175     if (OP(scan) != EXACT) {
3176         const U8 * const s0 = (U8*) STRING(scan);
3177         const U8 * s = s0;
3178         const U8 * const s_end = s0 + STR_LEN(scan);
3179
3180         /* One pass is made over the node's string looking for all the
3181          * possibilities.  to avoid some tests in the loop, there are two main
3182          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3183          * non-UTF-8 */
3184         if (UTF) {
3185
3186             /* Examine the string for a multi-character fold sequence.  UTF-8
3187              * patterns have all characters pre-folded by the time this code is
3188              * executed */
3189             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3190                                      length sequence we are looking for is 2 */
3191             {
3192                 int count = 0;
3193                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3194                 if (! len) {    /* Not a multi-char fold: get next char */
3195                     s += UTF8SKIP(s);
3196                     continue;
3197                 }
3198
3199                 /* Nodes with 'ss' require special handling, except for EXACTFL
3200                  * and EXACTFA-ish for which there is no multi-char fold to
3201                  * this */
3202                 if (len == 2 && *s == 's' && *(s+1) == 's'
3203                     && OP(scan) != EXACTFL
3204                     && OP(scan) != EXACTFA
3205                     && OP(scan) != EXACTFA_NO_TRIE)
3206                 {
3207                     count = 2;
3208                     OP(scan) = EXACTFU_SS;
3209                     s += 2;
3210                 }
3211                 else { /* Here is a generic multi-char fold. */
3212                     const U8* multi_end  = s + len;
3213
3214                     /* Count how many characters in it.  In the case of /l and
3215                      * /aa, no folds which contain ASCII code points are
3216                      * allowed, so check for those, and skip if found.  (In
3217                      * EXACTFL, no folds are allowed to any Latin1 code point,
3218                      * not just ASCII.  But there aren't any of these
3219                      * currently, nor ever likely, so don't take the time to
3220                      * test for them.  The code that generates the
3221                      * is_MULTI_foo() macros croaks should one actually get put
3222                      * into Unicode .) */
3223                     if (OP(scan) != EXACTFL
3224                         && OP(scan) != EXACTFA
3225                         && OP(scan) != EXACTFA_NO_TRIE)
3226                     {
3227                         count = utf8_length(s, multi_end);
3228                         s = multi_end;
3229                     }
3230                     else {
3231                         while (s < multi_end) {
3232                             if (isASCII(*s)) {
3233                                 s++;
3234                                 goto next_iteration;
3235                             }
3236                             else {
3237                                 s += UTF8SKIP(s);
3238                             }
3239                             count++;
3240                         }
3241                     }
3242                 }
3243
3244                 /* The delta is how long the sequence is minus 1 (1 is how long
3245                  * the character that folds to the sequence is) */
3246                 *min_subtract += count - 1;
3247             next_iteration: ;
3248             }
3249         }
3250         else if (OP(scan) == EXACTFA) {
3251
3252             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3253              * fold to the ASCII range (and there are no existing ones in the
3254              * upper latin1 range).  But, as outlined in the comments preceding
3255              * this function, we need to flag any occurrences of the sharp s.
3256              * This character forbids trie formation (because of added
3257              * complexity) */
3258             while (s < s_end) {
3259                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3260                     OP(scan) = EXACTFA_NO_TRIE;
3261                     *has_exactf_sharp_s = TRUE;
3262                     break;
3263                 }
3264                 s++;
3265                 continue;
3266             }
3267         }
3268         else if (OP(scan) != EXACTFL) {
3269
3270             /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node.  Look for the
3271              * multi-char folds that are all Latin1.  (This code knows that
3272              * there are no current multi-char folds possible with EXACTFL,
3273              * relying on fold_grind.t to catch any errors if the very unlikely
3274              * event happens that some get added in future Unicode versions.)
3275              * As explained in the comments preceding this function, we look
3276              * also for the sharp s in EXACTF nodes; it can be in the final
3277              * position.  Otherwise we can stop looking 1 byte earlier because
3278              * have to find at least two characters for a multi-fold */
3279             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3280
3281             while (s < upper) {
3282                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3283                 if (! len) {    /* Not a multi-char fold. */
3284                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3285                     {
3286                         *has_exactf_sharp_s = TRUE;
3287                     }
3288                     s++;
3289                     continue;
3290                 }
3291
3292                 if (len == 2
3293                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3294                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3295                 {
3296
3297                     /* EXACTF nodes need to know that the minimum length
3298                      * changed so that a sharp s in the string can match this
3299                      * ss in the pattern, but they remain EXACTF nodes, as they
3300                      * won't match this unless the target string is is UTF-8,
3301                      * which we don't know until runtime */
3302                     if (OP(scan) != EXACTF) {
3303                         OP(scan) = EXACTFU_SS;
3304                     }
3305                 }
3306
3307                 *min_subtract += len - 1;
3308                 s += len;
3309             }
3310         }
3311     }
3312
3313 #ifdef DEBUGGING
3314     /* Allow dumping but overwriting the collection of skipped
3315      * ops and/or strings with fake optimized ops */
3316     n = scan + NODE_SZ_STR(scan);
3317     while (n <= stop) {
3318         OP(n) = OPTIMIZED;
3319         FLAGS(n) = 0;
3320         NEXT_OFF(n) = 0;
3321         n++;
3322     }
3323 #endif
3324     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3325     return stopnow;
3326 }
3327
3328 /* REx optimizer.  Converts nodes into quicker variants "in place".
3329    Finds fixed substrings.  */
3330
3331 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3332    to the position after last scanned or to NULL. */
3333
3334 #define INIT_AND_WITHP \
3335     assert(!and_withp); \
3336     Newx(and_withp,1, regnode_ssc); \
3337     SAVEFREEPV(and_withp)
3338
3339 /* this is a chain of data about sub patterns we are processing that
3340    need to be handled separately/specially in study_chunk. Its so
3341    we can simulate recursion without losing state.  */
3342 struct scan_frame;
3343 typedef struct scan_frame {
3344     regnode *last;  /* last node to process in this frame */
3345     regnode *next;  /* next node to process when last is reached */
3346     struct scan_frame *prev; /*previous frame*/
3347     U32 prev_recursed_depth;
3348     I32 stop; /* what stopparen do we use */
3349 } scan_frame;
3350
3351
3352 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3353
3354 STATIC SSize_t
3355 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3356                         SSize_t *minlenp, SSize_t *deltap,
3357                         regnode *last,
3358                         scan_data_t *data,
3359                         I32 stopparen,
3360                         U32 recursed_depth,
3361                         regnode_ssc *and_withp,
3362                         U32 flags, U32 depth)
3363                         /* scanp: Start here (read-write). */
3364                         /* deltap: Write maxlen-minlen here. */
3365                         /* last: Stop before this one. */
3366                         /* data: string data about the pattern */
3367                         /* stopparen: treat close N as END */
3368                         /* recursed: which subroutines have we recursed into */
3369                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3370 {
3371     dVAR;
3372     /* There must be at least this number of characters to match */
3373     SSize_t min = 0;
3374     I32 pars = 0, code;
3375     regnode *scan = *scanp, *next;
3376     SSize_t delta = 0;
3377     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3378     int is_inf_internal = 0;            /* The studied chunk is infinite */
3379     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3380     scan_data_t data_fake;
3381     SV *re_trie_maxbuff = NULL;
3382     regnode *first_non_open = scan;
3383     SSize_t stopmin = SSize_t_MAX;
3384     scan_frame *frame = NULL;
3385     GET_RE_DEBUG_FLAGS_DECL;
3386
3387     PERL_ARGS_ASSERT_STUDY_CHUNK;
3388
3389 #ifdef DEBUGGING
3390     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3391 #endif
3392     if ( depth == 0 ) {
3393         while (first_non_open && OP(first_non_open) == OPEN)
3394             first_non_open=regnext(first_non_open);
3395     }
3396
3397
3398   fake_study_recurse:
3399     while ( scan && OP(scan) != END && scan < last ){
3400         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3401                                    node length to get a real minimum (because
3402                                    the folded version may be shorter) */
3403         bool has_exactf_sharp_s = FALSE;
3404         /* Peephole optimizer: */
3405         DEBUG_OPTIMISE_MORE_r(
3406         {
3407             PerlIO_printf(Perl_debug_log,"%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3408                           ((int) depth*2), "", (long)stopparen,
3409                           (unsigned long)depth, (unsigned long)recursed_depth);
3410             if (recursed_depth) {
3411                 U32 i;
3412                 U32 j;
3413                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3414                     PerlIO_printf(Perl_debug_log,"[");
3415                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3416                         PerlIO_printf(Perl_debug_log,"%d",
3417                             PAREN_TEST(RExC_study_chunk_recursed +
3418                                        (j * RExC_study_chunk_recursed_bytes), i)
3419                             ? 1 : 0
3420                         );
3421                     PerlIO_printf(Perl_debug_log,"]");
3422                 }
3423             }
3424             PerlIO_printf(Perl_debug_log,"\n");
3425         }
3426         );
3427         DEBUG_STUDYDATA("Peep:", data, depth);
3428         DEBUG_PEEP("Peep", scan, depth);
3429
3430
3431         /* Its not clear to khw or hv why this is done here, and not in the
3432          * clauses that deal with EXACT nodes.  khw's guess is that it's
3433          * because of a previous design */
3434         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3435
3436         /* Follow the next-chain of the current node and optimize
3437            away all the NOTHINGs from it.  */
3438         if (OP(scan) != CURLYX) {
3439             const int max = (reg_off_by_arg[OP(scan)]
3440                        ? I32_MAX
3441                        /* I32 may be smaller than U16 on CRAYs! */
3442                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3443             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3444             int noff;
3445             regnode *n = scan;
3446
3447             /* Skip NOTHING and LONGJMP. */
3448             while ((n = regnext(n))
3449                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3450                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3451                    && off + noff < max)
3452                 off += noff;
3453             if (reg_off_by_arg[OP(scan)])
3454                 ARG(scan) = off;
3455             else
3456                 NEXT_OFF(scan) = off;
3457         }
3458
3459
3460
3461         /* The principal pseudo-switch.  Cannot be a switch, since we
3462            look into several different things.  */
3463         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3464                    || OP(scan) == IFTHEN) {
3465             next = regnext(scan);
3466             code = OP(scan);
3467             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3468
3469             if (OP(next) == code || code == IFTHEN) {
3470                 /* NOTE - There is similar code to this block below for
3471                  * handling TRIE nodes on a re-study.  If you change stuff here
3472                  * check there too. */
3473                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3474                 regnode_ssc accum;
3475                 regnode * const startbranch=scan;
3476
3477                 if (flags & SCF_DO_SUBSTR)
3478                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3479                 if (flags & SCF_DO_STCLASS)
3480                     ssc_init_zero(pRExC_state, &accum);
3481
3482                 while (OP(scan) == code) {
3483                     SSize_t deltanext, minnext, fake;
3484                     I32 f = 0;
3485                     regnode_ssc this_class;
3486
3487                     num++;
3488                     data_fake.flags = 0;
3489                     if (data) {
3490                         data_fake.whilem_c = data->whilem_c;
3491                         data_fake.last_closep = data->last_closep;
3492                     }
3493                     else
3494                         data_fake.last_closep = &fake;
3495
3496                     data_fake.pos_delta = delta;
3497                     next = regnext(scan);
3498                     scan = NEXTOPER(scan);
3499                     if (code != BRANCH)
3500                         scan = NEXTOPER(scan);
3501                     if (flags & SCF_DO_STCLASS) {
3502                         ssc_init(pRExC_state, &this_class);
3503                         data_fake.start_class = &this_class;
3504                         f = SCF_DO_STCLASS_AND;
3505                     }
3506                     if (flags & SCF_WHILEM_VISITED_POS)
3507                         f |= SCF_WHILEM_VISITED_POS;
3508
3509                     /* we suppose the run is continuous, last=next...*/
3510                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3511                                           next, &data_fake,
3512                                           stopparen, recursed_depth, NULL, f,depth+1);
3513                     if (min1 > minnext)
3514                         min1 = minnext;
3515                     if (deltanext == SSize_t_MAX) {
3516                         is_inf = is_inf_internal = 1;
3517                         max1 = SSize_t_MAX;
3518                     } else if (max1 < minnext + deltanext)
3519                         max1 = minnext + deltanext;
3520                     scan = next;
3521                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3522                         pars++;
3523                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3524                         if ( stopmin > minnext)
3525                             stopmin = min + min1;
3526                         flags &= ~SCF_DO_SUBSTR;
3527                         if (data)
3528                             data->flags |= SCF_SEEN_ACCEPT;
3529                     }
3530                     if (data) {
3531                         if (data_fake.flags & SF_HAS_EVAL)
3532                             data->flags |= SF_HAS_EVAL;
3533                         data->whilem_c = data_fake.whilem_c;
3534                     }
3535                     if (flags & SCF_DO_STCLASS)
3536                         ssc_or(pRExC_state, &accum, &this_class);
3537                 }
3538                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3539                     min1 = 0;
3540                 if (flags & SCF_DO_SUBSTR) {
3541                     data->pos_min += min1;
3542                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3543                         data->pos_delta = SSize_t_MAX;
3544                     else
3545                         data->pos_delta += max1 - min1;
3546                     if (max1 != min1 || is_inf)
3547                         data->longest = &(data->longest_float);
3548                 }
3549                 min += min1;
3550                 if (delta == SSize_t_MAX
3551                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3552                     delta = SSize_t_MAX;
3553                 else
3554                     delta += max1 - min1;
3555                 if (flags & SCF_DO_STCLASS_OR) {
3556                     ssc_or(pRExC_state, data->start_class, &accum);
3557                     if (min1) {
3558                         ssc_and(pRExC_state, data->start_class, and_withp);
3559                         flags &= ~SCF_DO_STCLASS;
3560                     }
3561                 }
3562                 else if (flags & SCF_DO_STCLASS_AND) {
3563                     if (min1) {
3564                         ssc_and(pRExC_state, data->start_class, &accum);
3565                         flags &= ~SCF_DO_STCLASS;
3566                     }
3567                     else {
3568                         /* Switch to OR mode: cache the old value of
3569                          * data->start_class */
3570                         INIT_AND_WITHP;
3571                         StructCopy(data->start_class, and_withp, regnode_ssc);
3572                         flags &= ~SCF_DO_STCLASS_AND;
3573                         StructCopy(&accum, data->start_class, regnode_ssc);
3574                         flags |= SCF_DO_STCLASS_OR;
3575                     }
3576                 }
3577
3578                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3579                 /* demq.
3580
3581                    Assuming this was/is a branch we are dealing with: 'scan'
3582                    now points at the item that follows the branch sequence,
3583                    whatever it is. We now start at the beginning of the
3584                    sequence and look for subsequences of
3585
3586                    BRANCH->EXACT=>x1
3587                    BRANCH->EXACT=>x2
3588                    tail
3589
3590                    which would be constructed from a pattern like
3591                    /A|LIST|OF|WORDS/
3592
3593                    If we can find such a subsequence we need to turn the first
3594                    element into a trie and then add the subsequent branch exact
3595                    strings to the trie.
3596
3597                    We have two cases
3598
3599                      1. patterns where the whole set of branches can be
3600                         converted.
3601
3602                      2. patterns where only a subset can be converted.
3603
3604                    In case 1 we can replace the whole set with a single regop
3605                    for the trie. In case 2 we need to keep the start and end
3606                    branches so
3607
3608                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3609                      becomes BRANCH TRIE; BRANCH X;
3610
3611                   There is an additional case, that being where there is a
3612                   common prefix, which gets split out into an EXACT like node
3613                   preceding the TRIE node.
3614
3615                   If x(1..n)==tail then we can do a simple trie, if not we make
3616                   a "jump" trie, such that when we match the appropriate word
3617                   we "jump" to the appropriate tail node. Essentially we turn
3618                   a nested if into a case structure of sorts.
3619
3620                 */
3621
3622                     int made=0;
3623                     if (!re_trie_maxbuff) {
3624                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3625                         if (!SvIOK(re_trie_maxbuff))
3626                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3627                     }
3628                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3629                         regnode *cur;
3630                         regnode *first = (regnode *)NULL;
3631                         regnode *last = (regnode *)NULL;
3632                         regnode *tail = scan;
3633                         U8 trietype = 0;
3634                         U32 count=0;
3635
3636 #ifdef DEBUGGING
3637                         SV * const mysv = sv_newmortal();       /* for dumping */
3638 #endif
3639                         /* var tail is used because there may be a TAIL
3640                            regop in the way. Ie, the exacts will point to the
3641                            thing following the TAIL, but the last branch will
3642                            point at the TAIL. So we advance tail. If we
3643                            have nested (?:) we may have to move through several
3644                            tails.
3645                          */
3646
3647                         while ( OP( tail ) == TAIL ) {
3648                             /* this is the TAIL generated by (?:) */
3649                             tail = regnext( tail );
3650                         }
3651
3652
3653                         DEBUG_TRIE_COMPILE_r({
3654                             regprop(RExC_rx, mysv, tail );
3655                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3656                                 (int)depth * 2 + 2, "",
3657                                 "Looking for TRIE'able sequences. Tail node is: ",
3658                                 SvPV_nolen_const( mysv )
3659                             );
3660                         });
3661
3662                         /*
3663
3664                             Step through the branches
3665                                 cur represents each branch,
3666                                 noper is the first thing to be matched as part
3667                                       of that branch
3668                                 noper_next is the regnext() of that node.
3669
3670                             We normally handle a case like this
3671                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3672                             support building with NOJUMPTRIE, which restricts
3673                             the trie logic to structures like /FOO|BAR/.
3674
3675                             If noper is a trieable nodetype then the branch is
3676                             a possible optimization target. If we are building
3677                             under NOJUMPTRIE then we require that noper_next is
3678                             the same as scan (our current position in the regex
3679                             program).
3680
3681                             Once we have two or more consecutive such branches
3682                             we can create a trie of the EXACT's contents and
3683                             stitch it in place into the program.
3684
3685                             If the sequence represents all of the branches in
3686                             the alternation we replace the entire thing with a
3687                             single TRIE node.
3688
3689                             Otherwise when it is a subsequence we need to
3690                             stitch it in place and replace only the relevant
3691                             branches. This means the first branch has to remain
3692                             as it is used by the alternation logic, and its
3693                             next pointer, and needs to be repointed at the item
3694                             on the branch chain following the last branch we
3695                             have optimized away.
3696
3697                             This could be either a BRANCH, in which case the
3698                             subsequence is internal, or it could be the item
3699                             following the branch sequence in which case the
3700                             subsequence is at the end (which does not
3701                             necessarily mean the first node is the start of the
3702                             alternation).
3703
3704                             TRIE_TYPE(X) is a define which maps the optype to a
3705                             trietype.
3706
3707                                 optype          |  trietype
3708                                 ----------------+-----------
3709                                 NOTHING         | NOTHING
3710                                 EXACT           | EXACT
3711                                 EXACTFU         | EXACTFU
3712                                 EXACTFU_SS      | EXACTFU
3713                                 EXACTFA         | EXACTFA
3714
3715
3716                         */
3717 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3718                        ( EXACT == (X) )   ? EXACT :        \
3719                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3720                        ( EXACTFA == (X) ) ? EXACTFA :        \
3721                        0 )
3722
3723                         /* dont use tail as the end marker for this traverse */
3724                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3725                             regnode * const noper = NEXTOPER( cur );
3726                             U8 noper_type = OP( noper );
3727                             U8 noper_trietype = TRIE_TYPE( noper_type );
3728 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3729                             regnode * const noper_next = regnext( noper );
3730                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3731                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3732 #endif
3733
3734                             DEBUG_TRIE_COMPILE_r({
3735                                 regprop(RExC_rx, mysv, cur);
3736                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3737                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3738
3739                                 regprop(RExC_rx, mysv, noper);
3740                                 PerlIO_printf( Perl_debug_log, " -> %s",
3741                                     SvPV_nolen_const(mysv));
3742
3743                                 if ( noper_next ) {
3744                                   regprop(RExC_rx, mysv, noper_next );
3745                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3746                                     SvPV_nolen_const(mysv));
3747                                 }
3748                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3749                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3750                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3751                                 );
3752                             });
3753
3754                             /* Is noper a trieable nodetype that can be merged
3755                              * with the current trie (if there is one)? */
3756                             if ( noper_trietype
3757                                   &&
3758                                   (
3759                                         ( noper_trietype == NOTHING)
3760                                         || ( trietype == NOTHING )
3761                                         || ( trietype == noper_trietype )
3762                                   )
3763 #ifdef NOJUMPTRIE
3764                                   && noper_next == tail
3765 #endif
3766                                   && count < U16_MAX)
3767                             {
3768                                 /* Handle mergable triable node Either we are
3769                                  * the first node in a new trieable sequence,
3770                                  * in which case we do some bookkeeping,
3771                                  * otherwise we update the end pointer. */
3772                                 if ( !first ) {
3773                                     first = cur;
3774                                     if ( noper_trietype == NOTHING ) {
3775 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3776                                         regnode * const noper_next = regnext( noper );
3777                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3778                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3779 #endif
3780
3781                                         if ( noper_next_trietype ) {
3782                                             trietype = noper_next_trietype;
3783                                         } else if (noper_next_type)  {
3784                                             /* a NOTHING regop is 1 regop wide.
3785                                              * We need at least two for a trie
3786                                              * so we can't merge this in */
3787                                             first = NULL;
3788                                         }
3789                                     } else {
3790                                         trietype = noper_trietype;
3791                                     }
3792                                 } else {
3793                                     if ( trietype == NOTHING )
3794                                         trietype = noper_trietype;
3795                                     last = cur;
3796                                 }
3797                                 if (first)
3798                                     count++;
3799                             } /* end handle mergable triable node */
3800                             else {
3801                                 /* handle unmergable node -
3802                                  * noper may either be a triable node which can
3803                                  * not be tried together with the current trie,
3804                                  * or a non triable node */
3805                                 if ( last ) {
3806                                     /* If last is set and trietype is not
3807                                      * NOTHING then we have found at least two
3808                                      * triable branch sequences in a row of a
3809                                      * similar trietype so we can turn them
3810                                      * into a trie. If/when we allow NOTHING to
3811                                      * start a trie sequence this condition
3812                                      * will be required, and it isn't expensive
3813                                      * so we leave it in for now. */
3814                                     if ( trietype && trietype != NOTHING )
3815                                         make_trie( pRExC_state,
3816                                                 startbranch, first, cur, tail, count,
3817                                                 trietype, depth+1 );
3818                                     last = NULL; /* note: we clear/update
3819                                                     first, trietype etc below,
3820                                                     so we dont do it here */
3821                                 }
3822                                 if ( noper_trietype
3823 #ifdef NOJUMPTRIE
3824                                      && noper_next == tail
3825 #endif
3826                                 ){
3827                                     /* noper is triable, so we can start a new
3828                                      * trie sequence */
3829                                     count = 1;
3830                                     first = cur;
3831                                     trietype = noper_trietype;
3832                                 } else if (first) {
3833                                     /* if we already saw a first but the
3834                                      * current node is not triable then we have
3835                                      * to reset the first information. */
3836                                     count = 0;
3837                                     first = NULL;
3838                                     trietype = 0;
3839                                 }
3840                             } /* end handle unmergable node */
3841                         } /* loop over branches */
3842                         DEBUG_TRIE_COMPILE_r({
3843                             regprop(RExC_rx, mysv, cur);
3844                             PerlIO_printf( Perl_debug_log,
3845                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3846                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3847
3848                         });
3849                         if ( last && trietype ) {
3850                             if ( trietype != NOTHING ) {
3851                                 /* the last branch of the sequence was part of
3852                                  * a trie, so we have to construct it here
3853                                  * outside of the loop */
3854                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3855 #ifdef TRIE_STUDY_OPT
3856                                 if ( ((made == MADE_EXACT_TRIE &&
3857                                      startbranch == first)
3858                                      || ( first_non_open == first )) &&
3859                                      depth==0 ) {
3860                                     flags |= SCF_TRIE_RESTUDY;
3861                                     if ( startbranch == first
3862                                          && scan == tail )
3863                                     {
3864                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3865                                     }
3866                                 }
3867 #endif
3868                             } else {
3869                                 /* at this point we know whatever we have is a
3870                                  * NOTHING sequence/branch AND if 'startbranch'
3871                                  * is 'first' then we can turn the whole thing
3872                                  * into a NOTHING
3873                                  */
3874                                 if ( startbranch == first ) {
3875                                     regnode *opt;
3876                                     /* the entire thing is a NOTHING sequence,
3877                                      * something like this: (?:|) So we can
3878                                      * turn it into a plain NOTHING op. */
3879                                     DEBUG_TRIE_COMPILE_r({
3880                                         regprop(RExC_rx, mysv, cur);
3881                                         PerlIO_printf( Perl_debug_log,
3882                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3883                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3884
3885                                     });
3886                                     OP(startbranch)= NOTHING;
3887                                     NEXT_OFF(startbranch)= tail - startbranch;
3888                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3889                                         OP(opt)= OPTIMIZED;
3890                                 }
3891                             }
3892                         } /* end if ( last) */
3893                     } /* TRIE_MAXBUF is non zero */
3894
3895                 } /* do trie */
3896
3897             }
3898             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3899                 scan = NEXTOPER(NEXTOPER(scan));
3900             } else                      /* single branch is optimized. */
3901                 scan = NEXTOPER(scan);
3902             continue;
3903         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3904             scan_frame *newframe = NULL;
3905             I32 paren;
3906             regnode *start;
3907             regnode *end;
3908             U32 my_recursed_depth= recursed_depth;
3909
3910             if (OP(scan) != SUSPEND) {
3911                 /* set the pointer */
3912                 if (OP(scan) == GOSUB) {
3913                     paren = ARG(scan);
3914                     RExC_recurse[ARG2L(scan)] = scan;
3915                     start = RExC_open_parens[paren-1];
3916                     end   = RExC_close_parens[paren-1];
3917                 } else {
3918                     paren = 0;
3919                     start = RExC_rxi->program + 1;
3920                     end   = RExC_opend;
3921                 }
3922                 if (!recursed_depth
3923                     ||
3924                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
3925                 ) {
3926                     if (!recursed_depth) {
3927                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
3928                     } else {
3929                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
3930                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
3931                              RExC_study_chunk_recursed_bytes, U8);
3932                     }
3933                     /* we havent recursed into this paren yet, so recurse into it */
3934                     DEBUG_STUDYDATA("set:", data,depth);
3935                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
3936                     my_recursed_depth= recursed_depth + 1;
3937                     Newx(newframe,1,scan_frame);
3938                 } else {
3939                     DEBUG_STUDYDATA("inf:", data,depth);
3940                     /* some form of infinite recursion, assume infinite length */
3941                     if (flags & SCF_DO_SUBSTR) {
3942                         SCAN_COMMIT(pRExC_state,data,minlenp);
3943                         data->longest = &(data->longest_float);
3944                     }
3945                     is_inf = is_inf_internal = 1;
3946                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3947                         ssc_anything(data->start_class);
3948                     flags &= ~SCF_DO_STCLASS;
3949                 }
3950             } else {
3951                 Newx(newframe,1,scan_frame);
3952                 paren = stopparen;
3953                 start = scan+2;
3954                 end = regnext(scan);
3955             }
3956             if (newframe) {
3957                 assert(start);
3958                 assert(end);
3959                 SAVEFREEPV(newframe);
3960                 newframe->next = regnext(scan);
3961                 newframe->last = last;
3962                 newframe->stop = stopparen;
3963                 newframe->prev = frame;
3964                 newframe->prev_recursed_depth = recursed_depth;
3965
3966                 DEBUG_STUDYDATA("frame-new:",data,depth);
3967                 DEBUG_PEEP("fnew", scan, depth);
3968
3969                 frame = newframe;
3970                 scan =  start;
3971                 stopparen = paren;
3972                 last = end;
3973                 depth = depth + 1;
3974                 recursed_depth= my_recursed_depth;
3975
3976                 continue;
3977             }
3978         }
3979         else if (OP(scan) == EXACT) {
3980             SSize_t l = STR_LEN(scan);
3981             UV uc;
3982             if (UTF) {
3983                 const U8 * const s = (U8*)STRING(scan);
3984                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3985                 l = utf8_length(s, s + l);
3986             } else {
3987                 uc = *((U8*)STRING(scan));
3988             }
3989             min += l;
3990             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3991                 /* The code below prefers earlier match for fixed
3992                    offset, later match for variable offset.  */
3993                 if (data->last_end == -1) { /* Update the start info. */
3994                     data->last_start_min = data->pos_min;
3995                     data->last_start_max = is_inf
3996                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
3997                 }
3998                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3999                 if (UTF)
4000                     SvUTF8_on(data->last_found);
4001                 {
4002                     SV * const sv = data->last_found;
4003                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4004                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4005                     if (mg && mg->mg_len >= 0)
4006                         mg->mg_len += utf8_length((U8*)STRING(scan),
4007                                                   (U8*)STRING(scan)+STR_LEN(scan));
4008                 }
4009                 data->last_end = data->pos_min + l;
4010                 data->pos_min += l; /* As in the first entry. */
4011                 data->flags &= ~SF_BEFORE_EOL;
4012             }
4013
4014             /* ANDing the code point leaves at most it, and not in locale, and
4015              * can't match null string */
4016             if (flags & SCF_DO_STCLASS_AND) {
4017                 ssc_cp_and(data->start_class, uc);
4018                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4019                 ssc_clear_locale(data->start_class);
4020             }
4021             else if (flags & SCF_DO_STCLASS_OR) {
4022                 ssc_add_cp(data->start_class, uc);
4023                 ssc_and(pRExC_state, data->start_class, and_withp);
4024             }
4025             flags &= ~SCF_DO_STCLASS;
4026         }
4027         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4028             SSize_t l = STR_LEN(scan);
4029             UV uc = *((U8*)STRING(scan));
4030             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4031                                                      separate code points */
4032
4033             /* Search for fixed substrings supports EXACT only. */
4034             if (flags & SCF_DO_SUBSTR) {
4035                 assert(data);
4036                 SCAN_COMMIT(pRExC_state, data, minlenp);
4037             }
4038             if (UTF) {
4039                 const U8 * const s = (U8 *)STRING(scan);
4040                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4041                 l = utf8_length(s, s + l);
4042             }
4043             if (has_exactf_sharp_s) {
4044                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
4045             }
4046             min += l - min_subtract;
4047             assert (min >= 0);
4048             delta += min_subtract;
4049             if (flags & SCF_DO_SUBSTR) {
4050                 data->pos_min += l - min_subtract;
4051                 if (data->pos_min < 0) {
4052                     data->pos_min = 0;
4053                 }
4054                 data->pos_delta += min_subtract;
4055                 if (min_subtract) {
4056                     data->longest = &(data->longest_float);
4057                 }
4058             }
4059             if (OP(scan) == EXACTFL) {
4060                 if (flags & SCF_DO_STCLASS_AND) {
4061                     ssc_flags_and(data->start_class,
4062                                                 ANYOF_LOCALE|ANYOF_LOC_FOLD);
4063                 }
4064                 else if (flags & SCF_DO_STCLASS_OR) {
4065                     ANYOF_FLAGS(data->start_class)
4066                                                 |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
4067                 }
4068
4069                 /* We don't know what the folds are; it could be anything. XXX
4070                  * Actually, we only support UTF-8 encoding for code points
4071                  * above Latin1, so we could know what those folds are. */
4072                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4073                                                        0,
4074                                                        UV_MAX);
4075             }
4076             else {  /* Non-locale EXACTFish */
4077                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4078                 if (flags & SCF_DO_STCLASS_AND) {
4079                     ssc_clear_locale(data->start_class);
4080                 }
4081                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4082                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4083                                                        know if anything folds
4084                                                        with this */
4085                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4086                                                            PL_fold_latin1[uc]);
4087                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4088                                                       legal under /iaa */
4089                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4090                                 EXACTF_invlist
4091                                     = add_cp_to_invlist(EXACTF_invlist,
4092                                                 LATIN_SMALL_LETTER_SHARP_S);
4093                             }
4094                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4095                                 EXACTF_invlist
4096                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4097                                 EXACTF_invlist
4098                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4099                             }
4100                         }
4101
4102                         /* We also know if there are above-Latin1 code points
4103                          * that fold to this (none legal for ASCII and /iaa) */
4104                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4105                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4106                         {
4107                             /* XXX We could know exactly what does fold to this
4108                              * if the reverse folds are loaded, as currently in
4109                              * S_regclass() */
4110                             _invlist_union(EXACTF_invlist,
4111                                            PL_AboveLatin1,
4112                                            &EXACTF_invlist);
4113                         }
4114                     }
4115                 }
4116                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4117                            know what participates in folds with this, so have
4118                            to assume anything could */
4119
4120                     /* XXX We could know exactly what does fold to this if the
4121                      * reverse folds are loaded, as currently in S_regclass().
4122                      * But we do know that under /iaa nothing in the ASCII
4123                      * range can participate */
4124                     if (OP(scan) == EXACTFA) {
4125                         _invlist_union_complement_2nd(EXACTF_invlist,
4126                                                       PL_Posix_ptrs[_CC_ASCII],
4127                                                       &EXACTF_invlist);
4128                     }
4129                     else {
4130                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4131                                                                0, UV_MAX);
4132                     }
4133                 }
4134             }
4135             if (flags & SCF_DO_STCLASS_AND) {
4136                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4137                 ANYOF_POSIXL_ZERO(data->start_class);
4138                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4139             }
4140             else if (flags & SCF_DO_STCLASS_OR) {
4141                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4142                 ssc_and(pRExC_state, data->start_class, and_withp);
4143             }
4144             flags &= ~SCF_DO_STCLASS;
4145             SvREFCNT_dec(EXACTF_invlist);
4146         }
4147         else if (REGNODE_VARIES(OP(scan))) {
4148             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4149             I32 fl = 0, f = flags;
4150             regnode * const oscan = scan;
4151             regnode_ssc this_class;
4152             regnode_ssc *oclass = NULL;
4153             I32 next_is_eval = 0;
4154
4155             switch (PL_regkind[OP(scan)]) {
4156             case WHILEM:                /* End of (?:...)* . */
4157                 scan = NEXTOPER(scan);
4158                 goto finish;
4159             case PLUS:
4160                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4161                     next = NEXTOPER(scan);
4162                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4163                         mincount = 1;
4164                         maxcount = REG_INFTY;
4165                         next = regnext(scan);
4166                         scan = NEXTOPER(scan);
4167                         goto do_curly;
4168                     }
4169                 }
4170                 if (flags & SCF_DO_SUBSTR)
4171                     data->pos_min++;
4172                 min++;
4173                 /* Fall through. */
4174             case STAR:
4175                 if (flags & SCF_DO_STCLASS) {
4176                     mincount = 0;
4177                     maxcount = REG_INFTY;
4178                     next = regnext(scan);
4179                     scan = NEXTOPER(scan);
4180                     goto do_curly;
4181                 }
4182                 is_inf = is_inf_internal = 1;
4183                 scan = regnext(scan);
4184                 if (flags & SCF_DO_SUBSTR) {
4185                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
4186                     data->longest = &(data->longest_float);
4187                 }
4188                 goto optimize_curly_tail;
4189             case CURLY:
4190                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4191                     && (scan->flags == stopparen))
4192                 {
4193                     mincount = 1;
4194                     maxcount = 1;
4195                 } else {
4196                     mincount = ARG1(scan);
4197                     maxcount = ARG2(scan);
4198                 }
4199                 next = regnext(scan);
4200                 if (OP(scan) == CURLYX) {
4201                     I32 lp = (data ? *(data->last_closep) : 0);
4202                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4203                 }
4204                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4205                 next_is_eval = (OP(scan) == EVAL);
4206               do_curly:
4207                 if (flags & SCF_DO_SUBSTR) {
4208                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
4209                     pos_before = data->pos_min;
4210                 }
4211                 if (data) {
4212                     fl = data->flags;
4213                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4214                     if (is_inf)
4215                         data->flags |= SF_IS_INF;
4216                 }
4217                 if (flags & SCF_DO_STCLASS) {
4218                     ssc_init(pRExC_state, &this_class);
4219                     oclass = data->start_class;
4220                     data->start_class = &this_class;
4221                     f |= SCF_DO_STCLASS_AND;
4222                     f &= ~SCF_DO_STCLASS_OR;
4223                 }
4224                 /* Exclude from super-linear cache processing any {n,m}
4225                    regops for which the combination of input pos and regex
4226                    pos is not enough information to determine if a match
4227                    will be possible.
4228
4229                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4230                    regex pos at the \s*, the prospects for a match depend not
4231                    only on the input position but also on how many (bar\s*)
4232                    repeats into the {4,8} we are. */
4233                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4234                     f &= ~SCF_WHILEM_VISITED_POS;
4235
4236                 /* This will finish on WHILEM, setting scan, or on NULL: */
4237                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4238                                       last, data, stopparen, recursed_depth, NULL,
4239                                       (mincount == 0
4240                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
4241
4242                 if (flags & SCF_DO_STCLASS)
4243                     data->start_class = oclass;
4244                 if (mincount == 0 || minnext == 0) {
4245                     if (flags & SCF_DO_STCLASS_OR) {
4246                         ssc_or(pRExC_state, data->start_class, &this_class);
4247                     }
4248                     else if (flags & SCF_DO_STCLASS_AND) {
4249                         /* Switch to OR mode: cache the old value of
4250                          * data->start_class */
4251                         INIT_AND_WITHP;
4252                         StructCopy(data->start_class, and_withp, regnode_ssc);
4253                         flags &= ~SCF_DO_STCLASS_AND;
4254                         StructCopy(&this_class, data->start_class, regnode_ssc);
4255                         flags |= SCF_DO_STCLASS_OR;
4256                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4257                     }
4258                 } else {                /* Non-zero len */
4259                     if (flags & SCF_DO_STCLASS_OR) {
4260                         ssc_or(pRExC_state, data->start_class, &this_class);
4261                         ssc_and(pRExC_state, data->start_class, and_withp);
4262                     }
4263                     else if (flags & SCF_DO_STCLASS_AND)
4264                         ssc_and(pRExC_state, data->start_class, &this_class);
4265                     flags &= ~SCF_DO_STCLASS;
4266                 }
4267                 if (!scan)              /* It was not CURLYX, but CURLY. */
4268                     scan = next;
4269                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4270                     /* ? quantifier ok, except for (?{ ... }) */
4271                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4272                     && (minnext == 0) && (deltanext == 0)
4273                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4274                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
4275                 {
4276                     /* Fatal warnings may leak the regexp without this: */
4277                     SAVEFREESV(RExC_rx_sv);
4278                     ckWARNreg(RExC_parse,
4279                               "Quantifier unexpected on zero-length expression");
4280                     (void)ReREFCNT_inc(RExC_rx_sv);
4281                 }
4282
4283                 min += minnext * mincount;
4284                 is_inf_internal |= deltanext == SSize_t_MAX
4285                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
4286                 is_inf |= is_inf_internal;
4287                 if (is_inf)
4288                     delta = SSize_t_MAX;
4289                 else
4290                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
4291
4292                 /* Try powerful optimization CURLYX => CURLYN. */
4293                 if (  OP(oscan) == CURLYX && data
4294                       && data->flags & SF_IN_PAR
4295                       && !(data->flags & SF_HAS_EVAL)
4296                       && !deltanext && minnext == 1 ) {
4297                     /* Try to optimize to CURLYN.  */
4298                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4299                     regnode * const nxt1 = nxt;
4300 #ifdef DEBUGGING
4301                     regnode *nxt2;
4302 #endif
4303
4304                     /* Skip open. */
4305                     nxt = regnext(nxt);
4306                     if (!REGNODE_SIMPLE(OP(nxt))
4307                         && !(PL_regkind[OP(nxt)] == EXACT
4308                              && STR_LEN(nxt) == 1))
4309                         goto nogo;
4310 #ifdef DEBUGGING
4311                     nxt2 = nxt;
4312 #endif
4313                     nxt = regnext(nxt);
4314                     if (OP(nxt) != CLOSE)
4315                         goto nogo;
4316                     if (RExC_open_parens) {
4317                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4318                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4319                     }
4320                     /* Now we know that nxt2 is the only contents: */
4321                     oscan->flags = (U8)ARG(nxt);
4322                     OP(oscan) = CURLYN;
4323                     OP(nxt1) = NOTHING; /* was OPEN. */
4324
4325 #ifdef DEBUGGING
4326                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4327                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4328                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4329                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4330                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4331                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4332 #endif
4333                 }
4334               nogo:
4335
4336                 /* Try optimization CURLYX => CURLYM. */
4337                 if (  OP(oscan) == CURLYX && data
4338                       && !(data->flags & SF_HAS_PAR)
4339                       && !(data->flags & SF_HAS_EVAL)
4340                       && !deltanext     /* atom is fixed width */
4341                       && minnext != 0   /* CURLYM can't handle zero width */
4342                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4343                 ) {
4344                     /* XXXX How to optimize if data == 0? */
4345                     /* Optimize to a simpler form.  */
4346                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4347                     regnode *nxt2;
4348
4349                     OP(oscan) = CURLYM;
4350                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4351                             && (OP(nxt2) != WHILEM))
4352                         nxt = nxt2;
4353                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4354                     /* Need to optimize away parenths. */
4355                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4356                         /* Set the parenth number.  */
4357                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4358
4359                         oscan->flags = (U8)ARG(nxt);
4360                         if (RExC_open_parens) {
4361                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4362                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4363                         }
4364                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4365                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4366
4367 #ifdef DEBUGGING
4368                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4369                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4370                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4371                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4372 #endif
4373 #if 0
4374                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4375                             regnode *nnxt = regnext(nxt1);
4376                             if (nnxt == nxt) {
4377                                 if (reg_off_by_arg[OP(nxt1)])
4378                                     ARG_SET(nxt1, nxt2 - nxt1);
4379                                 else if (nxt2 - nxt1 < U16_MAX)
4380                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4381                                 else
4382                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4383                             }
4384                             nxt1 = nnxt;
4385                         }
4386 #endif
4387                         /* Optimize again: */
4388                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4389                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4390                     }
4391                     else
4392                         oscan->flags = 0;
4393                 }
4394                 else if ((OP(oscan) == CURLYX)
4395                          && (flags & SCF_WHILEM_VISITED_POS)
4396                          /* See the comment on a similar expression above.
4397                             However, this time it's not a subexpression
4398                             we care about, but the expression itself. */
4399                          && (maxcount == REG_INFTY)
4400                          && data && ++data->whilem_c < 16) {
4401                     /* This stays as CURLYX, we can put the count/of pair. */
4402                     /* Find WHILEM (as in regexec.c) */
4403                     regnode *nxt = oscan + NEXT_OFF(oscan);
4404
4405                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4406                         nxt += ARG(nxt);
4407                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4408                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4409                 }
4410                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4411                     pars++;
4412                 if (flags & SCF_DO_SUBSTR) {
4413                     SV *last_str = NULL;
4414                     int counted = mincount != 0;
4415
4416                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4417                         SSize_t b = pos_before >= data->last_start_min
4418                             ? pos_before : data->last_start_min;
4419                         STRLEN l;
4420                         const char * const s = SvPV_const(data->last_found, l);
4421                         SSize_t old = b - data->last_start_min;
4422
4423                         if (UTF)
4424                             old = utf8_hop((U8*)s, old) - (U8*)s;
4425                         l -= old;
4426                         /* Get the added string: */
4427                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4428                         if (deltanext == 0 && pos_before == b) {
4429                             /* What was added is a constant string */
4430                             if (mincount > 1) {
4431                                 SvGROW(last_str, (mincount * l) + 1);
4432                                 repeatcpy(SvPVX(last_str) + l,
4433                                           SvPVX_const(last_str), l, mincount - 1);
4434                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4435                                 /* Add additional parts. */
4436                                 SvCUR_set(data->last_found,
4437                                           SvCUR(data->last_found) - l);
4438                                 sv_catsv(data->last_found, last_str);
4439                                 {
4440                                     SV * sv = data->last_found;
4441                                     MAGIC *mg =
4442                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4443                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4444                                     if (mg && mg->mg_len >= 0)
4445                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4446                                 }
4447                                 data->last_end += l * (mincount - 1);
4448                             }
4449                         } else {
4450                             /* start offset must point into the last copy */
4451                             data->last_start_min += minnext * (mincount - 1);
4452                             data->last_start_max += is_inf ? SSize_t_MAX
4453                                 : (maxcount - 1) * (minnext + data->pos_delta);
4454                         }
4455                     }
4456                     /* It is counted once already... */
4457                     data->pos_min += minnext * (mincount - counted);
4458 #if 0
4459 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4460                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4461                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4462     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4463     (UV)mincount);
4464 if (deltanext != SSize_t_MAX)
4465 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4466     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4467           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4468 #endif
4469                     if (deltanext == SSize_t_MAX ||
4470                         -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4471                         data->pos_delta = SSize_t_MAX;
4472                     else
4473                         data->pos_delta += - counted * deltanext +
4474                         (minnext + deltanext) * maxcount - minnext * mincount;
4475                     if (mincount != maxcount) {
4476                          /* Cannot extend fixed substrings found inside
4477                             the group.  */
4478                         SCAN_COMMIT(pRExC_state,data,minlenp);
4479                         if (mincount && last_str) {
4480                             SV * const sv = data->last_found;
4481                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4482                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4483
4484                             if (mg)
4485                                 mg->mg_len = -1;
4486                             sv_setsv(sv, last_str);
4487                             data->last_end = data->pos_min;
4488                             data->last_start_min =
4489                                 data->pos_min - CHR_SVLEN(last_str);
4490                             data->last_start_max = is_inf
4491                                 ? SSize_t_MAX
4492                                 : data->pos_min + data->pos_delta
4493                                 - CHR_SVLEN(last_str);
4494                         }
4495                         data->longest = &(data->longest_float);
4496                     }
4497                     SvREFCNT_dec(last_str);
4498                 }
4499                 if (data && (fl & SF_HAS_EVAL))
4500                     data->flags |= SF_HAS_EVAL;
4501               optimize_curly_tail:
4502                 if (OP(oscan) != CURLYX) {
4503                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4504                            && NEXT_OFF(next))
4505                         NEXT_OFF(oscan) += NEXT_OFF(next);
4506                 }
4507                 continue;
4508
4509             default:
4510 #ifdef DEBUGGING
4511                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4512                                                                     OP(scan));
4513 #endif
4514             case REF:
4515             case CLUMP:
4516                 if (flags & SCF_DO_SUBSTR) {
4517                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4518                     data->longest = &(data->longest_float);
4519                 }
4520                 is_inf = is_inf_internal = 1;
4521                 if (flags & SCF_DO_STCLASS_OR) {
4522                     if (OP(scan) == CLUMP) {
4523                         /* Actually is any start char, but very few code points
4524                          * aren't start characters */
4525                         ssc_match_all_cp(data->start_class);
4526                     }
4527                     else {
4528                         ssc_anything(data->start_class);
4529                     }
4530                 }
4531                 flags &= ~SCF_DO_STCLASS;
4532                 break;
4533             }
4534         }
4535         else if (OP(scan) == LNBREAK) {
4536             if (flags & SCF_DO_STCLASS) {
4537                 if (flags & SCF_DO_STCLASS_AND) {
4538                     ssc_intersection(data->start_class,
4539                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4540                     ssc_clear_locale(data->start_class);
4541                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4542                 }
4543                 else if (flags & SCF_DO_STCLASS_OR) {
4544                     ssc_union(data->start_class,
4545                               PL_XPosix_ptrs[_CC_VERTSPACE],
4546                               FALSE);
4547                     ssc_and(pRExC_state, data->start_class, and_withp);
4548                 }
4549                 flags &= ~SCF_DO_STCLASS;
4550             }
4551             min++;
4552             delta++;    /* Because of the 2 char string cr-lf */
4553             if (flags & SCF_DO_SUBSTR) {
4554                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4555                 data->pos_min += 1;
4556                 data->pos_delta += 1;
4557                 data->longest = &(data->longest_float);
4558             }
4559         }
4560         else if (REGNODE_SIMPLE(OP(scan))) {
4561
4562             if (flags & SCF_DO_SUBSTR) {
4563                 SCAN_COMMIT(pRExC_state,data,minlenp);
4564                 data->pos_min++;
4565             }
4566             min++;
4567             if (flags & SCF_DO_STCLASS) {
4568                 bool invert = 0;
4569                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4570                 U8 classnum;
4571                 U8 namedclass;
4572
4573                 if (flags & SCF_DO_STCLASS_AND) {
4574                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4575                 }
4576
4577                 /* Some of the logic below assumes that switching
4578                    locale on will only add false positives. */
4579                 switch (OP(scan)) {
4580
4581                 default:
4582 #ifdef DEBUGGING
4583                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4584 #endif
4585                 case CANY:
4586                 case SANY:
4587                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4588                         ssc_match_all_cp(data->start_class);
4589                     break;
4590
4591                 case REG_ANY:
4592                     {
4593                         SV* REG_ANY_invlist = _new_invlist(2);
4594                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4595                                                             '\n');
4596                         if (flags & SCF_DO_STCLASS_OR) {
4597                             ssc_union(data->start_class,
4598                                       REG_ANY_invlist,
4599                                       TRUE /* TRUE => invert, hence all but \n
4600                                             */
4601                                       );
4602                         }
4603                         else if (flags & SCF_DO_STCLASS_AND) {
4604                             ssc_intersection(data->start_class,
4605                                              REG_ANY_invlist,
4606                                              TRUE  /* TRUE => invert */
4607                                              );
4608                             ssc_clear_locale(data->start_class);
4609                         }
4610                         SvREFCNT_dec_NN(REG_ANY_invlist);
4611                     }
4612                     break;
4613
4614                 case ANYOF_WARN_SUPER:
4615                 case ANYOF:
4616                     if (flags & SCF_DO_STCLASS_AND)
4617                         ssc_and(pRExC_state, data->start_class,
4618                                 (regnode_ssc*) scan);
4619                     else
4620                         ssc_or(pRExC_state, data->start_class,
4621                                                           (regnode_ssc*)scan);
4622                     break;
4623
4624                 case NPOSIXL:
4625                     invert = 1;
4626                     /* FALL THROUGH */
4627
4628                 case POSIXL:
4629                     classnum = FLAGS(scan);
4630                     namedclass = classnum_to_namedclass(classnum) + invert;
4631                     if (flags & SCF_DO_STCLASS_AND) {
4632                         bool was_there = cBOOL(
4633                                           ANYOF_POSIXL_TEST(data->start_class,
4634                                                                  namedclass));
4635                         ANYOF_POSIXL_ZERO(data->start_class);
4636                         if (was_there) {    /* Do an AND */
4637                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4638                         }
4639                         /* No individual code points can now match */
4640                         data->start_class->invlist
4641                                                 = sv_2mortal(_new_invlist(0));
4642                     }
4643                     else {
4644                         int complement = namedclass + ((invert) ? -1 : 1);
4645
4646                         assert(flags & SCF_DO_STCLASS_OR);
4647
4648                         /* If the complement of this class was already there,
4649                          * the result is that they match all code points,
4650                          * (\d + \D == everything).  Remove the classes from
4651                          * future consideration.  Locale is not relevant in
4652                          * this case */
4653                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4654                             ssc_match_all_cp(data->start_class);
4655                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4656                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4657                             if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4658                             {
4659                                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4660                             }
4661                         }
4662                         else {  /* The usual case; just add this class to the
4663                                    existing set */
4664                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4665                             ANYOF_FLAGS(data->start_class)
4666                                                 |= ANYOF_LOCALE|ANYOF_POSIXL;
4667                         }
4668                     }
4669                     break;
4670
4671                 case NPOSIXA:   /* For these, we always know the exact set of
4672                                    what's matched */
4673                     invert = 1;
4674                     /* FALL THROUGH */
4675                 case POSIXA:
4676                     classnum = FLAGS(scan);
4677                     my_invlist = PL_Posix_ptrs[classnum];
4678                     goto join_posix;
4679
4680                 case NPOSIXD:
4681                 case NPOSIXU:
4682                     invert = 1;
4683                     /* FALL THROUGH */
4684                 case POSIXD:
4685                 case POSIXU:
4686                     classnum = FLAGS(scan);
4687
4688                     /* If we know all the code points that match the class, use
4689                      * that; otherwise use the Latin1 code points, plus we have
4690                      * to assume that it could match anything above Latin1 */
4691                     if (PL_XPosix_ptrs[classnum]) {
4692                         my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
4693                     }
4694                     else {
4695                         _invlist_union(PL_L1Posix_ptrs[classnum],
4696                                        PL_AboveLatin1, &my_invlist);
4697                     }
4698
4699                     /* NPOSIXD matches all upper Latin1 code points unless the
4700                      * target string being matched is UTF-8, which is
4701                      * unknowable until match time */
4702                     if (PL_regkind[OP(scan)] == NPOSIXD) {
4703                         _invlist_union_complement_2nd(my_invlist,
4704                                         PL_Posix_ptrs[_CC_ASCII], &my_invlist);
4705                     }
4706
4707                   join_posix:
4708
4709                     if (flags & SCF_DO_STCLASS_AND) {
4710                         ssc_intersection(data->start_class, my_invlist, invert);
4711                         ssc_clear_locale(data->start_class);
4712                     }
4713                     else {
4714                         assert(flags & SCF_DO_STCLASS_OR);
4715                         ssc_union(data->start_class, my_invlist, invert);
4716                     }
4717                 }
4718                 if (flags & SCF_DO_STCLASS_OR)
4719                     ssc_and(pRExC_state, data->start_class, and_withp);
4720                 flags &= ~SCF_DO_STCLASS;
4721             }
4722         }
4723         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4724             data->flags |= (OP(scan) == MEOL
4725                             ? SF_BEFORE_MEOL
4726                             : SF_BEFORE_SEOL);
4727             SCAN_COMMIT(pRExC_state, data, minlenp);
4728
4729         }
4730         else if (  PL_regkind[OP(scan)] == BRANCHJ
4731                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4732                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4733                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4734             if ( OP(scan) == UNLESSM &&
4735                  scan->flags == 0 &&
4736                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4737                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4738             ) {
4739                 regnode *opt;
4740                 regnode *upto= regnext(scan);
4741                 DEBUG_PARSE_r({
4742                     SV * const mysv_val=sv_newmortal();
4743                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4744
4745                     /*DEBUG_PARSE_MSG("opfail");*/
4746                     regprop(RExC_rx, mysv_val, upto);
4747                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4748                                   SvPV_nolen_const(mysv_val),
4749                                   (IV)REG_NODE_NUM(upto),
4750                                   (IV)(upto - scan)
4751                     );
4752                 });
4753                 OP(scan) = OPFAIL;
4754                 NEXT_OFF(scan) = upto - scan;
4755                 for (opt= scan + 1; opt < upto ; opt++)
4756                     OP(opt) = OPTIMIZED;
4757                 scan= upto;
4758                 continue;
4759             }
4760             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4761                 || OP(scan) == UNLESSM )
4762             {
4763                 /* Negative Lookahead/lookbehind
4764                    In this case we can't do fixed string optimisation.
4765                 */
4766
4767                 SSize_t deltanext, minnext, fake = 0;
4768                 regnode *nscan;
4769                 regnode_ssc intrnl;
4770                 int f = 0;
4771
4772                 data_fake.flags = 0;
4773                 if (data) {
4774                     data_fake.whilem_c = data->whilem_c;
4775                     data_fake.last_closep = data->last_closep;
4776                 }
4777                 else
4778                     data_fake.last_closep = &fake;
4779                 data_fake.pos_delta = delta;
4780                 if ( flags & SCF_DO_STCLASS && !scan->flags
4781                      && OP(scan) == IFMATCH ) { /* Lookahead */
4782                     ssc_init(pRExC_state, &intrnl);
4783                     data_fake.start_class = &intrnl;
4784                     f |= SCF_DO_STCLASS_AND;
4785                 }
4786                 if (flags & SCF_WHILEM_VISITED_POS)
4787                     f |= SCF_WHILEM_VISITED_POS;
4788                 next = regnext(scan);
4789                 nscan = NEXTOPER(NEXTOPER(scan));
4790                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4791                     last, &data_fake, stopparen, recursed_depth, NULL, f, depth+1);
4792                 if (scan->flags) {
4793                     if (deltanext) {
4794                         FAIL("Variable length lookbehind not implemented");
4795                     }
4796                     else if (minnext > (I32)U8_MAX) {
4797                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4798                     }
4799                     scan->flags = (U8)minnext;
4800                 }
4801                 if (data) {
4802                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4803                         pars++;
4804                     if (data_fake.flags & SF_HAS_EVAL)
4805                         data->flags |= SF_HAS_EVAL;
4806                     data->whilem_c = data_fake.whilem_c;
4807                 }
4808                 if (f & SCF_DO_STCLASS_AND) {
4809                     if (flags & SCF_DO_STCLASS_OR) {
4810                         /* OR before, AND after: ideally we would recurse with
4811                          * data_fake to get the AND applied by study of the
4812                          * remainder of the pattern, and then derecurse;
4813                          * *** HACK *** for now just treat as "no information".
4814                          * See [perl #56690].
4815                          */
4816                         ssc_init(pRExC_state, data->start_class);
4817                     }  else {
4818                         /* AND before and after: combine and continue */
4819                         ssc_and(pRExC_state, data->start_class, &intrnl);
4820                     }
4821                 }
4822             }
4823 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4824             else {
4825                 /* Positive Lookahead/lookbehind
4826                    In this case we can do fixed string optimisation,
4827                    but we must be careful about it. Note in the case of
4828                    lookbehind the positions will be offset by the minimum
4829                    length of the pattern, something we won't know about
4830                    until after the recurse.
4831                 */
4832                 SSize_t deltanext, fake = 0;
4833                 regnode *nscan;
4834                 regnode_ssc intrnl;
4835                 int f = 0;
4836                 /* We use SAVEFREEPV so that when the full compile
4837                     is finished perl will clean up the allocated
4838                     minlens when it's all done. This way we don't
4839                     have to worry about freeing them when we know
4840                     they wont be used, which would be a pain.
4841                  */
4842                 SSize_t *minnextp;
4843                 Newx( minnextp, 1, SSize_t );
4844                 SAVEFREEPV(minnextp);
4845
4846                 if (data) {
4847                     StructCopy(data, &data_fake, scan_data_t);
4848                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4849                         f |= SCF_DO_SUBSTR;
4850                         if (scan->flags)
4851                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4852                         data_fake.last_found=newSVsv(data->last_found);
4853                     }
4854                 }
4855                 else
4856                     data_fake.last_closep = &fake;
4857                 data_fake.flags = 0;
4858                 data_fake.pos_delta = delta;
4859                 if (is_inf)
4860                     data_fake.flags |= SF_IS_INF;
4861                 if ( flags & SCF_DO_STCLASS && !scan->flags
4862                      && OP(scan) == IFMATCH ) { /* Lookahead */
4863                     ssc_init(pRExC_state, &intrnl);
4864                     data_fake.start_class = &intrnl;
4865                     f |= SCF_DO_STCLASS_AND;
4866                 }
4867                 if (flags & SCF_WHILEM_VISITED_POS)
4868                     f |= SCF_WHILEM_VISITED_POS;
4869                 next = regnext(scan);
4870                 nscan = NEXTOPER(NEXTOPER(scan));
4871
4872                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4873                     last, &data_fake, stopparen, recursed_depth, NULL, f,depth+1);
4874                 if (scan->flags) {
4875                     if (deltanext) {
4876                         FAIL("Variable length lookbehind not implemented");
4877                     }
4878                     else if (*minnextp > (I32)U8_MAX) {
4879                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4880                     }
4881                     scan->flags = (U8)*minnextp;
4882                 }
4883
4884                 *minnextp += min;
4885
4886                 if (f & SCF_DO_STCLASS_AND) {
4887                     ssc_and(pRExC_state, data->start_class, &intrnl);
4888                 }
4889                 if (data) {
4890                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4891                         pars++;
4892                     if (data_fake.flags & SF_HAS_EVAL)
4893                         data->flags |= SF_HAS_EVAL;
4894                     data->whilem_c = data_fake.whilem_c;
4895                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4896                         if (RExC_rx->minlen<*minnextp)
4897                             RExC_rx->minlen=*minnextp;
4898                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4899                         SvREFCNT_dec_NN(data_fake.last_found);
4900
4901                         if ( data_fake.minlen_fixed != minlenp )
4902                         {
4903                             data->offset_fixed= data_fake.offset_fixed;
4904                             data->minlen_fixed= data_fake.minlen_fixed;
4905                             data->lookbehind_fixed+= scan->flags;
4906                         }
4907                         if ( data_fake.minlen_float != minlenp )
4908                         {
4909                             data->minlen_float= data_fake.minlen_float;
4910                             data->offset_float_min=data_fake.offset_float_min;
4911                             data->offset_float_max=data_fake.offset_float_max;
4912                             data->lookbehind_float+= scan->flags;
4913                         }
4914                     }
4915                 }
4916             }
4917 #endif
4918         }
4919         else if (OP(scan) == OPEN) {
4920             if (stopparen != (I32)ARG(scan))
4921                 pars++;
4922         }
4923         else if (OP(scan) == CLOSE) {
4924             if (stopparen == (I32)ARG(scan)) {
4925                 break;
4926             }
4927             if ((I32)ARG(scan) == is_par) {
4928                 next = regnext(scan);
4929
4930                 if ( next && (OP(next) != WHILEM) && next < last)
4931                     is_par = 0;         /* Disable optimization */
4932             }
4933             if (data)
4934                 *(data->last_closep) = ARG(scan);
4935         }
4936         else if (OP(scan) == EVAL) {
4937                 if (data)
4938                     data->flags |= SF_HAS_EVAL;
4939         }
4940         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4941             if (flags & SCF_DO_SUBSTR) {
4942                 SCAN_COMMIT(pRExC_state,data,minlenp);
4943                 flags &= ~SCF_DO_SUBSTR;
4944             }
4945             if (data && OP(scan)==ACCEPT) {
4946                 data->flags |= SCF_SEEN_ACCEPT;
4947                 if (stopmin > min)
4948                     stopmin = min;
4949             }
4950         }
4951         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4952         {
4953                 if (flags & SCF_DO_SUBSTR) {
4954                     SCAN_COMMIT(pRExC_state,data,minlenp);
4955                     data->longest = &(data->longest_float);
4956                 }
4957                 is_inf = is_inf_internal = 1;
4958                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4959                     ssc_anything(data->start_class);
4960                 flags &= ~SCF_DO_STCLASS;
4961         }
4962         else if (OP(scan) == GPOS) {
4963             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4964                 !(delta || is_inf || (data && data->pos_delta)))
4965             {
4966                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4967                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4968                 if (RExC_rx->gofs < (STRLEN)min)
4969                     RExC_rx->gofs = min;
4970             } else {
4971                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4972                 RExC_rx->gofs = 0;
4973             }
4974         }
4975 #ifdef TRIE_STUDY_OPT
4976 #ifdef FULL_TRIE_STUDY
4977         else if (PL_regkind[OP(scan)] == TRIE) {
4978             /* NOTE - There is similar code to this block above for handling
4979                BRANCH nodes on the initial study.  If you change stuff here
4980                check there too. */
4981             regnode *trie_node= scan;
4982             regnode *tail= regnext(scan);
4983             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4984             SSize_t max1 = 0, min1 = SSize_t_MAX;
4985             regnode_ssc accum;
4986
4987             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4988                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4989             if (flags & SCF_DO_STCLASS)
4990                 ssc_init_zero(pRExC_state, &accum);
4991
4992             if (!trie->jump) {
4993                 min1= trie->minlen;
4994                 max1= trie->maxlen;
4995             } else {
4996                 const regnode *nextbranch= NULL;
4997                 U32 word;
4998
4999                 for ( word=1 ; word <= trie->wordcount ; word++)
5000                 {
5001                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5002                     regnode_ssc this_class;
5003
5004                     data_fake.flags = 0;
5005                     if (data) {
5006                         data_fake.whilem_c = data->whilem_c;
5007                         data_fake.last_closep = data->last_closep;
5008                     }
5009                     else
5010                         data_fake.last_closep = &fake;
5011                     data_fake.pos_delta = delta;
5012                     if (flags & SCF_DO_STCLASS) {
5013                         ssc_init(pRExC_state, &this_class);
5014                         data_fake.start_class = &this_class;
5015                         f = SCF_DO_STCLASS_AND;
5016                     }
5017                     if (flags & SCF_WHILEM_VISITED_POS)
5018                         f |= SCF_WHILEM_VISITED_POS;
5019
5020                     if (trie->jump[word]) {
5021                         if (!nextbranch)
5022                             nextbranch = trie_node + trie->jump[0];
5023                         scan= trie_node + trie->jump[word];
5024                         /* We go from the jump point to the branch that follows
5025                            it. Note this means we need the vestigal unused branches
5026                            even though they arent otherwise used.
5027                          */
5028                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5029                             &deltanext, (regnode *)nextbranch, &data_fake,
5030                             stopparen, recursed_depth, NULL, f,depth+1);
5031                     }
5032                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5033                         nextbranch= regnext((regnode*)nextbranch);
5034
5035                     if (min1 > (SSize_t)(minnext + trie->minlen))
5036                         min1 = minnext + trie->minlen;
5037                     if (deltanext == SSize_t_MAX) {
5038                         is_inf = is_inf_internal = 1;
5039                         max1 = SSize_t_MAX;
5040                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5041                         max1 = minnext + deltanext + trie->maxlen;
5042
5043                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5044                         pars++;
5045                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5046                         if ( stopmin > min + min1)
5047                             stopmin = min + min1;
5048                         flags &= ~SCF_DO_SUBSTR;
5049                         if (data)
5050                             data->flags |= SCF_SEEN_ACCEPT;
5051                     }
5052                     if (data) {
5053                         if (data_fake.flags & SF_HAS_EVAL)
5054                             data->flags |= SF_HAS_EVAL;
5055                         data->whilem_c = data_fake.whilem_c;
5056                     }
5057                     if (flags & SCF_DO_STCLASS)
5058                         ssc_or(pRExC_state, &accum, &this_class);
5059                 }
5060             }
5061             if (flags & SCF_DO_SUBSTR) {
5062                 data->pos_min += min1;
5063                 data->pos_delta += max1 - min1;
5064                 if (max1 != min1 || is_inf)
5065                     data->longest = &(data->longest_float);
5066             }
5067             min += min1;
5068             delta += max1 - min1;
5069             if (flags & SCF_DO_STCLASS_OR) {
5070                 ssc_or(pRExC_state, data->start_class, &accum);
5071                 if (min1) {
5072                     ssc_and(pRExC_state, data->start_class, and_withp);
5073                     flags &= ~SCF_DO_STCLASS;
5074                 }
5075             }
5076             else if (flags & SCF_DO_STCLASS_AND) {
5077                 if (min1) {
5078                     ssc_and(pRExC_state, data->start_class, &accum);
5079                     flags &= ~SCF_DO_STCLASS;
5080                 }
5081                 else {
5082                     /* Switch to OR mode: cache the old value of
5083                      * data->start_class */
5084                     INIT_AND_WITHP;
5085                     StructCopy(data->start_class, and_withp, regnode_ssc);
5086                     flags &= ~SCF_DO_STCLASS_AND;
5087                     StructCopy(&accum, data->start_class, regnode_ssc);
5088                     flags |= SCF_DO_STCLASS_OR;
5089                 }
5090             }
5091             scan= tail;
5092             continue;
5093         }
5094 #else
5095         else if (PL_regkind[OP(scan)] == TRIE) {
5096             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5097             U8*bang=NULL;
5098
5099             min += trie->minlen;
5100             delta += (trie->maxlen - trie->minlen);
5101             flags &= ~SCF_DO_STCLASS; /* xxx */
5102             if (flags & SCF_DO_SUBSTR) {
5103                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
5104                 data->pos_min += trie->minlen;
5105                 data->pos_delta += (trie->maxlen - trie->minlen);
5106                 if (trie->maxlen != trie->minlen)
5107                     data->longest = &(data->longest_float);
5108             }
5109             if (trie->jump) /* no more substrings -- for now /grr*/
5110                 flags &= ~SCF_DO_SUBSTR;
5111         }
5112 #endif /* old or new */
5113 #endif /* TRIE_STUDY_OPT */
5114
5115         /* Else: zero-length, ignore. */
5116         scan = regnext(scan);
5117     }
5118     /* If we are exiting a recursion we can unset its recursed bit
5119      * and allow ourselves to enter it again - no danger of an
5120      * infinite loop there.
5121     if (stopparen > -1 && recursed) {
5122         DEBUG_STUDYDATA("unset:", data,depth);
5123         PAREN_UNSET( recursed, stopparen);
5124     }
5125     */
5126     if (frame) {
5127         DEBUG_STUDYDATA("frame-end:",data,depth);
5128         DEBUG_PEEP("fend", scan, depth);
5129         /* restore previous context */
5130         last = frame->last;
5131         scan = frame->next;
5132         stopparen = frame->stop;
5133         recursed_depth = frame->prev_recursed_depth;
5134         depth = depth - 1;
5135
5136         frame = frame->prev;
5137         goto fake_study_recurse;
5138     }
5139
5140   finish:
5141     assert(!frame);
5142     DEBUG_STUDYDATA("pre-fin:",data,depth);
5143
5144     *scanp = scan;
5145     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5146     if (flags & SCF_DO_SUBSTR && is_inf)
5147         data->pos_delta = SSize_t_MAX - data->pos_min;
5148     if (is_par > (I32)U8_MAX)
5149         is_par = 0;
5150     if (is_par && pars==1 && data) {
5151         data->flags |= SF_IN_PAR;
5152         data->flags &= ~SF_HAS_PAR;
5153     }
5154     else if (pars && data) {
5155         data->flags |= SF_HAS_PAR;
5156         data->flags &= ~SF_IN_PAR;
5157     }
5158     if (flags & SCF_DO_STCLASS_OR)
5159         ssc_and(pRExC_state, data->start_class, and_withp);
5160     if (flags & SCF_TRIE_RESTUDY)
5161         data->flags |=  SCF_TRIE_RESTUDY;
5162
5163     DEBUG_STUDYDATA("post-fin:",data,depth);
5164
5165     return min < stopmin ? min : stopmin;
5166 }
5167
5168 STATIC U32
5169 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5170 {
5171     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5172
5173     PERL_ARGS_ASSERT_ADD_DATA;
5174
5175     Renewc(RExC_rxi->data,
5176            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5177            char, struct reg_data);
5178     if(count)
5179         Renew(RExC_rxi->data->what, count + n, U8);
5180     else
5181         Newx(RExC_rxi->data->what, n, U8);
5182     RExC_rxi->data->count = count + n;
5183     Copy(s, RExC_rxi->data->what + count, n, U8);
5184     return count;
5185 }
5186
5187 /*XXX: todo make this not included in a non debugging perl */
5188 #ifndef PERL_IN_XSUB_RE
5189 void
5190 Perl_reginitcolors(pTHX)
5191 {
5192     dVAR;
5193     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5194     if (s) {
5195         char *t = savepv(s);
5196         int i = 0;
5197         PL_colors[0] = t;
5198         while (++i < 6) {
5199             t = strchr(t, '\t');
5200             if (t) {
5201                 *t = '\0';
5202                 PL_colors[i] = ++t;
5203             }
5204             else
5205                 PL_colors[i] = t = (char *)"";
5206         }
5207     } else {
5208         int i = 0;
5209         while (i < 6)
5210             PL_colors[i++] = (char *)"";
5211     }
5212     PL_colorset = 1;
5213 }
5214 #endif
5215
5216
5217 #ifdef TRIE_STUDY_OPT
5218 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5219     STMT_START {                                            \
5220         if (                                                \
5221               (data.flags & SCF_TRIE_RESTUDY)               \
5222               && ! restudied++                              \
5223         ) {                                                 \
5224             dOsomething;                                    \
5225             goto reStudy;                                   \
5226         }                                                   \
5227     } STMT_END
5228 #else
5229 #define CHECK_RESTUDY_GOTO_butfirst
5230 #endif
5231
5232 /*
5233  * pregcomp - compile a regular expression into internal code
5234  *
5235  * Decides which engine's compiler to call based on the hint currently in
5236  * scope
5237  */
5238
5239 #ifndef PERL_IN_XSUB_RE
5240
5241 /* return the currently in-scope regex engine (or the default if none)  */
5242
5243 regexp_engine const *
5244 Perl_current_re_engine(pTHX)
5245 {
5246     dVAR;
5247
5248     if (IN_PERL_COMPILETIME) {
5249         HV * const table = GvHV(PL_hintgv);
5250         SV **ptr;
5251
5252         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5253             return &PL_core_reg_engine;
5254         ptr = hv_fetchs(table, "regcomp", FALSE);
5255         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5256             return &PL_core_reg_engine;
5257         return INT2PTR(regexp_engine*,SvIV(*ptr));
5258     }
5259     else {
5260         SV *ptr;
5261         if (!PL_curcop->cop_hints_hash)
5262             return &PL_core_reg_engine;
5263         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5264         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5265             return &PL_core_reg_engine;
5266         return INT2PTR(regexp_engine*,SvIV(ptr));
5267     }
5268 }
5269
5270
5271 REGEXP *
5272 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5273 {
5274     dVAR;
5275     regexp_engine const *eng = current_re_engine();
5276     GET_RE_DEBUG_FLAGS_DECL;
5277
5278     PERL_ARGS_ASSERT_PREGCOMP;
5279
5280     /* Dispatch a request to compile a regexp to correct regexp engine. */
5281     DEBUG_COMPILE_r({
5282         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5283                         PTR2UV(eng));
5284     });
5285     return CALLREGCOMP_ENG(eng, pattern, flags);
5286 }
5287 #endif
5288
5289 /* public(ish) entry point for the perl core's own regex compiling code.
5290  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5291  * pattern rather than a list of OPs, and uses the internal engine rather
5292  * than the current one */
5293
5294 REGEXP *
5295 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5296 {
5297     SV *pat = pattern; /* defeat constness! */
5298     PERL_ARGS_ASSERT_RE_COMPILE;
5299     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5300 #ifdef PERL_IN_XSUB_RE
5301                                 &my_reg_engine,
5302 #else
5303                                 &PL_core_reg_engine,
5304 #endif
5305                                 NULL, NULL, rx_flags, 0);
5306 }
5307
5308
5309 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5310  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5311  * point to the realloced string and length.
5312  *
5313  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5314  * stuff added */
5315
5316 static void
5317 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5318                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5319 {
5320     U8 *const src = (U8*)*pat_p;
5321     U8 *dst;
5322     int n=0;
5323     STRLEN s = 0, d = 0;
5324     bool do_end = 0;
5325     GET_RE_DEBUG_FLAGS_DECL;
5326
5327     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5328         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5329
5330     Newx(dst, *plen_p * 2 + 1, U8);
5331
5332     while (s < *plen_p) {
5333         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5334             dst[d]   = src[s];
5335         else {
5336             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5337             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5338         }
5339         if (n < num_code_blocks) {
5340             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5341                 pRExC_state->code_blocks[n].start = d;
5342                 assert(dst[d] == '(');
5343                 do_end = 1;
5344             }
5345             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5346                 pRExC_state->code_blocks[n].end = d;
5347                 assert(dst[d] == ')');
5348                 do_end = 0;
5349                 n++;
5350             }
5351         }
5352         s++;
5353         d++;
5354     }
5355     dst[d] = '\0';
5356     *plen_p = d;
5357     *pat_p = (char*) dst;
5358     SAVEFREEPV(*pat_p);
5359     RExC_orig_utf8 = RExC_utf8 = 1;
5360 }
5361
5362
5363
5364 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5365  * while recording any code block indices, and handling overloading,
5366  * nested qr// objects etc.  If pat is null, it will allocate a new
5367  * string, or just return the first arg, if there's only one.
5368  *
5369  * Returns the malloced/updated pat.
5370  * patternp and pat_count is the array of SVs to be concatted;
5371  * oplist is the optional list of ops that generated the SVs;
5372  * recompile_p is a pointer to a boolean that will be set if
5373  *   the regex will need to be recompiled.
5374  * delim, if non-null is an SV that will be inserted between each element
5375  */
5376
5377 static SV*
5378 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5379                 SV *pat, SV ** const patternp, int pat_count,
5380                 OP *oplist, bool *recompile_p, SV *delim)
5381 {
5382     SV **svp;
5383     int n = 0;
5384     bool use_delim = FALSE;
5385     bool alloced = FALSE;
5386
5387     /* if we know we have at least two args, create an empty string,
5388      * then concatenate args to that. For no args, return an empty string */
5389     if (!pat && pat_count != 1) {
5390         pat = newSVpvn("", 0);
5391         SAVEFREESV(pat);
5392         alloced = TRUE;
5393     }
5394
5395     for (svp = patternp; svp < patternp + pat_count; svp++) {
5396         SV *sv;
5397         SV *rx  = NULL;
5398         STRLEN orig_patlen = 0;
5399         bool code = 0;
5400         SV *msv = use_delim ? delim : *svp;
5401         if (!msv) msv = &PL_sv_undef;
5402
5403         /* if we've got a delimiter, we go round the loop twice for each
5404          * svp slot (except the last), using the delimiter the second
5405          * time round */
5406         if (use_delim) {
5407             svp--;
5408             use_delim = FALSE;
5409         }
5410         else if (delim)
5411             use_delim = TRUE;
5412
5413         if (SvTYPE(msv) == SVt_PVAV) {
5414             /* we've encountered an interpolated array within
5415              * the pattern, e.g. /...@a..../. Expand the list of elements,
5416              * then recursively append elements.
5417              * The code in this block is based on S_pushav() */
5418
5419             AV *const av = (AV*)msv;
5420             const SSize_t maxarg = AvFILL(av) + 1;
5421             SV **array;
5422
5423             if (oplist) {
5424                 assert(oplist->op_type == OP_PADAV
5425                     || oplist->op_type == OP_RV2AV);
5426                 oplist = oplist->op_sibling;;
5427             }
5428
5429             if (SvRMAGICAL(av)) {
5430                 SSize_t i;
5431
5432                 Newx(array, maxarg, SV*);
5433                 SAVEFREEPV(array);
5434                 for (i=0; i < maxarg; i++) {
5435                     SV ** const svp = av_fetch(av, i, FALSE);
5436                     array[i] = svp ? *svp : &PL_sv_undef;
5437                 }
5438             }
5439             else
5440                 array = AvARRAY(av);
5441
5442             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5443                                 array, maxarg, NULL, recompile_p,
5444                                 /* $" */
5445                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5446
5447             continue;
5448         }
5449
5450
5451         /* we make the assumption here that each op in the list of
5452          * op_siblings maps to one SV pushed onto the stack,
5453          * except for code blocks, with have both an OP_NULL and
5454          * and OP_CONST.
5455          * This allows us to match up the list of SVs against the
5456          * list of OPs to find the next code block.
5457          *
5458          * Note that       PUSHMARK PADSV PADSV ..
5459          * is optimised to
5460          *                 PADRANGE PADSV  PADSV  ..
5461          * so the alignment still works. */
5462
5463         if (oplist) {
5464             if (oplist->op_type == OP_NULL
5465                 && (oplist->op_flags & OPf_SPECIAL))
5466             {
5467                 assert(n < pRExC_state->num_code_blocks);
5468                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5469                 pRExC_state->code_blocks[n].block = oplist;
5470                 pRExC_state->code_blocks[n].src_regex = NULL;
5471                 n++;
5472                 code = 1;
5473                 oplist = oplist->op_sibling; /* skip CONST */
5474                 assert(oplist);
5475             }
5476             oplist = oplist->op_sibling;;
5477         }
5478
5479         /* apply magic and QR overloading to arg */
5480
5481         SvGETMAGIC(msv);
5482         if (SvROK(msv) && SvAMAGIC(msv)) {
5483             SV *sv = AMG_CALLunary(msv, regexp_amg);
5484             if (sv) {
5485                 if (SvROK(sv))
5486                     sv = SvRV(sv);
5487                 if (SvTYPE(sv) != SVt_REGEXP)
5488                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5489                 msv = sv;
5490             }
5491         }
5492
5493         /* try concatenation overload ... */
5494         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5495                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5496         {
5497             sv_setsv(pat, sv);
5498             /* overloading involved: all bets are off over literal
5499              * code. Pretend we haven't seen it */
5500             pRExC_state->num_code_blocks -= n;
5501             n = 0;
5502         }
5503         else  {
5504             /* ... or failing that, try "" overload */
5505             while (SvAMAGIC(msv)
5506                     && (sv = AMG_CALLunary(msv, string_amg))
5507                     && sv != msv
5508                     &&  !(   SvROK(msv)
5509                           && SvROK(sv)
5510                           && SvRV(msv) == SvRV(sv))
5511             ) {
5512                 msv = sv;
5513                 SvGETMAGIC(msv);
5514             }
5515             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5516                 msv = SvRV(msv);
5517
5518             if (pat) {
5519                 /* this is a partially unrolled
5520                  *     sv_catsv_nomg(pat, msv);
5521                  * that allows us to adjust code block indices if
5522                  * needed */
5523                 STRLEN dlen;
5524                 char *dst = SvPV_force_nomg(pat, dlen);
5525                 orig_patlen = dlen;
5526                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5527                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5528                     sv_setpvn(pat, dst, dlen);
5529                     SvUTF8_on(pat);
5530                 }
5531                 sv_catsv_nomg(pat, msv);
5532                 rx = msv;
5533             }
5534             else
5535                 pat = msv;
5536
5537             if (code)
5538                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5539         }
5540
5541         /* extract any code blocks within any embedded qr//'s */
5542         if (rx && SvTYPE(rx) == SVt_REGEXP
5543             && RX_ENGINE((REGEXP*)rx)->op_comp)
5544         {
5545
5546             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5547             if (ri->num_code_blocks) {
5548                 int i;
5549                 /* the presence of an embedded qr// with code means
5550                  * we should always recompile: the text of the
5551                  * qr// may not have changed, but it may be a
5552                  * different closure than last time */
5553                 *recompile_p = 1;
5554                 Renew(pRExC_state->code_blocks,
5555                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5556                     struct reg_code_block);
5557                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5558
5559                 for (i=0; i < ri->num_code_blocks; i++) {
5560                     struct reg_code_block *src, *dst;
5561                     STRLEN offset =  orig_patlen
5562                         + ReANY((REGEXP *)rx)->pre_prefix;
5563                     assert(n < pRExC_state->num_code_blocks);
5564                     src = &ri->code_blocks[i];
5565                     dst = &pRExC_state->code_blocks[n];
5566                     dst->start      = src->start + offset;
5567                     dst->end        = src->end   + offset;
5568                     dst->block      = src->block;
5569                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5570                                             src->src_regex
5571                                                 ? src->src_regex
5572                                                 : (REGEXP*)rx);
5573                     n++;
5574                 }
5575             }
5576         }
5577     }
5578     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5579     if (alloced)
5580         SvSETMAGIC(pat);
5581
5582     return pat;
5583 }
5584
5585
5586
5587 /* see if there are any run-time code blocks in the pattern.
5588  * False positives are allowed */
5589
5590 static bool
5591 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5592                     char *pat, STRLEN plen)
5593 {
5594     int n = 0;
5595     STRLEN s;
5596
5597     for (s = 0; s < plen; s++) {
5598         if (n < pRExC_state->num_code_blocks
5599             && s == pRExC_state->code_blocks[n].start)
5600         {
5601             s = pRExC_state->code_blocks[n].end;
5602             n++;
5603             continue;
5604         }
5605         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5606          * positives here */
5607         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5608             (pat[s+2] == '{'
5609                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5610         )
5611             return 1;
5612     }
5613     return 0;
5614 }
5615
5616 /* Handle run-time code blocks. We will already have compiled any direct
5617  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5618  * copy of it, but with any literal code blocks blanked out and
5619  * appropriate chars escaped; then feed it into
5620  *
5621  *    eval "qr'modified_pattern'"
5622  *
5623  * For example,
5624  *
5625  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5626  *
5627  * becomes
5628  *
5629  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5630  *
5631  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5632  * and merge them with any code blocks of the original regexp.
5633  *
5634  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5635  * instead, just save the qr and return FALSE; this tells our caller that
5636  * the original pattern needs upgrading to utf8.
5637  */
5638
5639 static bool
5640 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5641     char *pat, STRLEN plen)
5642 {
5643     SV *qr;
5644
5645     GET_RE_DEBUG_FLAGS_DECL;
5646
5647     if (pRExC_state->runtime_code_qr) {
5648         /* this is the second time we've been called; this should
5649          * only happen if the main pattern got upgraded to utf8
5650          * during compilation; re-use the qr we compiled first time
5651          * round (which should be utf8 too)
5652          */
5653         qr = pRExC_state->runtime_code_qr;
5654         pRExC_state->runtime_code_qr = NULL;
5655         assert(RExC_utf8 && SvUTF8(qr));
5656     }
5657     else {
5658         int n = 0;
5659         STRLEN s;
5660         char *p, *newpat;
5661         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5662         SV *sv, *qr_ref;
5663         dSP;
5664
5665         /* determine how many extra chars we need for ' and \ escaping */
5666         for (s = 0; s < plen; s++) {
5667             if (pat[s] == '\'' || pat[s] == '\\')
5668                 newlen++;
5669         }
5670
5671         Newx(newpat, newlen, char);
5672         p = newpat;
5673         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5674
5675         for (s = 0; s < plen; s++) {
5676             if (n < pRExC_state->num_code_blocks
5677                 && s == pRExC_state->code_blocks[n].start)
5678             {
5679                 /* blank out literal code block */
5680                 assert(pat[s] == '(');
5681                 while (s <= pRExC_state->code_blocks[n].end) {
5682                     *p++ = '_';
5683                     s++;
5684                 }
5685                 s--;
5686                 n++;
5687                 continue;
5688             }
5689             if (pat[s] == '\'' || pat[s] == '\\')
5690                 *p++ = '\\';
5691             *p++ = pat[s];
5692         }
5693         *p++ = '\'';
5694         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5695             *p++ = 'x';
5696         *p++ = '\0';
5697         DEBUG_COMPILE_r({
5698             PerlIO_printf(Perl_debug_log,
5699                 "%sre-parsing pattern for runtime code:%s %s\n",
5700                 PL_colors[4],PL_colors[5],newpat);
5701         });
5702
5703         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5704         Safefree(newpat);
5705
5706         ENTER;
5707         SAVETMPS;
5708         save_re_context();
5709         PUSHSTACKi(PERLSI_REQUIRE);
5710         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5711          * parsing qr''; normally only q'' does this. It also alters
5712          * hints handling */
5713         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5714         SvREFCNT_dec_NN(sv);
5715         SPAGAIN;
5716         qr_ref = POPs;
5717         PUTBACK;
5718         {
5719             SV * const errsv = ERRSV;
5720             if (SvTRUE_NN(errsv))
5721             {
5722                 Safefree(pRExC_state->code_blocks);
5723                 /* use croak_sv ? */
5724                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5725             }
5726         }
5727         assert(SvROK(qr_ref));
5728         qr = SvRV(qr_ref);
5729         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5730         /* the leaving below frees the tmp qr_ref.
5731          * Give qr a life of its own */
5732         SvREFCNT_inc(qr);
5733         POPSTACK;
5734         FREETMPS;
5735         LEAVE;
5736
5737     }
5738
5739     if (!RExC_utf8 && SvUTF8(qr)) {
5740         /* first time through; the pattern got upgraded; save the
5741          * qr for the next time through */
5742         assert(!pRExC_state->runtime_code_qr);
5743         pRExC_state->runtime_code_qr = qr;
5744         return 0;
5745     }
5746
5747
5748     /* extract any code blocks within the returned qr//  */
5749
5750
5751     /* merge the main (r1) and run-time (r2) code blocks into one */
5752     {
5753         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5754         struct reg_code_block *new_block, *dst;
5755         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5756         int i1 = 0, i2 = 0;
5757
5758         if (!r2->num_code_blocks) /* we guessed wrong */
5759         {
5760             SvREFCNT_dec_NN(qr);
5761             return 1;
5762         }
5763
5764         Newx(new_block,
5765             r1->num_code_blocks + r2->num_code_blocks,
5766             struct reg_code_block);
5767         dst = new_block;
5768
5769         while (    i1 < r1->num_code_blocks
5770                 || i2 < r2->num_code_blocks)
5771         {
5772             struct reg_code_block *src;
5773             bool is_qr = 0;
5774
5775             if (i1 == r1->num_code_blocks) {
5776                 src = &r2->code_blocks[i2++];
5777                 is_qr = 1;
5778             }
5779             else if (i2 == r2->num_code_blocks)
5780                 src = &r1->code_blocks[i1++];
5781             else if (  r1->code_blocks[i1].start
5782                      < r2->code_blocks[i2].start)
5783             {
5784                 src = &r1->code_blocks[i1++];
5785                 assert(src->end < r2->code_blocks[i2].start);
5786             }
5787             else {
5788                 assert(  r1->code_blocks[i1].start
5789                        > r2->code_blocks[i2].start);
5790                 src = &r2->code_blocks[i2++];
5791                 is_qr = 1;
5792                 assert(src->end < r1->code_blocks[i1].start);
5793             }
5794
5795             assert(pat[src->start] == '(');
5796             assert(pat[src->end]   == ')');
5797             dst->start      = src->start;
5798             dst->end        = src->end;
5799             dst->block      = src->block;
5800             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5801                                     : src->src_regex;
5802             dst++;
5803         }
5804         r1->num_code_blocks += r2->num_code_blocks;
5805         Safefree(r1->code_blocks);
5806         r1->code_blocks = new_block;
5807     }
5808
5809     SvREFCNT_dec_NN(qr);
5810     return 1;
5811 }
5812
5813
5814 STATIC bool
5815 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5816                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5817 {
5818     /* This is the common code for setting up the floating and fixed length
5819      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5820      * as to whether succeeded or not */
5821
5822     I32 t;
5823     SSize_t ml;
5824
5825     if (! (longest_length
5826            || (eol /* Can't have SEOL and MULTI */
5827                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5828           )
5829             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5830         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5831     {
5832         return FALSE;
5833     }
5834
5835     /* copy the information about the longest from the reg_scan_data
5836         over to the program. */
5837     if (SvUTF8(sv_longest)) {
5838         *rx_utf8 = sv_longest;
5839         *rx_substr = NULL;
5840     } else {
5841         *rx_substr = sv_longest;
5842         *rx_utf8 = NULL;
5843     }
5844     /* end_shift is how many chars that must be matched that
5845         follow this item. We calculate it ahead of time as once the
5846         lookbehind offset is added in we lose the ability to correctly
5847         calculate it.*/
5848     ml = minlen ? *(minlen) : (SSize_t)longest_length;
5849     *rx_end_shift = ml - offset
5850         - longest_length + (SvTAIL(sv_longest) != 0)
5851         + lookbehind;
5852
5853     t = (eol/* Can't have SEOL and MULTI */
5854          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5855     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5856
5857     return TRUE;
5858 }
5859
5860 /*
5861  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5862  * regular expression into internal code.
5863  * The pattern may be passed either as:
5864  *    a list of SVs (patternp plus pat_count)
5865  *    a list of OPs (expr)
5866  * If both are passed, the SV list is used, but the OP list indicates
5867  * which SVs are actually pre-compiled code blocks
5868  *
5869  * The SVs in the list have magic and qr overloading applied to them (and
5870  * the list may be modified in-place with replacement SVs in the latter
5871  * case).
5872  *
5873  * If the pattern hasn't changed from old_re, then old_re will be
5874  * returned.
5875  *
5876  * eng is the current engine. If that engine has an op_comp method, then
5877  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5878  * do the initial concatenation of arguments and pass on to the external
5879  * engine.
5880  *
5881  * If is_bare_re is not null, set it to a boolean indicating whether the
5882  * arg list reduced (after overloading) to a single bare regex which has
5883  * been returned (i.e. /$qr/).
5884  *
5885  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5886  *
5887  * pm_flags contains the PMf_* flags, typically based on those from the
5888  * pm_flags field of the related PMOP. Currently we're only interested in
5889  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5890  *
5891  * We can't allocate space until we know how big the compiled form will be,
5892  * but we can't compile it (and thus know how big it is) until we've got a
5893  * place to put the code.  So we cheat:  we compile it twice, once with code
5894  * generation turned off and size counting turned on, and once "for real".
5895  * This also means that we don't allocate space until we are sure that the
5896  * thing really will compile successfully, and we never have to move the
5897  * code and thus invalidate pointers into it.  (Note that it has to be in
5898  * one piece because free() must be able to free it all.) [NB: not true in perl]
5899  *
5900  * Beware that the optimization-preparation code in here knows about some
5901  * of the structure of the compiled regexp.  [I'll say.]
5902  */
5903
5904 REGEXP *
5905 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5906                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
5907                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5908 {
5909     dVAR;
5910     REGEXP *rx;
5911     struct regexp *r;
5912     regexp_internal *ri;
5913     STRLEN plen;
5914     char *exp;
5915     regnode *scan;
5916     I32 flags;
5917     SSize_t minlen = 0;
5918     U32 rx_flags;
5919     SV *pat;
5920     SV *code_blocksv = NULL;
5921     SV** new_patternp = patternp;
5922
5923     /* these are all flags - maybe they should be turned
5924      * into a single int with different bit masks */
5925     I32 sawlookahead = 0;
5926     I32 sawplus = 0;
5927     I32 sawopen = 0;
5928     I32 sawminmod = 0;
5929
5930     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5931     bool recompile = 0;
5932     bool runtime_code = 0;
5933     scan_data_t data;
5934     RExC_state_t RExC_state;
5935     RExC_state_t * const pRExC_state = &RExC_state;
5936 #ifdef TRIE_STUDY_OPT
5937     int restudied = 0;
5938     RExC_state_t copyRExC_state;
5939 #endif
5940     GET_RE_DEBUG_FLAGS_DECL;
5941
5942     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5943
5944     DEBUG_r(if (!PL_colorset) reginitcolors());
5945
5946 #ifndef PERL_IN_XSUB_RE
5947     /* Initialize these here instead of as-needed, as is quick and avoids
5948      * having to test them each time otherwise */
5949     if (! PL_AboveLatin1) {
5950         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5951         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5952         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
5953
5954         PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5955         PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5956         PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5957
5958         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5959                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5960         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5961                                 = _new_invlist_C_array(PosixAlnum_invlist);
5962
5963         PL_L1Posix_ptrs[_CC_ALPHA]
5964                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5965         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5966
5967         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5968         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5969
5970         /* Cased is the same as Alpha in the ASCII range */
5971         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5972         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5973
5974         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5975         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5976
5977         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5978         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5979
5980         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5981         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5982
5983         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5984         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5985
5986         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5987         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5988
5989         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5990         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5991
5992         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5993         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5994         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5995         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5996
5997         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5998         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5999
6000         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
6001
6002         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
6003         PL_L1Posix_ptrs[_CC_WORDCHAR]
6004                                 = _new_invlist_C_array(L1PosixWord_invlist);
6005
6006         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
6007         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
6008
6009         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
6010     }
6011 #endif
6012
6013     pRExC_state->code_blocks = NULL;
6014     pRExC_state->num_code_blocks = 0;
6015
6016     if (is_bare_re)
6017         *is_bare_re = FALSE;
6018
6019     if (expr && (expr->op_type == OP_LIST ||
6020                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6021         /* allocate code_blocks if needed */
6022         OP *o;
6023         int ncode = 0;
6024
6025         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6026             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6027                 ncode++; /* count of DO blocks */
6028         if (ncode) {
6029             pRExC_state->num_code_blocks = ncode;
6030             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6031         }
6032     }
6033
6034     if (!pat_count) {
6035         /* compile-time pattern with just OP_CONSTs and DO blocks */
6036
6037         int n;
6038         OP *o;
6039
6040         /* find how many CONSTs there are */
6041         assert(expr);
6042         n = 0;
6043         if (expr->op_type == OP_CONST)
6044             n = 1;
6045         else
6046             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6047                 if (o->op_type == OP_CONST)
6048                     n++;
6049             }
6050
6051         /* fake up an SV array */
6052
6053         assert(!new_patternp);
6054         Newx(new_patternp, n, SV*);
6055         SAVEFREEPV(new_patternp);
6056         pat_count = n;
6057
6058         n = 0;
6059         if (expr->op_type == OP_CONST)
6060             new_patternp[n] = cSVOPx_sv(expr);
6061         else
6062             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6063                 if (o->op_type == OP_CONST)
6064                     new_patternp[n++] = cSVOPo_sv;
6065             }
6066
6067     }
6068
6069     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6070         "Assembling pattern from %d elements%s\n", pat_count,
6071             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6072
6073     /* set expr to the first arg op */
6074
6075     if (pRExC_state->num_code_blocks
6076          && expr->op_type != OP_CONST)
6077     {
6078             expr = cLISTOPx(expr)->op_first;
6079             assert(   expr->op_type == OP_PUSHMARK
6080                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6081                    || expr->op_type == OP_PADRANGE);
6082             expr = expr->op_sibling;
6083     }
6084
6085     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6086                         expr, &recompile, NULL);
6087
6088     /* handle bare (possibly after overloading) regex: foo =~ $re */
6089     {
6090         SV *re = pat;
6091         if (SvROK(re))
6092             re = SvRV(re);
6093         if (SvTYPE(re) == SVt_REGEXP) {
6094             if (is_bare_re)
6095                 *is_bare_re = TRUE;
6096             SvREFCNT_inc(re);
6097             Safefree(pRExC_state->code_blocks);
6098             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6099                 "Precompiled pattern%s\n",
6100                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6101
6102             return (REGEXP*)re;
6103         }
6104     }
6105
6106     exp = SvPV_nomg(pat, plen);
6107
6108     if (!eng->op_comp) {
6109         if ((SvUTF8(pat) && IN_BYTES)
6110                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6111         {
6112             /* make a temporary copy; either to convert to bytes,
6113              * or to avoid repeating get-magic / overloaded stringify */
6114             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6115                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6116         }
6117         Safefree(pRExC_state->code_blocks);
6118         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6119     }
6120
6121     /* ignore the utf8ness if the pattern is 0 length */
6122     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6123     RExC_uni_semantics = 0;
6124     RExC_contains_locale = 0;
6125     RExC_contains_i = 0;
6126     pRExC_state->runtime_code_qr = NULL;
6127
6128     DEBUG_COMPILE_r({
6129             SV *dsv= sv_newmortal();
6130             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6131             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6132                           PL_colors[4],PL_colors[5],s);
6133         });
6134
6135   redo_first_pass:
6136     /* we jump here if we upgrade the pattern to utf8 and have to
6137      * recompile */
6138
6139     if ((pm_flags & PMf_USE_RE_EVAL)
6140                 /* this second condition covers the non-regex literal case,
6141                  * i.e.  $foo =~ '(?{})'. */
6142                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6143     )
6144         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6145
6146     /* return old regex if pattern hasn't changed */
6147     /* XXX: note in the below we have to check the flags as well as the pattern.
6148      *
6149      * Things get a touch tricky as we have to compare the utf8 flag independently
6150      * from the compile flags.
6151      */
6152
6153     if (   old_re
6154         && !recompile
6155         && !!RX_UTF8(old_re) == !!RExC_utf8
6156         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6157         && RX_PRECOMP(old_re)
6158         && RX_PRELEN(old_re) == plen
6159         && memEQ(RX_PRECOMP(old_re), exp, plen)
6160         && !runtime_code /* with runtime code, always recompile */ )
6161     {
6162         Safefree(pRExC_state->code_blocks);
6163         return old_re;
6164     }
6165
6166     rx_flags = orig_rx_flags;
6167
6168     if (rx_flags & PMf_FOLD) {
6169         RExC_contains_i = 1;
6170     }
6171     if (initial_charset == REGEX_LOCALE_CHARSET) {
6172         RExC_contains_locale = 1;
6173     }
6174     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6175
6176         /* Set to use unicode semantics if the pattern is in utf8 and has the
6177          * 'depends' charset specified, as it means unicode when utf8  */
6178         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6179     }
6180
6181     RExC_precomp = exp;
6182     RExC_flags = rx_flags;
6183     RExC_pm_flags = pm_flags;
6184
6185     if (runtime_code) {
6186         if (TAINTING_get && TAINT_get)
6187             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6188
6189         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6190             /* whoops, we have a non-utf8 pattern, whilst run-time code
6191              * got compiled as utf8. Try again with a utf8 pattern */
6192             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6193                                     pRExC_state->num_code_blocks);
6194             goto redo_first_pass;
6195         }
6196     }
6197     assert(!pRExC_state->runtime_code_qr);
6198
6199     RExC_sawback = 0;
6200
6201     RExC_seen = 0;
6202     RExC_in_lookbehind = 0;
6203     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6204     RExC_extralen = 0;
6205     RExC_override_recoding = 0;
6206     RExC_in_multi_char_class = 0;
6207
6208     /* First pass: determine size, legality. */
6209     RExC_parse = exp;
6210     RExC_start = exp;
6211     RExC_end = exp + plen;
6212     RExC_naughty = 0;
6213     RExC_npar = 1;
6214     RExC_nestroot = 0;
6215     RExC_size = 0L;
6216     RExC_emit = (regnode *) &RExC_emit_dummy;
6217     RExC_whilem_seen = 0;
6218     RExC_open_parens = NULL;
6219     RExC_close_parens = NULL;
6220     RExC_opend = NULL;
6221     RExC_paren_names = NULL;
6222 #ifdef DEBUGGING
6223     RExC_paren_name_list = NULL;
6224 #endif
6225     RExC_recurse = NULL;
6226     RExC_study_chunk_recursed = NULL;
6227     RExC_study_chunk_recursed_bytes= 0;
6228     RExC_recurse_count = 0;
6229     pRExC_state->code_index = 0;
6230
6231 #if 0 /* REGC() is (currently) a NOP at the first pass.
6232        * Clever compilers notice this and complain. --jhi */
6233     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6234 #endif
6235     DEBUG_PARSE_r(
6236         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6237         RExC_lastnum=0;
6238         RExC_lastparse=NULL;
6239     );
6240     /* reg may croak on us, not giving us a chance to free
6241        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6242        need it to survive as long as the regexp (qr/(?{})/).
6243        We must check that code_blocksv is not already set, because we may
6244        have jumped back to restart the sizing pass. */
6245     if (pRExC_state->code_blocks && !code_blocksv) {
6246         code_blocksv = newSV_type(SVt_PV);
6247         SAVEFREESV(code_blocksv);
6248         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6249         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6250     }
6251     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6252         /* It's possible to write a regexp in ascii that represents Unicode
6253         codepoints outside of the byte range, such as via \x{100}. If we
6254         detect such a sequence we have to convert the entire pattern to utf8
6255         and then recompile, as our sizing calculation will have been based
6256         on 1 byte == 1 character, but we will need to use utf8 to encode
6257         at least some part of the pattern, and therefore must convert the whole
6258         thing.
6259         -- dmq */
6260         if (flags & RESTART_UTF8) {
6261             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6262                                     pRExC_state->num_code_blocks);
6263             goto redo_first_pass;
6264         }
6265         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6266     }
6267     if (code_blocksv)
6268         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6269
6270     DEBUG_PARSE_r({
6271         PerlIO_printf(Perl_debug_log,
6272             "Required size %"IVdf" nodes\n"
6273             "Starting second pass (creation)\n",
6274             (IV)RExC_size);
6275         RExC_lastnum=0;
6276         RExC_lastparse=NULL;
6277     });
6278
6279     /* The first pass could have found things that force Unicode semantics */
6280     if ((RExC_utf8 || RExC_uni_semantics)
6281          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6282     {
6283         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6284     }
6285
6286     /* Small enough for pointer-storage convention?
6287        If extralen==0, this means that we will not need long jumps. */
6288     if (RExC_size >= 0x10000L && RExC_extralen)
6289         RExC_size += RExC_extralen;
6290     else
6291         RExC_extralen = 0;
6292     if (RExC_whilem_seen > 15)
6293         RExC_whilem_seen = 15;
6294
6295     /* Allocate space and zero-initialize. Note, the two step process
6296        of zeroing when in debug mode, thus anything assigned has to
6297        happen after that */
6298     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6299     r = ReANY(rx);
6300     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6301          char, regexp_internal);
6302     if ( r == NULL || ri == NULL )
6303         FAIL("Regexp out of space");
6304 #ifdef DEBUGGING
6305     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6306     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
6307 #else
6308     /* bulk initialize base fields with 0. */
6309     Zero(ri, sizeof(regexp_internal), char);
6310 #endif
6311
6312     /* non-zero initialization begins here */
6313     RXi_SET( r, ri );
6314     r->engine= eng;
6315     r->extflags = rx_flags;
6316     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6317
6318     if (pm_flags & PMf_IS_QR) {
6319         ri->code_blocks = pRExC_state->code_blocks;
6320         ri->num_code_blocks = pRExC_state->num_code_blocks;
6321     }
6322     else
6323     {
6324         int n;
6325         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6326             if (pRExC_state->code_blocks[n].src_regex)
6327                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6328         SAVEFREEPV(pRExC_state->code_blocks);
6329     }
6330
6331     {
6332         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6333         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
6334
6335         /* The caret is output if there are any defaults: if not all the STD
6336          * flags are set, or if no character set specifier is needed */
6337         bool has_default =
6338                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6339                     || ! has_charset);
6340         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
6341         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6342                             >> RXf_PMf_STD_PMMOD_SHIFT);
6343         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6344         char *p;
6345         /* Allocate for the worst case, which is all the std flags are turned
6346          * on.  If more precision is desired, we could do a population count of
6347          * the flags set.  This could be done with a small lookup table, or by
6348          * shifting, masking and adding, or even, when available, assembly
6349          * language for a machine-language population count.
6350          * We never output a minus, as all those are defaults, so are
6351          * covered by the caret */
6352         const STRLEN wraplen = plen + has_p + has_runon
6353             + has_default       /* If needs a caret */
6354
6355                 /* If needs a character set specifier */
6356             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6357             + (sizeof(STD_PAT_MODS) - 1)
6358             + (sizeof("(?:)") - 1);
6359
6360         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6361         r->xpv_len_u.xpvlenu_pv = p;
6362         if (RExC_utf8)
6363             SvFLAGS(rx) |= SVf_UTF8;
6364         *p++='('; *p++='?';
6365
6366         /* If a default, cover it using the caret */
6367         if (has_default) {
6368             *p++= DEFAULT_PAT_MOD;
6369         }
6370         if (has_charset) {
6371             STRLEN len;
6372             const char* const name = get_regex_charset_name(r->extflags, &len);
6373             Copy(name, p, len, char);
6374             p += len;
6375         }
6376         if (has_p)
6377             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6378         {
6379             char ch;
6380             while((ch = *fptr++)) {
6381                 if(reganch & 1)
6382                     *p++ = ch;
6383                 reganch >>= 1;
6384             }
6385         }
6386
6387         *p++ = ':';
6388         Copy(RExC_precomp, p, plen, char);
6389         assert ((RX_WRAPPED(rx) - p) < 16);
6390         r->pre_prefix = p - RX_WRAPPED(rx);
6391         p += plen;
6392         if (has_runon)
6393             *p++ = '\n';
6394         *p++ = ')';
6395         *p = 0;
6396         SvCUR_set(rx, p - RX_WRAPPED(rx));
6397     }
6398
6399     r->intflags = 0;
6400     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6401
6402     /* setup various meta data about recursion, this all requires
6403      * RExC_npar to be correctly set, and a bit later on we clear it */
6404     if (RExC_seen & REG_SEEN_RECURSE) {
6405         Newxz(RExC_open_parens, RExC_npar,regnode *);
6406         SAVEFREEPV(RExC_open_parens);
6407         Newxz(RExC_close_parens,RExC_npar,regnode *);
6408         SAVEFREEPV(RExC_close_parens);
6409     }
6410     if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) {
6411         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6412          * So its 1 if there are no parens. */
6413         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6414                                          ((RExC_npar & 0x07) != 0);
6415         Newx(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6416         SAVEFREEPV(RExC_study_chunk_recursed);
6417     }
6418
6419     /* Useful during FAIL. */
6420 #ifdef RE_TRACK_PATTERN_OFFSETS
6421     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6422     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6423                           "%s %"UVuf" bytes for offset annotations.\n",
6424                           ri->u.offsets ? "Got" : "Couldn't get",
6425                           (UV)((2*RExC_size+1) * sizeof(U32))));
6426 #endif
6427     SetProgLen(ri,RExC_size);
6428     RExC_rx_sv = rx;
6429     RExC_rx = r;
6430     RExC_rxi = ri;
6431
6432     /* Second pass: emit code. */
6433     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6434     RExC_pm_flags = pm_flags;
6435     RExC_parse = exp;
6436     RExC_end = exp + plen;
6437     RExC_naughty = 0;
6438     RExC_npar = 1;
6439     RExC_emit_start = ri->program;
6440     RExC_emit = ri->program;
6441     RExC_emit_bound = ri->program + RExC_size + 1;
6442     pRExC_state->code_index = 0;
6443
6444     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6445     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6446         ReREFCNT_dec(rx);
6447         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6448     }
6449     /* XXXX To minimize changes to RE engine we always allocate
6450        3-units-long substrs field. */
6451     Newx(r->substrs, 1, struct reg_substr_data);
6452     if (RExC_recurse_count) {
6453         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6454         SAVEFREEPV(RExC_recurse);
6455     }
6456
6457 reStudy:
6458     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6459     Zero(r->substrs, 1, struct reg_substr_data);
6460     if (RExC_study_chunk_recursed)
6461         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6462
6463 #ifdef TRIE_STUDY_OPT
6464     if (!restudied) {
6465         StructCopy(&zero_scan_data, &data, scan_data_t);
6466         copyRExC_state = RExC_state;
6467     } else {
6468         U32 seen=RExC_seen;
6469         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6470
6471         RExC_state = copyRExC_state;
6472         if (seen & REG_TOP_LEVEL_BRANCHES)
6473             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6474         else
6475             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6476         StructCopy(&zero_scan_data, &data, scan_data_t);
6477     }
6478 #else
6479     StructCopy(&zero_scan_data, &data, scan_data_t);
6480 #endif
6481
6482     /* Dig out information for optimizations. */
6483     r->extflags = RExC_flags; /* was pm_op */
6484     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6485
6486     if (UTF)
6487         SvUTF8_on(rx);  /* Unicode in it? */
6488     ri->regstclass = NULL;
6489     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6490         r->intflags |= PREGf_NAUGHTY;
6491     scan = ri->program + 1;             /* First BRANCH. */
6492
6493     /* testing for BRANCH here tells us whether there is "must appear"
6494        data in the pattern. If there is then we can use it for optimisations */
6495     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6496         SSize_t fake;
6497         STRLEN longest_float_length, longest_fixed_length;
6498         regnode_ssc ch_class; /* pointed to by data */
6499         int stclass_flag;
6500         SSize_t last_close = 0; /* pointed to by data */
6501         regnode *first= scan;
6502         regnode *first_next= regnext(first);
6503         /*
6504          * Skip introductions and multiplicators >= 1
6505          * so that we can extract the 'meat' of the pattern that must
6506          * match in the large if() sequence following.
6507          * NOTE that EXACT is NOT covered here, as it is normally
6508          * picked up by the optimiser separately.
6509          *
6510          * This is unfortunate as the optimiser isnt handling lookahead
6511          * properly currently.
6512          *
6513          */
6514         while ((OP(first) == OPEN && (sawopen = 1)) ||
6515                /* An OR of *one* alternative - should not happen now. */
6516             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6517             /* for now we can't handle lookbehind IFMATCH*/
6518             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6519             (OP(first) == PLUS) ||
6520             (OP(first) == MINMOD) ||
6521                /* An {n,m} with n>0 */
6522             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6523             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6524         {
6525                 /*
6526                  * the only op that could be a regnode is PLUS, all the rest
6527                  * will be regnode_1 or regnode_2.
6528                  *
6529                  * (yves doesn't think this is true)
6530                  */
6531                 if (OP(first) == PLUS)
6532                     sawplus = 1;
6533                 else {
6534                     if (OP(first) == MINMOD)
6535                         sawminmod = 1;
6536                     first += regarglen[OP(first)];
6537                 }
6538                 first = NEXTOPER(first);
6539                 first_next= regnext(first);
6540         }
6541
6542         /* Starting-point info. */
6543       again:
6544         DEBUG_PEEP("first:",first,0);
6545         /* Ignore EXACT as we deal with it later. */
6546         if (PL_regkind[OP(first)] == EXACT) {
6547             if (OP(first) == EXACT)
6548                 NOOP;   /* Empty, get anchored substr later. */
6549             else
6550                 ri->regstclass = first;
6551         }
6552 #ifdef TRIE_STCLASS
6553         else if (PL_regkind[OP(first)] == TRIE &&
6554                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6555         {
6556             regnode *trie_op;
6557             /* this can happen only on restudy */
6558             if ( OP(first) == TRIE ) {
6559                 struct regnode_1 *trieop = (struct regnode_1 *)
6560                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6561                 StructCopy(first,trieop,struct regnode_1);
6562                 trie_op=(regnode *)trieop;
6563             } else {
6564                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6565                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6566                 StructCopy(first,trieop,struct regnode_charclass);
6567                 trie_op=(regnode *)trieop;
6568             }
6569             OP(trie_op)+=2;
6570             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6571             ri->regstclass = trie_op;
6572         }
6573 #endif
6574         else if (REGNODE_SIMPLE(OP(first)))
6575             ri->regstclass = first;
6576         else if (PL_regkind[OP(first)] == BOUND ||
6577                  PL_regkind[OP(first)] == NBOUND)
6578             ri->regstclass = first;
6579         else if (PL_regkind[OP(first)] == BOL) {
6580             r->extflags |= (OP(first) == MBOL
6581                            ? RXf_ANCH_MBOL
6582                            : (OP(first) == SBOL
6583                               ? RXf_ANCH_SBOL
6584                               : RXf_ANCH_BOL));
6585             first = NEXTOPER(first);
6586             goto again;
6587         }
6588         else if (OP(first) == GPOS) {
6589             r->extflags |= RXf_ANCH_GPOS;
6590             first = NEXTOPER(first);
6591             goto again;
6592         }
6593         else if ((!sawopen || !RExC_sawback) &&
6594             (OP(first) == STAR &&
6595             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6596             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6597         {
6598             /* turn .* into ^.* with an implied $*=1 */
6599             const int type =
6600                 (OP(NEXTOPER(first)) == REG_ANY)
6601                     ? RXf_ANCH_MBOL
6602                     : RXf_ANCH_SBOL;
6603             r->extflags |= type;
6604             r->intflags |= PREGf_IMPLICIT;
6605             first = NEXTOPER(first);
6606             goto again;
6607         }
6608         if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6609             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6610             /* x+ must match at the 1st pos of run of x's */
6611             r->intflags |= PREGf_SKIP;
6612
6613         /* Scan is after the zeroth branch, first is atomic matcher. */
6614 #ifdef TRIE_STUDY_OPT
6615         DEBUG_PARSE_r(
6616             if (!restudied)
6617                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6618                               (IV)(first - scan + 1))
6619         );
6620 #else
6621         DEBUG_PARSE_r(
6622             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6623                 (IV)(first - scan + 1))
6624         );
6625 #endif
6626
6627
6628         /*
6629         * If there's something expensive in the r.e., find the
6630         * longest literal string that must appear and make it the
6631         * regmust.  Resolve ties in favor of later strings, since
6632         * the regstart check works with the beginning of the r.e.
6633         * and avoiding duplication strengthens checking.  Not a
6634         * strong reason, but sufficient in the absence of others.
6635         * [Now we resolve ties in favor of the earlier string if
6636         * it happens that c_offset_min has been invalidated, since the
6637         * earlier string may buy us something the later one won't.]
6638         */
6639
6640         data.longest_fixed = newSVpvs("");
6641         data.longest_float = newSVpvs("");
6642         data.last_found = newSVpvs("");
6643         data.longest = &(data.longest_fixed);
6644         ENTER_with_name("study_chunk");
6645         SAVEFREESV(data.longest_fixed);
6646         SAVEFREESV(data.longest_float);
6647         SAVEFREESV(data.last_found);
6648         first = scan;
6649         if (!ri->regstclass) {
6650             ssc_init(pRExC_state, &ch_class);
6651             data.start_class = &ch_class;
6652             stclass_flag = SCF_DO_STCLASS_AND;
6653         } else                          /* XXXX Check for BOUND? */
6654             stclass_flag = 0;
6655         data.last_closep = &last_close;
6656
6657         DEBUG_RExC_seen();
6658         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6659             &data, -1, 0, NULL,
6660             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6661                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6662             0);
6663
6664
6665         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6666
6667
6668         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6669              && data.last_start_min == 0 && data.last_end > 0
6670              && !RExC_seen_zerolen
6671              && !(RExC_seen & REG_SEEN_VERBARG)
6672              && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6673             r->extflags |= RXf_CHECK_ALL;
6674         scan_commit(pRExC_state, &data,&minlen,0);
6675
6676         longest_float_length = CHR_SVLEN(data.longest_float);
6677
6678         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6679                    && data.offset_fixed == data.offset_float_min
6680                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6681             && S_setup_longest (aTHX_ pRExC_state,
6682                                     data.longest_float,
6683                                     &(r->float_utf8),
6684                                     &(r->float_substr),
6685                                     &(r->float_end_shift),
6686                                     data.lookbehind_float,
6687                                     data.offset_float_min,
6688                                     data.minlen_float,
6689                                     longest_float_length,
6690                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6691                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6692         {
6693             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6694             r->float_max_offset = data.offset_float_max;
6695             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6696                 r->float_max_offset -= data.lookbehind_float;
6697             SvREFCNT_inc_simple_void_NN(data.longest_float);
6698         }
6699         else {
6700             r->float_substr = r->float_utf8 = NULL;
6701             longest_float_length = 0;
6702         }
6703
6704         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6705
6706         if (S_setup_longest (aTHX_ pRExC_state,
6707                                 data.longest_fixed,
6708                                 &(r->anchored_utf8),
6709                                 &(r->anchored_substr),
6710                                 &(r->anchored_end_shift),
6711                                 data.lookbehind_fixed,
6712                                 data.offset_fixed,
6713                                 data.minlen_fixed,
6714                                 longest_fixed_length,
6715                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6716                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6717         {
6718             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6719             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6720         }
6721         else {
6722             r->anchored_substr = r->anchored_utf8 = NULL;
6723             longest_fixed_length = 0;
6724         }
6725         LEAVE_with_name("study_chunk");
6726
6727         if (ri->regstclass
6728             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6729             ri->regstclass = NULL;
6730
6731         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6732             && stclass_flag
6733             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6734             && !ssc_is_anything(data.start_class))
6735         {
6736             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6737
6738             ssc_finalize(pRExC_state, data.start_class);
6739
6740             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6741             StructCopy(data.start_class,
6742                        (regnode_ssc*)RExC_rxi->data->data[n],
6743                        regnode_ssc);
6744             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6745             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6746             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6747                       regprop(r, sv, (regnode*)data.start_class);
6748                       PerlIO_printf(Perl_debug_log,
6749                                     "synthetic stclass \"%s\".\n",
6750                                     SvPVX_const(sv));});
6751             data.start_class = NULL;
6752         }
6753
6754         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6755         if (longest_fixed_length > longest_float_length) {
6756             r->check_end_shift = r->anchored_end_shift;
6757             r->check_substr = r->anchored_substr;
6758             r->check_utf8 = r->anchored_utf8;
6759             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6760             if (r->extflags & RXf_ANCH_SINGLE)
6761                 r->extflags |= RXf_NOSCAN;
6762         }
6763         else {
6764             r->check_end_shift = r->float_end_shift;
6765             r->check_substr = r->float_substr;
6766             r->check_utf8 = r->float_utf8;
6767             r->check_offset_min = r->float_min_offset;
6768             r->check_offset_max = r->float_max_offset;
6769         }
6770         if ((r->check_substr || r->check_utf8) ) {
6771             r->extflags |= RXf_USE_INTUIT;
6772             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6773                 r->extflags |= RXf_INTUIT_TAIL;
6774         }
6775         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6776         if ( (STRLEN)minlen < longest_float_length )
6777             minlen= longest_float_length;
6778         if ( (STRLEN)minlen < longest_fixed_length )
6779             minlen= longest_fixed_length;
6780         */
6781     }
6782     else {
6783         /* Several toplevels. Best we can is to set minlen. */
6784         SSize_t fake;
6785         regnode_ssc ch_class;
6786         SSize_t last_close = 0;
6787
6788         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6789
6790         scan = ri->program + 1;
6791         ssc_init(pRExC_state, &ch_class);
6792         data.start_class = &ch_class;
6793         data.last_closep = &last_close;
6794
6795         DEBUG_RExC_seen();
6796         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6797             &data, -1, 0, NULL,
6798             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6799                               |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6800             0);
6801
6802         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6803
6804         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6805                 = r->float_substr = r->float_utf8 = NULL;
6806
6807         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6808             && ! ssc_is_anything(data.start_class))
6809         {
6810             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6811
6812             ssc_finalize(pRExC_state, data.start_class);
6813
6814             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6815             StructCopy(data.start_class,
6816                        (regnode_ssc*)RExC_rxi->data->data[n],
6817                        regnode_ssc);
6818             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6819             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6820             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6821                       regprop(r, sv, (regnode*)data.start_class);
6822                       PerlIO_printf(Perl_debug_log,
6823                                     "synthetic stclass \"%s\".\n",
6824                                     SvPVX_const(sv));});
6825             data.start_class = NULL;
6826         }
6827     }
6828
6829     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6830        the "real" pattern. */
6831     DEBUG_OPTIMISE_r({
6832         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6833                       (IV)minlen, (IV)r->minlen);
6834     });
6835     r->minlenret = minlen;
6836     if (r->minlen < minlen)
6837         r->minlen = minlen;
6838
6839     if (RExC_seen & REG_SEEN_GPOS)
6840         r->extflags |= RXf_GPOS_SEEN;
6841     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6842         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6843     if (pRExC_state->num_code_blocks)
6844         r->extflags |= RXf_EVAL_SEEN;
6845     if (RExC_seen & REG_SEEN_CANY)
6846         r->extflags |= RXf_CANY_SEEN;
6847     if (RExC_seen & REG_SEEN_VERBARG)
6848     {
6849         r->intflags |= PREGf_VERBARG_SEEN;
6850         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6851     }
6852     if (RExC_seen & REG_SEEN_CUTGROUP)
6853         r->intflags |= PREGf_CUTGROUP_SEEN;
6854     if (pm_flags & PMf_USE_RE_EVAL)
6855         r->intflags |= PREGf_USE_RE_EVAL;
6856     if (RExC_paren_names)
6857         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6858     else
6859         RXp_PAREN_NAMES(r) = NULL;
6860
6861     {
6862         regnode *first = ri->program + 1;
6863         U8 fop = OP(first);
6864         regnode *next = NEXTOPER(first);
6865         U8 nop = OP(next);
6866
6867         if (PL_regkind[fop] == NOTHING && nop == END)
6868             r->extflags |= RXf_NULL;
6869         else if (PL_regkind[fop] == BOL && nop == END)
6870             r->extflags |= RXf_START_ONLY;
6871         else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6872             r->extflags |= RXf_WHITE;
6873         else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6874             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6875
6876     }
6877 #ifdef DEBUGGING
6878     if (RExC_paren_names) {
6879         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
6880         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6881     } else
6882 #endif
6883         ri->name_list_idx = 0;
6884
6885     if (RExC_recurse_count) {
6886         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6887             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6888             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6889         }
6890     }
6891     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6892     /* assume we don't need to swap parens around before we match */
6893
6894     DEBUG_DUMP_r({
6895         DEBUG_RExC_seen();
6896         PerlIO_printf(Perl_debug_log,"Final program:\n");
6897         regdump(r);
6898     });
6899 #ifdef RE_TRACK_PATTERN_OFFSETS
6900     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6901         const STRLEN len = ri->u.offsets[0];
6902         STRLEN i;
6903         GET_RE_DEBUG_FLAGS_DECL;
6904         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6905         for (i = 1; i <= len; i++) {
6906             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6907                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6908                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6909             }
6910         PerlIO_printf(Perl_debug_log, "\n");
6911     });
6912 #endif
6913
6914 #ifdef USE_ITHREADS
6915     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6916      * by setting the regexp SV to readonly-only instead. If the
6917      * pattern's been recompiled, the USEDness should remain. */
6918     if (old_re && SvREADONLY(old_re))
6919         SvREADONLY_on(rx);
6920 #endif
6921     return rx;
6922 }
6923
6924
6925 SV*
6926 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6927                     const U32 flags)
6928 {
6929     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6930
6931     PERL_UNUSED_ARG(value);
6932
6933     if (flags & RXapif_FETCH) {
6934         return reg_named_buff_fetch(rx, key, flags);
6935     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6936         Perl_croak_no_modify();
6937         return NULL;
6938     } else if (flags & RXapif_EXISTS) {
6939         return reg_named_buff_exists(rx, key, flags)
6940             ? &PL_sv_yes
6941             : &PL_sv_no;
6942     } else if (flags & RXapif_REGNAMES) {
6943         return reg_named_buff_all(rx, flags);
6944     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6945         return reg_named_buff_scalar(rx, flags);
6946     } else {
6947         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6948         return NULL;
6949     }
6950 }
6951
6952 SV*
6953 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6954                          const U32 flags)
6955 {
6956     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6957     PERL_UNUSED_ARG(lastkey);
6958
6959     if (flags & RXapif_FIRSTKEY)
6960         return reg_named_buff_firstkey(rx, flags);
6961     else if (flags & RXapif_NEXTKEY)
6962         return reg_named_buff_nextkey(rx, flags);
6963     else {
6964         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6965         return NULL;
6966     }
6967 }
6968
6969 SV*
6970 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6971                           const U32 flags)
6972 {
6973     AV *retarray = NULL;
6974     SV *ret;
6975     struct regexp *const rx = ReANY(r);
6976
6977     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6978
6979     if (flags & RXapif_ALL)
6980         retarray=newAV();
6981
6982     if (rx && RXp_PAREN_NAMES(rx)) {
6983         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6984         if (he_str) {
6985             IV i;
6986             SV* sv_dat=HeVAL(he_str);
6987             I32 *nums=(I32*)SvPVX(sv_dat);
6988             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6989                 if ((I32)(rx->nparens) >= nums[i]
6990                     && rx->offs[nums[i]].start != -1
6991                     && rx->offs[nums[i]].end != -1)
6992                 {
6993                     ret = newSVpvs("");
6994                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6995                     if (!retarray)
6996                         return ret;
6997                 } else {
6998                     if (retarray)
6999                         ret = newSVsv(&PL_sv_undef);
7000                 }
7001                 if (retarray)
7002                     av_push(retarray, ret);
7003             }
7004             if (retarray)
7005                 return newRV_noinc(MUTABLE_SV(retarray));
7006         }
7007     }
7008     return NULL;
7009 }
7010
7011 bool
7012 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7013                            const U32 flags)
7014 {
7015     struct regexp *const rx = ReANY(r);
7016
7017     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7018
7019     if (rx && RXp_PAREN_NAMES(rx)) {
7020         if (flags & RXapif_ALL) {
7021             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7022         } else {
7023             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7024             if (sv) {
7025                 SvREFCNT_dec_NN(sv);
7026                 return TRUE;
7027             } else {
7028                 return FALSE;
7029             }
7030         }
7031     } else {
7032         return FALSE;
7033     }
7034 }
7035
7036 SV*
7037 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7038 {
7039     struct regexp *const rx = ReANY(r);
7040
7041     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7042
7043     if ( rx && RXp_PAREN_NAMES(rx) ) {
7044         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7045
7046         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7047     } else {
7048         return FALSE;
7049     }
7050 }
7051
7052 SV*
7053 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7054 {
7055     struct regexp *const rx = ReANY(r);
7056     GET_RE_DEBUG_FLAGS_DECL;
7057
7058     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7059
7060     if (rx && RXp_PAREN_NAMES(rx)) {
7061         HV *hv = RXp_PAREN_NAMES(rx);
7062         HE *temphe;
7063         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7064             IV i;
7065             IV parno = 0;
7066             SV* sv_dat = HeVAL(temphe);
7067             I32 *nums = (I32*)SvPVX(sv_dat);
7068             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7069                 if ((I32)(rx->lastparen) >= nums[i] &&
7070                     rx->offs[nums[i]].start != -1 &&
7071                     rx->offs[nums[i]].end != -1)
7072                 {
7073                     parno = nums[i];
7074                     break;
7075                 }
7076             }
7077             if (parno || flags & RXapif_ALL) {
7078                 return newSVhek(HeKEY_hek(temphe));
7079             }
7080         }
7081     }
7082     return NULL;
7083 }
7084
7085 SV*
7086 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7087 {
7088     SV *ret;
7089     AV *av;
7090     SSize_t length;
7091     struct regexp *const rx = ReANY(r);
7092
7093     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7094
7095     if (rx && RXp_PAREN_NAMES(rx)) {
7096         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7097             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7098         } else if (flags & RXapif_ONE) {
7099             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7100             av = MUTABLE_AV(SvRV(ret));
7101             length = av_len(av);
7102             SvREFCNT_dec_NN(ret);
7103             return newSViv(length + 1);
7104         } else {
7105             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
7106             return NULL;
7107         }
7108     }
7109     return &PL_sv_undef;
7110 }
7111
7112 SV*
7113 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7114 {
7115     struct regexp *const rx = ReANY(r);
7116     AV *av = newAV();
7117
7118     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7119
7120     if (rx && RXp_PAREN_NAMES(rx)) {
7121         HV *hv= RXp_PAREN_NAMES(rx);
7122         HE *temphe;
7123         (void)hv_iterinit(hv);
7124         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7125             IV i;
7126             IV parno = 0;
7127             SV* sv_dat = HeVAL(temphe);
7128             I32 *nums = (I32*)SvPVX(sv_dat);
7129             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7130                 if ((I32)(rx->lastparen) >= nums[i] &&
7131                     rx->offs[nums[i]].start != -1 &&
7132                     rx->offs[nums[i]].end != -1)
7133                 {
7134                     parno = nums[i];
7135                     break;
7136                 }
7137             }
7138             if (parno || flags & RXapif_ALL) {
7139                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7140             }
7141         }
7142     }
7143
7144     return newRV_noinc(MUTABLE_SV(av));
7145 }
7146
7147 void
7148 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7149                              SV * const sv)
7150 {
7151     struct regexp *const rx = ReANY(r);
7152     char *s = NULL;
7153     SSize_t i = 0;
7154     SSize_t s1, t1;
7155     I32 n = paren;
7156
7157     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7158
7159     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7160            || n == RX_BUFF_IDX_CARET_FULLMATCH
7161            || n == RX_BUFF_IDX_CARET_POSTMATCH
7162        )
7163     {
7164         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7165         if (!keepcopy) {
7166             /* on something like
7167              *    $r = qr/.../;
7168              *    /$qr/p;
7169              * the KEEPCOPY is set on the PMOP rather than the regex */
7170             if (PL_curpm && r == PM_GETRE(PL_curpm))
7171                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7172         }
7173         if (!keepcopy)
7174             goto ret_undef;
7175     }
7176
7177     if (!rx->subbeg)
7178         goto ret_undef;
7179
7180     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7181         /* no need to distinguish between them any more */
7182         n = RX_BUFF_IDX_FULLMATCH;
7183
7184     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7185         && rx->offs[0].start != -1)
7186     {
7187         /* $`, ${^PREMATCH} */
7188         i = rx->offs[0].start;
7189         s = rx->subbeg;
7190     }
7191     else
7192     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7193         && rx->offs[0].end != -1)
7194     {
7195         /* $', ${^POSTMATCH} */
7196         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7197         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7198     }
7199     else
7200     if ( 0 <= n && n <= (I32)rx->nparens &&
7201         (s1 = rx->offs[n].start) != -1 &&
7202         (t1 = rx->offs[n].end) != -1)
7203     {
7204         /* $&, ${^MATCH},  $1 ... */
7205         i = t1 - s1;
7206         s = rx->subbeg + s1 - rx->suboffset;
7207     } else {
7208         goto ret_undef;
7209     }
7210
7211     assert(s >= rx->subbeg);
7212     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7213     if (i >= 0) {
7214 #if NO_TAINT_SUPPORT
7215         sv_setpvn(sv, s, i);
7216 #else
7217         const int oldtainted = TAINT_get;
7218         TAINT_NOT;
7219         sv_setpvn(sv, s, i);
7220         TAINT_set(oldtainted);
7221 #endif
7222         if ( (rx->extflags & RXf_CANY_SEEN)
7223             ? (RXp_MATCH_UTF8(rx)
7224                         && (!i || is_utf8_string((U8*)s, i)))
7225             : (RXp_MATCH_UTF8(rx)) )
7226         {
7227             SvUTF8_on(sv);
7228         }
7229         else
7230             SvUTF8_off(sv);
7231         if (TAINTING_get) {
7232             if (RXp_MATCH_TAINTED(rx)) {
7233                 if (SvTYPE(sv) >= SVt_PVMG) {
7234                     MAGIC* const mg = SvMAGIC(sv);
7235                     MAGIC* mgt;
7236                     TAINT;
7237                     SvMAGIC_set(sv, mg->mg_moremagic);
7238                     SvTAINT(sv);
7239                     if ((mgt = SvMAGIC(sv))) {
7240                         mg->mg_moremagic = mgt;
7241                         SvMAGIC_set(sv, mg);
7242                     }
7243                 } else {
7244                     TAINT;
7245                     SvTAINT(sv);
7246                 }
7247             } else
7248                 SvTAINTED_off(sv);
7249         }
7250     } else {
7251       ret_undef:
7252         sv_setsv(sv,&PL_sv_undef);
7253         return;
7254     }
7255 }
7256
7257 void
7258 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7259                                                          SV const * const value)
7260 {
7261     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7262
7263     PERL_UNUSED_ARG(rx);
7264     PERL_UNUSED_ARG(paren);
7265     PERL_UNUSED_ARG(value);
7266
7267     if (!PL_localizing)
7268         Perl_croak_no_modify();
7269 }
7270
7271 I32
7272 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7273                               const I32 paren)
7274 {
7275     struct regexp *const rx = ReANY(r);
7276     I32 i;
7277     I32 s1, t1;
7278
7279     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7280
7281     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7282         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7283         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7284     )
7285     {
7286         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7287         if (!keepcopy) {
7288             /* on something like
7289              *    $r = qr/.../;
7290              *    /$qr/p;
7291              * the KEEPCOPY is set on the PMOP rather than the regex */
7292             if (PL_curpm && r == PM_GETRE(PL_curpm))
7293                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7294         }
7295         if (!keepcopy)
7296             goto warn_undef;
7297     }
7298
7299     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7300     switch (paren) {
7301       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7302       case RX_BUFF_IDX_PREMATCH:       /* $` */
7303         if (rx->offs[0].start != -1) {
7304                         i = rx->offs[0].start;
7305                         if (i > 0) {
7306                                 s1 = 0;
7307                                 t1 = i;
7308                                 goto getlen;
7309                         }
7310             }
7311         return 0;
7312
7313       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7314       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7315             if (rx->offs[0].end != -1) {
7316                         i = rx->sublen - rx->offs[0].end;
7317                         if (i > 0) {
7318                                 s1 = rx->offs[0].end;
7319                                 t1 = rx->sublen;
7320                                 goto getlen;
7321                         }
7322             }
7323         return 0;
7324
7325       default: /* $& / ${^MATCH}, $1, $2, ... */
7326             if (paren <= (I32)rx->nparens &&
7327             (s1 = rx->offs[paren].start) != -1 &&
7328             (t1 = rx->offs[paren].end) != -1)
7329             {
7330             i = t1 - s1;
7331             goto getlen;
7332         } else {
7333           warn_undef:
7334             if (ckWARN(WARN_UNINITIALIZED))
7335                 report_uninit((const SV *)sv);
7336             return 0;
7337         }
7338     }
7339   getlen:
7340     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7341         const char * const s = rx->subbeg - rx->suboffset + s1;
7342         const U8 *ep;
7343         STRLEN el;
7344
7345         i = t1 - s1;
7346         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7347                         i = el;
7348     }
7349     return i;
7350 }
7351
7352 SV*
7353 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7354 {
7355     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7356         PERL_UNUSED_ARG(rx);
7357         if (0)
7358             return NULL;
7359         else
7360             return newSVpvs("Regexp");
7361 }
7362
7363 /* Scans the name of a named buffer from the pattern.
7364  * If flags is REG_RSN_RETURN_NULL returns null.
7365  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7366  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7367  * to the parsed name as looked up in the RExC_paren_names hash.
7368  * If there is an error throws a vFAIL().. type exception.
7369  */
7370
7371 #define REG_RSN_RETURN_NULL    0
7372 #define REG_RSN_RETURN_NAME    1
7373 #define REG_RSN_RETURN_DATA    2
7374
7375 STATIC SV*
7376 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7377 {
7378     char *name_start = RExC_parse;
7379
7380     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7381
7382     assert (RExC_parse <= RExC_end);
7383     if (RExC_parse == RExC_end) NOOP;
7384     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7385          /* skip IDFIRST by using do...while */
7386         if (UTF)
7387             do {
7388                 RExC_parse += UTF8SKIP(RExC_parse);
7389             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7390         else
7391             do {
7392                 RExC_parse++;
7393             } while (isWORDCHAR(*RExC_parse));
7394     } else {
7395         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7396         vFAIL("Group name must start with a non-digit word character");
7397     }
7398     if ( flags ) {
7399         SV* sv_name
7400             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7401                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7402         if ( flags == REG_RSN_RETURN_NAME)
7403             return sv_name;
7404         else if (flags==REG_RSN_RETURN_DATA) {
7405             HE *he_str = NULL;
7406             SV *sv_dat = NULL;
7407             if ( ! sv_name )      /* should not happen*/
7408                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7409             if (RExC_paren_names)
7410                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7411             if ( he_str )
7412                 sv_dat = HeVAL(he_str);
7413             if ( ! sv_dat )
7414                 vFAIL("Reference to nonexistent named group");
7415             return sv_dat;
7416         }
7417         else {
7418             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7419                        (unsigned long) flags);
7420         }
7421         assert(0); /* NOT REACHED */
7422     }
7423     return NULL;
7424 }
7425
7426 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7427     int rem=(int)(RExC_end - RExC_parse);                       \
7428     int cut;                                                    \
7429     int num;                                                    \
7430     int iscut=0;                                                \
7431     if (rem>10) {                                               \
7432         rem=10;                                                 \
7433         iscut=1;                                                \
7434     }                                                           \
7435     cut=10-rem;                                                 \
7436     if (RExC_lastparse!=RExC_parse)                             \
7437         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7438             rem, RExC_parse,                                    \
7439             cut + 4,                                            \
7440             iscut ? "..." : "<"                                 \
7441         );                                                      \
7442     else                                                        \
7443         PerlIO_printf(Perl_debug_log,"%16s","");                \
7444                                                                 \
7445     if (SIZE_ONLY)                                              \
7446        num = RExC_size + 1;                                     \
7447     else                                                        \
7448        num=REG_NODE_NUM(RExC_emit);                             \
7449     if (RExC_lastnum!=num)                                      \
7450        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7451     else                                                        \
7452        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7453     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7454         (int)((depth*2)), "",                                   \
7455         (funcname)                                              \
7456     );                                                          \
7457     RExC_lastnum=num;                                           \
7458     RExC_lastparse=RExC_parse;                                  \
7459 })
7460
7461
7462
7463 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7464     DEBUG_PARSE_MSG((funcname));                            \
7465     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7466 })
7467 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7468     DEBUG_PARSE_MSG((funcname));                            \
7469     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7470 })
7471
7472 /* This section of code defines the inversion list object and its methods.  The
7473  * interfaces are highly subject to change, so as much as possible is static to
7474  * this file.  An inversion list is here implemented as a malloc'd C UV array
7475  * as an SVt_INVLIST scalar.
7476  *
7477  * An inversion list for Unicode is an array of code points, sorted by ordinal
7478  * number.  The zeroth element is the first code point in the list.  The 1th
7479  * element is the first element beyond that not in the list.  In other words,
7480  * the first range is
7481  *  invlist[0]..(invlist[1]-1)
7482  * The other ranges follow.  Thus every element whose index is divisible by two
7483  * marks the beginning of a range that is in the list, and every element not
7484  * divisible by two marks the beginning of a range not in the list.  A single
7485  * element inversion list that contains the single code point N generally
7486  * consists of two elements
7487  *  invlist[0] == N
7488  *  invlist[1] == N+1
7489  * (The exception is when N is the highest representable value on the
7490  * machine, in which case the list containing just it would be a single
7491  * element, itself.  By extension, if the last range in the list extends to
7492  * infinity, then the first element of that range will be in the inversion list
7493  * at a position that is divisible by two, and is the final element in the
7494  * list.)
7495  * Taking the complement (inverting) an inversion list is quite simple, if the
7496  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7497  * This implementation reserves an element at the beginning of each inversion
7498  * list to always contain 0; there is an additional flag in the header which
7499  * indicates if the list begins at the 0, or is offset to begin at the next
7500  * element.
7501  *
7502  * More about inversion lists can be found in "Unicode Demystified"
7503  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7504  * More will be coming when functionality is added later.
7505  *
7506  * The inversion list data structure is currently implemented as an SV pointing
7507  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7508  * array of UV whose memory management is automatically handled by the existing
7509  * facilities for SV's.
7510  *
7511  * Some of the methods should always be private to the implementation, and some
7512  * should eventually be made public */
7513
7514 /* The header definitions are in F<inline_invlist.c> */
7515
7516 PERL_STATIC_INLINE UV*
7517 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7518 {
7519     /* Returns a pointer to the first element in the inversion list's array.
7520      * This is called upon initialization of an inversion list.  Where the
7521      * array begins depends on whether the list has the code point U+0000 in it
7522      * or not.  The other parameter tells it whether the code that follows this
7523      * call is about to put a 0 in the inversion list or not.  The first
7524      * element is either the element reserved for 0, if TRUE, or the element
7525      * after it, if FALSE */
7526
7527     bool* offset = get_invlist_offset_addr(invlist);
7528     UV* zero_addr = (UV *) SvPVX(invlist);
7529
7530     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7531
7532     /* Must be empty */
7533     assert(! _invlist_len(invlist));
7534
7535     *zero_addr = 0;
7536
7537     /* 1^1 = 0; 1^0 = 1 */
7538     *offset = 1 ^ will_have_0;
7539     return zero_addr + *offset;
7540 }
7541
7542 PERL_STATIC_INLINE UV*
7543 S_invlist_array(pTHX_ SV* const invlist)
7544 {
7545     /* Returns the pointer to the inversion list's array.  Every time the
7546      * length changes, this needs to be called in case malloc or realloc moved
7547      * it */
7548
7549     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7550
7551     /* Must not be empty.  If these fail, you probably didn't check for <len>
7552      * being non-zero before trying to get the array */
7553     assert(_invlist_len(invlist));
7554
7555     /* The very first element always contains zero, The array begins either
7556      * there, or if the inversion list is offset, at the element after it.
7557      * The offset header field determines which; it contains 0 or 1 to indicate
7558      * how much additionally to add */
7559     assert(0 == *(SvPVX(invlist)));
7560     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7561 }
7562
7563 PERL_STATIC_INLINE void
7564 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7565 {
7566     /* Sets the current number of elements stored in the inversion list.
7567      * Updates SvCUR correspondingly */
7568
7569     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7570
7571     assert(SvTYPE(invlist) == SVt_INVLIST);
7572
7573     SvCUR_set(invlist,
7574               (len == 0)
7575                ? 0
7576                : TO_INTERNAL_SIZE(len + offset));
7577     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7578 }
7579
7580 PERL_STATIC_INLINE IV*
7581 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7582 {
7583     /* Return the address of the IV that is reserved to hold the cached index
7584      * */
7585
7586     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7587
7588     assert(SvTYPE(invlist) == SVt_INVLIST);
7589
7590     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7591 }
7592
7593 PERL_STATIC_INLINE IV
7594 S_invlist_previous_index(pTHX_ SV* const invlist)
7595 {
7596     /* Returns cached index of previous search */
7597
7598     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7599
7600     return *get_invlist_previous_index_addr(invlist);
7601 }
7602
7603 PERL_STATIC_INLINE void
7604 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7605 {
7606     /* Caches <index> for later retrieval */
7607
7608     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7609
7610     assert(index == 0 || index < (int) _invlist_len(invlist));
7611
7612     *get_invlist_previous_index_addr(invlist) = index;
7613 }
7614
7615 PERL_STATIC_INLINE UV
7616 S_invlist_max(pTHX_ SV* const invlist)
7617 {
7618     /* Returns the maximum number of elements storable in the inversion list's
7619      * array, without having to realloc() */
7620
7621     PERL_ARGS_ASSERT_INVLIST_MAX;
7622
7623     assert(SvTYPE(invlist) == SVt_INVLIST);
7624
7625     /* Assumes worst case, in which the 0 element is not counted in the
7626      * inversion list, so subtracts 1 for that */
7627     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7628            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7629            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7630 }
7631
7632 #ifndef PERL_IN_XSUB_RE
7633 SV*
7634 Perl__new_invlist(pTHX_ IV initial_size)
7635 {
7636
7637     /* Return a pointer to a newly constructed inversion list, with enough
7638      * space to store 'initial_size' elements.  If that number is negative, a
7639      * system default is used instead */
7640
7641     SV* new_list;
7642
7643     if (initial_size < 0) {
7644         initial_size = 10;
7645     }
7646
7647     /* Allocate the initial space */
7648     new_list = newSV_type(SVt_INVLIST);
7649
7650     /* First 1 is in case the zero element isn't in the list; second 1 is for
7651      * trailing NUL */
7652     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7653     invlist_set_len(new_list, 0, 0);
7654
7655     /* Force iterinit() to be used to get iteration to work */
7656     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7657
7658     *get_invlist_previous_index_addr(new_list) = 0;
7659
7660     return new_list;
7661 }
7662 #endif
7663
7664 STATIC SV*
7665 S__new_invlist_C_array(pTHX_ const UV* const list)
7666 {
7667     /* Return a pointer to a newly constructed inversion list, initialized to
7668      * point to <list>, which has to be in the exact correct inversion list
7669      * form, including internal fields.  Thus this is a dangerous routine that
7670      * should not be used in the wrong hands.  The passed in 'list' contains
7671      * several header fields at the beginning that are not part of the
7672      * inversion list body proper */
7673
7674     const STRLEN length = (STRLEN) list[0];
7675     const UV version_id =          list[1];
7676     const bool offset   =    cBOOL(list[2]);
7677 #define HEADER_LENGTH 3
7678     /* If any of the above changes in any way, you must change HEADER_LENGTH
7679      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7680      *      perl -E 'say int(rand 2**31-1)'
7681      */
7682 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7683                                         data structure type, so that one being
7684                                         passed in can be validated to be an
7685                                         inversion list of the correct vintage.
7686                                        */
7687
7688     SV* invlist = newSV_type(SVt_INVLIST);
7689
7690     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7691
7692     if (version_id != INVLIST_VERSION_ID) {
7693         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7694     }
7695
7696     /* The generated array passed in includes header elements that aren't part
7697      * of the list proper, so start it just after them */
7698     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7699
7700     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7701                                shouldn't touch it */
7702
7703     *(get_invlist_offset_addr(invlist)) = offset;
7704
7705     /* The 'length' passed to us is the physical number of elements in the
7706      * inversion list.  But if there is an offset the logical number is one
7707      * less than that */
7708     invlist_set_len(invlist, length  - offset, offset);
7709
7710     invlist_set_previous_index(invlist, 0);
7711
7712     /* Initialize the iteration pointer. */
7713     invlist_iterfinish(invlist);
7714
7715     return invlist;
7716 }
7717
7718 STATIC void
7719 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7720 {
7721     /* Grow the maximum size of an inversion list */
7722
7723     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7724
7725     assert(SvTYPE(invlist) == SVt_INVLIST);
7726
7727     /* Add one to account for the zero element at the beginning which may not
7728      * be counted by the calling parameters */
7729     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7730 }
7731
7732 PERL_STATIC_INLINE void
7733 S_invlist_trim(pTHX_ SV* const invlist)
7734 {
7735     PERL_ARGS_ASSERT_INVLIST_TRIM;
7736
7737     assert(SvTYPE(invlist) == SVt_INVLIST);
7738
7739     /* Change the length of the inversion list to how many entries it currently
7740      * has */
7741     SvPV_shrink_to_cur((SV *) invlist);
7742 }
7743
7744 STATIC void
7745 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7746 {
7747    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7748     * the end of the inversion list.  The range must be above any existing
7749     * ones. */
7750
7751     UV* array;
7752     UV max = invlist_max(invlist);
7753     UV len = _invlist_len(invlist);
7754     bool offset;
7755
7756     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7757
7758     if (len == 0) { /* Empty lists must be initialized */
7759         offset = start != 0;
7760         array = _invlist_array_init(invlist, ! offset);
7761     }
7762     else {
7763         /* Here, the existing list is non-empty. The current max entry in the
7764          * list is generally the first value not in the set, except when the
7765          * set extends to the end of permissible values, in which case it is
7766          * the first entry in that final set, and so this call is an attempt to
7767          * append out-of-order */
7768
7769         UV final_element = len - 1;
7770         array = invlist_array(invlist);
7771         if (array[final_element] > start
7772             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7773         {
7774             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7775                        array[final_element], start,
7776                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7777         }
7778
7779         /* Here, it is a legal append.  If the new range begins with the first
7780          * value not in the set, it is extending the set, so the new first
7781          * value not in the set is one greater than the newly extended range.
7782          * */
7783         offset = *get_invlist_offset_addr(invlist);
7784         if (array[final_element] == start) {
7785             if (end != UV_MAX) {
7786                 array[final_element] = end + 1;
7787             }
7788             else {
7789                 /* But if the end is the maximum representable on the machine,
7790                  * just let the range that this would extend to have no end */
7791                 invlist_set_len(invlist, len - 1, offset);
7792             }
7793             return;
7794         }
7795     }
7796
7797     /* Here the new range doesn't extend any existing set.  Add it */
7798
7799     len += 2;   /* Includes an element each for the start and end of range */
7800
7801     /* If wll overflow the existing space, extend, which may cause the array to
7802      * be moved */
7803     if (max < len) {
7804         invlist_extend(invlist, len);
7805
7806         /* Have to set len here to avoid assert failure in invlist_array() */
7807         invlist_set_len(invlist, len, offset);
7808
7809         array = invlist_array(invlist);
7810     }
7811     else {
7812         invlist_set_len(invlist, len, offset);
7813     }
7814
7815     /* The next item on the list starts the range, the one after that is
7816      * one past the new range.  */
7817     array[len - 2] = start;
7818     if (end != UV_MAX) {
7819         array[len - 1] = end + 1;
7820     }
7821     else {
7822         /* But if the end is the maximum representable on the machine, just let
7823          * the range have no end */
7824         invlist_set_len(invlist, len - 1, offset);
7825     }
7826 }
7827
7828 #ifndef PERL_IN_XSUB_RE
7829
7830 IV
7831 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7832 {
7833     /* Searches the inversion list for the entry that contains the input code
7834      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7835      * return value is the index into the list's array of the range that
7836      * contains <cp> */
7837
7838     IV low = 0;
7839     IV mid;
7840     IV high = _invlist_len(invlist);
7841     const IV highest_element = high - 1;
7842     const UV* array;
7843
7844     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7845
7846     /* If list is empty, return failure. */
7847     if (high == 0) {
7848         return -1;
7849     }
7850
7851     /* (We can't get the array unless we know the list is non-empty) */
7852     array = invlist_array(invlist);
7853
7854     mid = invlist_previous_index(invlist);
7855     assert(mid >=0 && mid <= highest_element);
7856
7857     /* <mid> contains the cache of the result of the previous call to this
7858      * function (0 the first time).  See if this call is for the same result,
7859      * or if it is for mid-1.  This is under the theory that calls to this
7860      * function will often be for related code points that are near each other.
7861      * And benchmarks show that caching gives better results.  We also test
7862      * here if the code point is within the bounds of the list.  These tests
7863      * replace others that would have had to be made anyway to make sure that
7864      * the array bounds were not exceeded, and these give us extra information
7865      * at the same time */
7866     if (cp >= array[mid]) {
7867         if (cp >= array[highest_element]) {
7868             return highest_element;
7869         }
7870
7871         /* Here, array[mid] <= cp < array[highest_element].  This means that
7872          * the final element is not the answer, so can exclude it; it also
7873          * means that <mid> is not the final element, so can refer to 'mid + 1'
7874          * safely */
7875         if (cp < array[mid + 1]) {
7876             return mid;
7877         }
7878         high--;
7879         low = mid + 1;
7880     }
7881     else { /* cp < aray[mid] */
7882         if (cp < array[0]) { /* Fail if outside the array */
7883             return -1;
7884         }
7885         high = mid;
7886         if (cp >= array[mid - 1]) {
7887             goto found_entry;
7888         }
7889     }
7890
7891     /* Binary search.  What we are looking for is <i> such that
7892      *  array[i] <= cp < array[i+1]
7893      * The loop below converges on the i+1.  Note that there may not be an
7894      * (i+1)th element in the array, and things work nonetheless */
7895     while (low < high) {
7896         mid = (low + high) / 2;
7897         assert(mid <= highest_element);
7898         if (array[mid] <= cp) { /* cp >= array[mid] */
7899             low = mid + 1;
7900
7901             /* We could do this extra test to exit the loop early.
7902             if (cp < array[low]) {
7903                 return mid;
7904             }
7905             */
7906         }
7907         else { /* cp < array[mid] */
7908             high = mid;
7909         }
7910     }
7911
7912   found_entry:
7913     high--;
7914     invlist_set_previous_index(invlist, high);
7915     return high;
7916 }
7917
7918 void
7919 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7920 {
7921     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7922      * but is used when the swash has an inversion list.  This makes this much
7923      * faster, as it uses a binary search instead of a linear one.  This is
7924      * intimately tied to that function, and perhaps should be in utf8.c,
7925      * except it is intimately tied to inversion lists as well.  It assumes
7926      * that <swatch> is all 0's on input */
7927
7928     UV current = start;
7929     const IV len = _invlist_len(invlist);
7930     IV i;
7931     const UV * array;
7932
7933     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7934
7935     if (len == 0) { /* Empty inversion list */
7936         return;
7937     }
7938
7939     array = invlist_array(invlist);
7940
7941     /* Find which element it is */
7942     i = _invlist_search(invlist, start);
7943
7944     /* We populate from <start> to <end> */
7945     while (current < end) {
7946         UV upper;
7947
7948         /* The inversion list gives the results for every possible code point
7949          * after the first one in the list.  Only those ranges whose index is
7950          * even are ones that the inversion list matches.  For the odd ones,
7951          * and if the initial code point is not in the list, we have to skip
7952          * forward to the next element */
7953         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7954             i++;
7955             if (i >= len) { /* Finished if beyond the end of the array */
7956                 return;
7957             }
7958             current = array[i];
7959             if (current >= end) {   /* Finished if beyond the end of what we
7960                                        are populating */
7961                 if (LIKELY(end < UV_MAX)) {
7962                     return;
7963                 }
7964
7965                 /* We get here when the upper bound is the maximum
7966                  * representable on the machine, and we are looking for just
7967                  * that code point.  Have to special case it */
7968                 i = len;
7969                 goto join_end_of_list;
7970             }
7971         }
7972         assert(current >= start);
7973
7974         /* The current range ends one below the next one, except don't go past
7975          * <end> */
7976         i++;
7977         upper = (i < len && array[i] < end) ? array[i] : end;
7978
7979         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7980          * for each code point in it */
7981         for (; current < upper; current++) {
7982             const STRLEN offset = (STRLEN)(current - start);
7983             swatch[offset >> 3] |= 1 << (offset & 7);
7984         }
7985
7986     join_end_of_list:
7987
7988         /* Quit if at the end of the list */
7989         if (i >= len) {
7990
7991             /* But first, have to deal with the highest possible code point on
7992              * the platform.  The previous code assumes that <end> is one
7993              * beyond where we want to populate, but that is impossible at the
7994              * platform's infinity, so have to handle it specially */
7995             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7996             {
7997                 const STRLEN offset = (STRLEN)(end - start);
7998                 swatch[offset >> 3] |= 1 << (offset & 7);
7999             }
8000             return;
8001         }
8002
8003         /* Advance to the next range, which will be for code points not in the
8004          * inversion list */
8005         current = array[i];
8006     }
8007
8008     return;
8009 }
8010
8011 void
8012 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
8013 {
8014     /* Take the union of two inversion lists and point <output> to it.  *output
8015      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8016      * the reference count to that list will be decremented if not already a
8017      * temporary (mortal); otherwise *output will be made correspondingly
8018      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8019      * second list is returned.  If <complement_b> is TRUE, the union is taken
8020      * of the complement (inversion) of <b> instead of b itself.
8021      *
8022      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8023      * Richard Gillam, published by Addison-Wesley, and explained at some
8024      * length there.  The preface says to incorporate its examples into your
8025      * code at your own risk.
8026      *
8027      * The algorithm is like a merge sort.
8028      *
8029      * XXX A potential performance improvement is to keep track as we go along
8030      * if only one of the inputs contributes to the result, meaning the other
8031      * is a subset of that one.  In that case, we can skip the final copy and
8032      * return the larger of the input lists, but then outside code might need
8033      * to keep track of whether to free the input list or not */
8034
8035     const UV* array_a;    /* a's array */
8036     const UV* array_b;
8037     UV len_a;       /* length of a's array */
8038     UV len_b;
8039
8040     SV* u;                      /* the resulting union */
8041     UV* array_u;
8042     UV len_u;
8043
8044     UV i_a = 0;             /* current index into a's array */
8045     UV i_b = 0;
8046     UV i_u = 0;
8047
8048     /* running count, as explained in the algorithm source book; items are
8049      * stopped accumulating and are output when the count changes to/from 0.
8050      * The count is incremented when we start a range that's in the set, and
8051      * decremented when we start a range that's not in the set.  So its range
8052      * is 0 to 2.  Only when the count is zero is something not in the set.
8053      */
8054     UV count = 0;
8055
8056     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8057     assert(a != b);
8058
8059     /* If either one is empty, the union is the other one */
8060     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8061         bool make_temp = FALSE; /* Should we mortalize the result? */
8062
8063         if (*output == a) {
8064             if (a != NULL) {
8065                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8066                     SvREFCNT_dec_NN(a);
8067                 }
8068             }
8069         }
8070         if (*output != b) {
8071             *output = invlist_clone(b);
8072             if (complement_b) {
8073                 _invlist_invert(*output);
8074             }
8075         } /* else *output already = b; */
8076
8077         if (make_temp) {
8078             sv_2mortal(*output);
8079         }
8080         return;
8081     }
8082     else if ((len_b = _invlist_len(b)) == 0) {
8083         bool make_temp = FALSE;
8084         if (*output == b) {
8085             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8086                 SvREFCNT_dec_NN(b);
8087             }
8088         }
8089
8090         /* The complement of an empty list is a list that has everything in it,
8091          * so the union with <a> includes everything too */
8092         if (complement_b) {
8093             if (a == *output) {
8094                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8095                     SvREFCNT_dec_NN(a);
8096                 }
8097             }
8098             *output = _new_invlist(1);
8099             _append_range_to_invlist(*output, 0, UV_MAX);
8100         }
8101         else if (*output != a) {
8102             *output = invlist_clone(a);
8103         }
8104         /* else *output already = a; */
8105
8106         if (make_temp) {
8107             sv_2mortal(*output);
8108         }
8109         return;
8110     }
8111
8112     /* Here both lists exist and are non-empty */
8113     array_a = invlist_array(a);
8114     array_b = invlist_array(b);
8115
8116     /* If are to take the union of 'a' with the complement of b, set it
8117      * up so are looking at b's complement. */
8118     if (complement_b) {
8119
8120         /* To complement, we invert: if the first element is 0, remove it.  To
8121          * do this, we just pretend the array starts one later */
8122         if (array_b[0] == 0) {
8123             array_b++;
8124             len_b--;
8125         }
8126         else {
8127
8128             /* But if the first element is not zero, we pretend the list starts
8129              * at the 0 that is always stored immediately before the array. */
8130             array_b--;
8131             len_b++;
8132         }
8133     }
8134
8135     /* Size the union for the worst case: that the sets are completely
8136      * disjoint */
8137     u = _new_invlist(len_a + len_b);
8138
8139     /* Will contain U+0000 if either component does */
8140     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8141                                       || (len_b > 0 && array_b[0] == 0));
8142
8143     /* Go through each list item by item, stopping when exhausted one of
8144      * them */
8145     while (i_a < len_a && i_b < len_b) {
8146         UV cp;      /* The element to potentially add to the union's array */
8147         bool cp_in_set;   /* is it in the the input list's set or not */
8148
8149         /* We need to take one or the other of the two inputs for the union.
8150          * Since we are merging two sorted lists, we take the smaller of the
8151          * next items.  In case of a tie, we take the one that is in its set
8152          * first.  If we took one not in the set first, it would decrement the
8153          * count, possibly to 0 which would cause it to be output as ending the
8154          * range, and the next time through we would take the same number, and
8155          * output it again as beginning the next range.  By doing it the
8156          * opposite way, there is no possibility that the count will be
8157          * momentarily decremented to 0, and thus the two adjoining ranges will
8158          * be seamlessly merged.  (In a tie and both are in the set or both not
8159          * in the set, it doesn't matter which we take first.) */
8160         if (array_a[i_a] < array_b[i_b]
8161             || (array_a[i_a] == array_b[i_b]
8162                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8163         {
8164             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8165             cp= array_a[i_a++];
8166         }
8167         else {
8168             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8169             cp = array_b[i_b++];
8170         }
8171
8172         /* Here, have chosen which of the two inputs to look at.  Only output
8173          * if the running count changes to/from 0, which marks the
8174          * beginning/end of a range in that's in the set */
8175         if (cp_in_set) {
8176             if (count == 0) {
8177                 array_u[i_u++] = cp;
8178             }
8179             count++;
8180         }
8181         else {
8182             count--;
8183             if (count == 0) {
8184                 array_u[i_u++] = cp;
8185             }
8186         }
8187     }
8188
8189     /* Here, we are finished going through at least one of the lists, which
8190      * means there is something remaining in at most one.  We check if the list
8191      * that hasn't been exhausted is positioned such that we are in the middle
8192      * of a range in its set or not.  (i_a and i_b point to the element beyond
8193      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8194      * is potentially more to output.
8195      * There are four cases:
8196      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8197      *     in the union is entirely from the non-exhausted set.
8198      *  2) Both were in their sets, count is 2.  Nothing further should
8199      *     be output, as everything that remains will be in the exhausted
8200      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8201      *     that
8202      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8203      *     Nothing further should be output because the union includes
8204      *     everything from the exhausted set.  Not decrementing ensures that.
8205      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8206      *     decrementing to 0 insures that we look at the remainder of the
8207      *     non-exhausted set */
8208     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8209         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8210     {
8211         count--;
8212     }
8213
8214     /* The final length is what we've output so far, plus what else is about to
8215      * be output.  (If 'count' is non-zero, then the input list we exhausted
8216      * has everything remaining up to the machine's limit in its set, and hence
8217      * in the union, so there will be no further output. */
8218     len_u = i_u;
8219     if (count == 0) {
8220         /* At most one of the subexpressions will be non-zero */
8221         len_u += (len_a - i_a) + (len_b - i_b);
8222     }
8223
8224     /* Set result to final length, which can change the pointer to array_u, so
8225      * re-find it */
8226     if (len_u != _invlist_len(u)) {
8227         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8228         invlist_trim(u);
8229         array_u = invlist_array(u);
8230     }
8231
8232     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8233      * the other) ended with everything above it not in its set.  That means
8234      * that the remaining part of the union is precisely the same as the
8235      * non-exhausted list, so can just copy it unchanged.  (If both list were
8236      * exhausted at the same time, then the operations below will be both 0.)
8237      */
8238     if (count == 0) {
8239         IV copy_count; /* At most one will have a non-zero copy count */
8240         if ((copy_count = len_a - i_a) > 0) {
8241             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8242         }
8243         else if ((copy_count = len_b - i_b) > 0) {
8244             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8245         }
8246     }
8247
8248     /*  We may be removing a reference to one of the inputs.  If so, the output
8249      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8250      *  count decremented) */
8251     if (a == *output || b == *output) {
8252         assert(! invlist_is_iterating(*output));
8253         if ((SvTEMP(*output))) {
8254             sv_2mortal(u);
8255         }
8256         else {
8257             SvREFCNT_dec_NN(*output);
8258         }
8259     }
8260
8261     *output = u;
8262
8263     return;
8264 }
8265
8266 void
8267 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
8268 {
8269     /* Take the intersection of two inversion lists and point <i> to it.  *i
8270      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8271      * the reference count to that list will be decremented if not already a
8272      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8273      * The first list, <a>, may be NULL, in which case an empty list is
8274      * returned.  If <complement_b> is TRUE, the result will be the
8275      * intersection of <a> and the complement (or inversion) of <b> instead of
8276      * <b> directly.
8277      *
8278      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8279      * Richard Gillam, published by Addison-Wesley, and explained at some
8280      * length there.  The preface says to incorporate its examples into your
8281      * code at your own risk.  In fact, it had bugs
8282      *
8283      * The algorithm is like a merge sort, and is essentially the same as the
8284      * union above
8285      */
8286
8287     const UV* array_a;          /* a's array */
8288     const UV* array_b;
8289     UV len_a;   /* length of a's array */
8290     UV len_b;
8291
8292     SV* r;                   /* the resulting intersection */
8293     UV* array_r;
8294     UV len_r;
8295
8296     UV i_a = 0;             /* current index into a's array */
8297     UV i_b = 0;
8298     UV i_r = 0;
8299
8300     /* running count, as explained in the algorithm source book; items are
8301      * stopped accumulating and are output when the count changes to/from 2.
8302      * The count is incremented when we start a range that's in the set, and
8303      * decremented when we start a range that's not in the set.  So its range
8304      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8305      */
8306     UV count = 0;
8307
8308     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8309     assert(a != b);
8310
8311     /* Special case if either one is empty */
8312     len_a = (a == NULL) ? 0 : _invlist_len(a);
8313     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8314         bool make_temp = FALSE;
8315
8316         if (len_a != 0 && complement_b) {
8317
8318             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8319              * be empty.  Here, also we are using 'b's complement, which hence
8320              * must be every possible code point.  Thus the intersection is
8321              * simply 'a'. */
8322             if (*i != a) {
8323                 if (*i == b) {
8324                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8325                         SvREFCNT_dec_NN(b);
8326                     }
8327                 }
8328
8329                 *i = invlist_clone(a);
8330             }
8331             /* else *i is already 'a' */
8332
8333             if (make_temp) {
8334                 sv_2mortal(*i);
8335             }
8336             return;
8337         }
8338
8339         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8340          * intersection must be empty */
8341         if (*i == a) {
8342             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8343                 SvREFCNT_dec_NN(a);
8344             }
8345         }
8346         else if (*i == b) {
8347             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8348                 SvREFCNT_dec_NN(b);
8349             }
8350         }
8351         *i = _new_invlist(0);
8352         if (make_temp) {
8353             sv_2mortal(*i);
8354         }
8355
8356         return;
8357     }
8358
8359     /* Here both lists exist and are non-empty */
8360     array_a = invlist_array(a);
8361     array_b = invlist_array(b);
8362
8363     /* If are to take the intersection of 'a' with the complement of b, set it
8364      * up so are looking at b's complement. */
8365     if (complement_b) {
8366
8367         /* To complement, we invert: if the first element is 0, remove it.  To
8368          * do this, we just pretend the array starts one later */
8369         if (array_b[0] == 0) {
8370             array_b++;
8371             len_b--;
8372         }
8373         else {
8374
8375             /* But if the first element is not zero, we pretend the list starts
8376              * at the 0 that is always stored immediately before the array. */
8377             array_b--;
8378             len_b++;
8379         }
8380     }
8381
8382     /* Size the intersection for the worst case: that the intersection ends up
8383      * fragmenting everything to be completely disjoint */
8384     r= _new_invlist(len_a + len_b);
8385
8386     /* Will contain U+0000 iff both components do */
8387     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8388                                      && len_b > 0 && array_b[0] == 0);
8389
8390     /* Go through each list item by item, stopping when exhausted one of
8391      * them */
8392     while (i_a < len_a && i_b < len_b) {
8393         UV cp;      /* The element to potentially add to the intersection's
8394                        array */
8395         bool cp_in_set; /* Is it in the input list's set or not */
8396
8397         /* We need to take one or the other of the two inputs for the
8398          * intersection.  Since we are merging two sorted lists, we take the
8399          * smaller of the next items.  In case of a tie, we take the one that
8400          * is not in its set first (a difference from the union algorithm).  If
8401          * we took one in the set first, it would increment the count, possibly
8402          * to 2 which would cause it to be output as starting a range in the
8403          * intersection, and the next time through we would take that same
8404          * number, and output it again as ending the set.  By doing it the
8405          * opposite of this, there is no possibility that the count will be
8406          * momentarily incremented to 2.  (In a tie and both are in the set or
8407          * both not in the set, it doesn't matter which we take first.) */
8408         if (array_a[i_a] < array_b[i_b]
8409             || (array_a[i_a] == array_b[i_b]
8410                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8411         {
8412             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8413             cp= array_a[i_a++];
8414         }
8415         else {
8416             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8417             cp= array_b[i_b++];
8418         }
8419
8420         /* Here, have chosen which of the two inputs to look at.  Only output
8421          * if the running count changes to/from 2, which marks the
8422          * beginning/end of a range that's in the intersection */
8423         if (cp_in_set) {
8424             count++;
8425             if (count == 2) {
8426                 array_r[i_r++] = cp;
8427             }
8428         }
8429         else {
8430             if (count == 2) {
8431                 array_r[i_r++] = cp;
8432             }
8433             count--;
8434         }
8435     }
8436
8437     /* Here, we are finished going through at least one of the lists, which
8438      * means there is something remaining in at most one.  We check if the list
8439      * that has been exhausted is positioned such that we are in the middle
8440      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8441      * the ones we care about.)  There are four cases:
8442      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8443      *     nothing left in the intersection.
8444      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8445      *     above 2.  What should be output is exactly that which is in the
8446      *     non-exhausted set, as everything it has is also in the intersection
8447      *     set, and everything it doesn't have can't be in the intersection
8448      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8449      *     gets incremented to 2.  Like the previous case, the intersection is
8450      *     everything that remains in the non-exhausted set.
8451      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8452      *     remains 1.  And the intersection has nothing more. */
8453     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8454         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8455     {
8456         count++;
8457     }
8458
8459     /* The final length is what we've output so far plus what else is in the
8460      * intersection.  At most one of the subexpressions below will be non-zero */
8461     len_r = i_r;
8462     if (count >= 2) {
8463         len_r += (len_a - i_a) + (len_b - i_b);
8464     }
8465
8466     /* Set result to final length, which can change the pointer to array_r, so
8467      * re-find it */
8468     if (len_r != _invlist_len(r)) {
8469         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8470         invlist_trim(r);
8471         array_r = invlist_array(r);
8472     }
8473
8474     /* Finish outputting any remaining */
8475     if (count >= 2) { /* At most one will have a non-zero copy count */
8476         IV copy_count;
8477         if ((copy_count = len_a - i_a) > 0) {
8478             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8479         }
8480         else if ((copy_count = len_b - i_b) > 0) {
8481             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8482         }
8483     }
8484
8485     /*  We may be removing a reference to one of the inputs.  If so, the output
8486      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8487      *  count decremented) */
8488     if (a == *i || b == *i) {
8489         assert(! invlist_is_iterating(*i));
8490         if (SvTEMP(*i)) {
8491             sv_2mortal(r);
8492         }
8493         else {
8494             SvREFCNT_dec_NN(*i);
8495         }
8496     }
8497
8498     *i = r;
8499
8500     return;
8501 }
8502
8503 SV*
8504 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8505 {
8506     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8507      * set.  A pointer to the inversion list is returned.  This may actually be
8508      * a new list, in which case the passed in one has been destroyed.  The
8509      * passed in inversion list can be NULL, in which case a new one is created
8510      * with just the one range in it */
8511
8512     SV* range_invlist;
8513     UV len;
8514
8515     if (invlist == NULL) {
8516         invlist = _new_invlist(2);
8517         len = 0;
8518     }
8519     else {
8520         len = _invlist_len(invlist);
8521     }
8522
8523     /* If comes after the final entry actually in the list, can just append it
8524      * to the end, */
8525     if (len == 0
8526         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8527             && start >= invlist_array(invlist)[len - 1]))
8528     {
8529         _append_range_to_invlist(invlist, start, end);
8530         return invlist;
8531     }
8532
8533     /* Here, can't just append things, create and return a new inversion list
8534      * which is the union of this range and the existing inversion list */
8535     range_invlist = _new_invlist(2);
8536     _append_range_to_invlist(range_invlist, start, end);
8537
8538     _invlist_union(invlist, range_invlist, &invlist);
8539
8540     /* The temporary can be freed */
8541     SvREFCNT_dec_NN(range_invlist);
8542
8543     return invlist;
8544 }
8545
8546 #endif
8547
8548 PERL_STATIC_INLINE SV*
8549 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8550     return _add_range_to_invlist(invlist, cp, cp);
8551 }
8552
8553 #ifndef PERL_IN_XSUB_RE
8554 void
8555 Perl__invlist_invert(pTHX_ SV* const invlist)
8556 {
8557     /* Complement the input inversion list.  This adds a 0 if the list didn't
8558      * have a zero; removes it otherwise.  As described above, the data
8559      * structure is set up so that this is very efficient */
8560
8561     PERL_ARGS_ASSERT__INVLIST_INVERT;
8562
8563     assert(! invlist_is_iterating(invlist));
8564
8565     /* The inverse of matching nothing is matching everything */
8566     if (_invlist_len(invlist) == 0) {
8567         _append_range_to_invlist(invlist, 0, UV_MAX);
8568         return;
8569     }
8570
8571     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8572 }
8573
8574 void
8575 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8576 {
8577     /* Complement the input inversion list (which must be a Unicode property,
8578      * all of which don't match above the Unicode maximum code point.)  And
8579      * Perl has chosen to not have the inversion match above that either.  This
8580      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8581      */
8582
8583     UV len;
8584     UV* array;
8585
8586     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8587
8588     _invlist_invert(invlist);
8589
8590     len = _invlist_len(invlist);
8591
8592     if (len != 0) { /* If empty do nothing */
8593         array = invlist_array(invlist);
8594         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8595             /* Add 0x110000.  First, grow if necessary */
8596             len++;
8597             if (invlist_max(invlist) < len) {
8598                 invlist_extend(invlist, len);
8599                 array = invlist_array(invlist);
8600             }
8601             invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8602             array[len - 1] = PERL_UNICODE_MAX + 1;
8603         }
8604         else {  /* Remove the 0x110000 */
8605             invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8606         }
8607     }
8608
8609     return;
8610 }
8611 #endif
8612
8613 PERL_STATIC_INLINE SV*
8614 S_invlist_clone(pTHX_ SV* const invlist)
8615 {
8616
8617     /* Return a new inversion list that is a copy of the input one, which is
8618      * unchanged.  The new list will not be mortal even if the old one was. */
8619
8620     /* Need to allocate extra space to accommodate Perl's addition of a
8621      * trailing NUL to SvPV's, since it thinks they are always strings */
8622     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8623     STRLEN physical_length = SvCUR(invlist);
8624     bool offset = *(get_invlist_offset_addr(invlist));
8625
8626     PERL_ARGS_ASSERT_INVLIST_CLONE;
8627
8628     *(get_invlist_offset_addr(new_invlist)) = offset;
8629     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8630     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8631
8632     return new_invlist;
8633 }
8634
8635 PERL_STATIC_INLINE STRLEN*
8636 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8637 {
8638     /* Return the address of the UV that contains the current iteration
8639      * position */
8640
8641     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8642
8643     assert(SvTYPE(invlist) == SVt_INVLIST);
8644
8645     return &(((XINVLIST*) SvANY(invlist))->iterator);
8646 }
8647
8648 PERL_STATIC_INLINE void
8649 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8650 {
8651     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8652
8653     *get_invlist_iter_addr(invlist) = 0;
8654 }
8655
8656 PERL_STATIC_INLINE void
8657 S_invlist_iterfinish(pTHX_ SV* invlist)
8658 {
8659     /* Terminate iterator for invlist.  This is to catch development errors.
8660      * Any iteration that is interrupted before completed should call this
8661      * function.  Functions that add code points anywhere else but to the end
8662      * of an inversion list assert that they are not in the middle of an
8663      * iteration.  If they were, the addition would make the iteration
8664      * problematical: if the iteration hadn't reached the place where things
8665      * were being added, it would be ok */
8666
8667     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8668
8669     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8670 }
8671
8672 STATIC bool
8673 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8674 {
8675     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8676      * This call sets in <*start> and <*end>, the next range in <invlist>.
8677      * Returns <TRUE> if successful and the next call will return the next
8678      * range; <FALSE> if was already at the end of the list.  If the latter,
8679      * <*start> and <*end> are unchanged, and the next call to this function
8680      * will start over at the beginning of the list */
8681
8682     STRLEN* pos = get_invlist_iter_addr(invlist);
8683     UV len = _invlist_len(invlist);
8684     UV *array;
8685
8686     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8687
8688     if (*pos >= len) {
8689         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8690         return FALSE;
8691     }
8692
8693     array = invlist_array(invlist);
8694
8695     *start = array[(*pos)++];
8696
8697     if (*pos >= len) {
8698         *end = UV_MAX;
8699     }
8700     else {
8701         *end = array[(*pos)++] - 1;
8702     }
8703
8704     return TRUE;
8705 }
8706
8707 PERL_STATIC_INLINE bool
8708 S_invlist_is_iterating(pTHX_ SV* const invlist)
8709 {
8710     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8711
8712     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8713 }
8714
8715 PERL_STATIC_INLINE UV
8716 S_invlist_highest(pTHX_ SV* const invlist)
8717 {
8718     /* Returns the highest code point that matches an inversion list.  This API
8719      * has an ambiguity, as it returns 0 under either the highest is actually
8720      * 0, or if the list is empty.  If this distinction matters to you, check
8721      * for emptiness before calling this function */
8722
8723     UV len = _invlist_len(invlist);
8724     UV *array;
8725
8726     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8727
8728     if (len == 0) {
8729         return 0;
8730     }
8731
8732     array = invlist_array(invlist);
8733
8734     /* The last element in the array in the inversion list always starts a
8735      * range that goes to infinity.  That range may be for code points that are
8736      * matched in the inversion list, or it may be for ones that aren't
8737      * matched.  In the latter case, the highest code point in the set is one
8738      * less than the beginning of this range; otherwise it is the final element
8739      * of this range: infinity */
8740     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8741            ? UV_MAX
8742            : array[len - 1] - 1;
8743 }
8744
8745 #ifndef PERL_IN_XSUB_RE
8746 SV *
8747 Perl__invlist_contents(pTHX_ SV* const invlist)
8748 {
8749     /* Get the contents of an inversion list into a string SV so that they can
8750      * be printed out.  It uses the format traditionally done for debug tracing
8751      */
8752
8753     UV start, end;
8754     SV* output = newSVpvs("\n");
8755
8756     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8757
8758     assert(! invlist_is_iterating(invlist));
8759
8760     invlist_iterinit(invlist);
8761     while (invlist_iternext(invlist, &start, &end)) {
8762         if (end == UV_MAX) {
8763             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8764         }
8765         else if (end != start) {
8766             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8767                     start,       end);
8768         }
8769         else {
8770             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8771         }
8772     }
8773
8774     return output;
8775 }
8776 #endif
8777
8778 #ifndef PERL_IN_XSUB_RE
8779 void
8780 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8781 {
8782     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
8783      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
8784      * the string 'indent'.  The output looks like this:
8785          [0] 0x000A .. 0x000D
8786          [2] 0x0085
8787          [4] 0x2028 .. 0x2029
8788          [6] 0x3104 .. INFINITY
8789      * This means that the first range of code points matched by the list are
8790      * 0xA through 0xD; the second range contains only the single code point
8791      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
8792      * are used to define each range (except if the final range extends to
8793      * infinity, only a single element is needed).  The array index of the
8794      * first element for the corresponding range is given in brackets. */
8795
8796     UV start, end;
8797     STRLEN count = 0;
8798
8799     PERL_ARGS_ASSERT__INVLIST_DUMP;
8800
8801     if (invlist_is_iterating(invlist)) {
8802         Perl_dump_indent(aTHX_ level, file,
8803              "%sCan't dump inversion list because is in middle of iterating\n",
8804              indent);
8805         return;
8806     }
8807
8808     invlist_iterinit(invlist);
8809     while (invlist_iternext(invlist, &start, &end)) {
8810         if (end == UV_MAX) {
8811             Perl_dump_indent(aTHX_ level, file,
8812                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8813                                    indent, (UV)count, start);
8814         }
8815         else if (end != start) {
8816             Perl_dump_indent(aTHX_ level, file,
8817                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8818                                 indent, (UV)count, start,         end);
8819         }
8820         else {
8821             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8822                                             indent, (UV)count, start);
8823         }
8824         count += 2;
8825     }
8826 }
8827 #endif
8828
8829 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8830 bool
8831 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8832 {
8833     /* Return a boolean as to if the two passed in inversion lists are
8834      * identical.  The final argument, if TRUE, says to take the complement of
8835      * the second inversion list before doing the comparison */
8836
8837     const UV* array_a = invlist_array(a);
8838     const UV* array_b = invlist_array(b);
8839     UV len_a = _invlist_len(a);
8840     UV len_b = _invlist_len(b);
8841
8842     UV i = 0;               /* current index into the arrays */
8843     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8844
8845     PERL_ARGS_ASSERT__INVLISTEQ;
8846
8847     /* If are to compare 'a' with the complement of b, set it
8848      * up so are looking at b's complement. */
8849     if (complement_b) {
8850
8851         /* The complement of nothing is everything, so <a> would have to have
8852          * just one element, starting at zero (ending at infinity) */
8853         if (len_b == 0) {
8854             return (len_a == 1 && array_a[0] == 0);
8855         }
8856         else if (array_b[0] == 0) {
8857
8858             /* Otherwise, to complement, we invert.  Here, the first element is
8859              * 0, just remove it.  To do this, we just pretend the array starts
8860              * one later */
8861
8862             array_b++;
8863             len_b--;
8864         }
8865         else {
8866
8867             /* But if the first element is not zero, we pretend the list starts
8868              * at the 0 that is always stored immediately before the array. */
8869             array_b--;
8870             len_b++;
8871         }
8872     }
8873
8874     /* Make sure that the lengths are the same, as well as the final element
8875      * before looping through the remainder.  (Thus we test the length, final,
8876      * and first elements right off the bat) */
8877     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8878         retval = FALSE;
8879     }
8880     else for (i = 0; i < len_a - 1; i++) {
8881         if (array_a[i] != array_b[i]) {
8882             retval = FALSE;
8883             break;
8884         }
8885     }
8886
8887     return retval;
8888 }
8889 #endif
8890
8891 #undef HEADER_LENGTH
8892 #undef TO_INTERNAL_SIZE
8893 #undef FROM_INTERNAL_SIZE
8894 #undef INVLIST_VERSION_ID
8895
8896 /* End of inversion list object */
8897
8898 STATIC void
8899 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
8900 {
8901     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8902      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8903      * should point to the first flag; it is updated on output to point to the
8904      * final ')' or ':'.  There needs to be at least one flag, or this will
8905      * abort */
8906
8907     /* for (?g), (?gc), and (?o) warnings; warning
8908        about (?c) will warn about (?g) -- japhy    */
8909
8910 #define WASTED_O  0x01
8911 #define WASTED_G  0x02
8912 #define WASTED_C  0x04
8913 #define WASTED_GC (WASTED_G|WASTED_C)
8914     I32 wastedflags = 0x00;
8915     U32 posflags = 0, negflags = 0;
8916     U32 *flagsp = &posflags;
8917     char has_charset_modifier = '\0';
8918     regex_charset cs;
8919     bool has_use_defaults = FALSE;
8920     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8921
8922     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8923
8924     /* '^' as an initial flag sets certain defaults */
8925     if (UCHARAT(RExC_parse) == '^') {
8926         RExC_parse++;
8927         has_use_defaults = TRUE;
8928         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8929         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8930                                         ? REGEX_UNICODE_CHARSET
8931                                         : REGEX_DEPENDS_CHARSET);
8932     }
8933
8934     cs = get_regex_charset(RExC_flags);
8935     if (cs == REGEX_DEPENDS_CHARSET
8936         && (RExC_utf8 || RExC_uni_semantics))
8937     {
8938         cs = REGEX_UNICODE_CHARSET;
8939     }
8940
8941     while (*RExC_parse) {
8942         /* && strchr("iogcmsx", *RExC_parse) */
8943         /* (?g), (?gc) and (?o) are useless here
8944            and must be globally applied -- japhy */
8945         switch (*RExC_parse) {
8946
8947             /* Code for the imsx flags */
8948             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8949
8950             case LOCALE_PAT_MOD:
8951                 if (has_charset_modifier) {
8952                     goto excess_modifier;
8953                 }
8954                 else if (flagsp == &negflags) {
8955                     goto neg_modifier;
8956                 }
8957                 cs = REGEX_LOCALE_CHARSET;
8958                 has_charset_modifier = LOCALE_PAT_MOD;
8959                 RExC_contains_locale = 1;
8960                 break;
8961             case UNICODE_PAT_MOD:
8962                 if (has_charset_modifier) {
8963                     goto excess_modifier;
8964                 }
8965                 else if (flagsp == &negflags) {
8966                     goto neg_modifier;
8967                 }
8968                 cs = REGEX_UNICODE_CHARSET;
8969                 has_charset_modifier = UNICODE_PAT_MOD;
8970                 break;
8971             case ASCII_RESTRICT_PAT_MOD:
8972                 if (flagsp == &negflags) {
8973                     goto neg_modifier;
8974                 }
8975                 if (has_charset_modifier) {
8976                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8977                         goto excess_modifier;
8978                     }
8979                     /* Doubled modifier implies more restricted */
8980                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8981                 }
8982                 else {
8983                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8984                 }
8985                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8986                 break;
8987             case DEPENDS_PAT_MOD:
8988                 if (has_use_defaults) {
8989                     goto fail_modifiers;
8990                 }
8991                 else if (flagsp == &negflags) {
8992                     goto neg_modifier;
8993                 }
8994                 else if (has_charset_modifier) {
8995                     goto excess_modifier;
8996                 }
8997
8998                 /* The dual charset means unicode semantics if the
8999                  * pattern (or target, not known until runtime) are
9000                  * utf8, or something in the pattern indicates unicode
9001                  * semantics */
9002                 cs = (RExC_utf8 || RExC_uni_semantics)
9003                      ? REGEX_UNICODE_CHARSET
9004                      : REGEX_DEPENDS_CHARSET;
9005                 has_charset_modifier = DEPENDS_PAT_MOD;
9006                 break;
9007             excess_modifier:
9008                 RExC_parse++;
9009                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9010                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9011                 }
9012                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9013                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9014                 }
9015                 else {
9016                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9017                 }
9018                 /*NOTREACHED*/
9019             neg_modifier:
9020                 RExC_parse++;
9021                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9022                 /*NOTREACHED*/
9023             case ONCE_PAT_MOD: /* 'o' */
9024             case GLOBAL_PAT_MOD: /* 'g' */
9025                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9026                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9027                     if (! (wastedflags & wflagbit) ) {
9028                         wastedflags |= wflagbit;
9029                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9030                         vWARN5(
9031                             RExC_parse + 1,
9032                             "Useless (%s%c) - %suse /%c modifier",
9033                             flagsp == &negflags ? "?-" : "?",
9034                             *RExC_parse,
9035                             flagsp == &negflags ? "don't " : "",
9036                             *RExC_parse
9037                         );
9038                     }
9039                 }
9040                 break;
9041
9042             case CONTINUE_PAT_MOD: /* 'c' */
9043                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9044                     if (! (wastedflags & WASTED_C) ) {
9045                         wastedflags |= WASTED_GC;
9046                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9047                         vWARN3(
9048                             RExC_parse + 1,
9049                             "Useless (%sc) - %suse /gc modifier",
9050                             flagsp == &negflags ? "?-" : "?",
9051                             flagsp == &negflags ? "don't " : ""
9052                         );
9053                     }
9054                 }
9055                 break;
9056             case KEEPCOPY_PAT_MOD: /* 'p' */
9057                 if (flagsp == &negflags) {
9058                     if (SIZE_ONLY)
9059                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9060                 } else {
9061                     *flagsp |= RXf_PMf_KEEPCOPY;
9062                 }
9063                 break;
9064             case '-':
9065                 /* A flag is a default iff it is following a minus, so
9066                  * if there is a minus, it means will be trying to
9067                  * re-specify a default which is an error */
9068                 if (has_use_defaults || flagsp == &negflags) {
9069                     goto fail_modifiers;
9070                 }
9071                 flagsp = &negflags;
9072                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9073                 break;
9074             case ':':
9075             case ')':
9076                 RExC_flags |= posflags;
9077                 RExC_flags &= ~negflags;
9078                 set_regex_charset(&RExC_flags, cs);
9079                 if (RExC_flags & RXf_PMf_FOLD) {
9080                     RExC_contains_i = 1;
9081                 }
9082                 return;
9083                 /*NOTREACHED*/
9084             default:
9085             fail_modifiers:
9086                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9087                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9088                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9089                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9090                 /*NOTREACHED*/
9091         }
9092
9093         ++RExC_parse;
9094     }
9095 }
9096
9097 /*
9098  - reg - regular expression, i.e. main body or parenthesized thing
9099  *
9100  * Caller must absorb opening parenthesis.
9101  *
9102  * Combining parenthesis handling with the base level of regular expression
9103  * is a trifle forced, but the need to tie the tails of the branches to what
9104  * follows makes it hard to avoid.
9105  */
9106 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9107 #ifdef DEBUGGING
9108 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9109 #else
9110 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9111 #endif
9112
9113 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9114    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9115    needs to be restarted.
9116    Otherwise would only return NULL if regbranch() returns NULL, which
9117    cannot happen.  */
9118 STATIC regnode *
9119 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9120     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9121      * 2 is like 1, but indicates that nextchar() has been called to advance
9122      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9123      * this flag alerts us to the need to check for that */
9124 {
9125     dVAR;
9126     regnode *ret;               /* Will be the head of the group. */
9127     regnode *br;
9128     regnode *lastbr;
9129     regnode *ender = NULL;
9130     I32 parno = 0;
9131     I32 flags;
9132     U32 oregflags = RExC_flags;
9133     bool have_branch = 0;
9134     bool is_open = 0;
9135     I32 freeze_paren = 0;
9136     I32 after_freeze = 0;
9137
9138     char * parse_start = RExC_parse; /* MJD */
9139     char * const oregcomp_parse = RExC_parse;
9140
9141     GET_RE_DEBUG_FLAGS_DECL;
9142
9143     PERL_ARGS_ASSERT_REG;
9144     DEBUG_PARSE("reg ");
9145
9146     *flagp = 0;                         /* Tentatively. */
9147
9148
9149     /* Make an OPEN node, if parenthesized. */
9150     if (paren) {
9151
9152         /* Under /x, space and comments can be gobbled up between the '(' and
9153          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9154          * intervening space, as the sequence is a token, and a token should be
9155          * indivisible */
9156         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9157
9158         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9159             char *start_verb = RExC_parse;
9160             STRLEN verb_len = 0;
9161             char *start_arg = NULL;
9162             unsigned char op = 0;
9163             int argok = 1;
9164             int internal_argval = 0; /* internal_argval is only useful if !argok */
9165
9166             if (has_intervening_patws && SIZE_ONLY) {
9167                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9168             }
9169             while ( *RExC_parse && *RExC_parse != ')' ) {
9170                 if ( *RExC_parse == ':' ) {
9171                     start_arg = RExC_parse + 1;
9172                     break;
9173                 }
9174                 RExC_parse++;
9175             }
9176             ++start_verb;
9177             verb_len = RExC_parse - start_verb;
9178             if ( start_arg ) {
9179                 RExC_parse++;
9180                 while ( *RExC_parse && *RExC_parse != ')' )
9181                     RExC_parse++;
9182                 if ( *RExC_parse != ')' )
9183                     vFAIL("Unterminated verb pattern argument");
9184                 if ( RExC_parse == start_arg )
9185                     start_arg = NULL;
9186             } else {
9187                 if ( *RExC_parse != ')' )
9188                     vFAIL("Unterminated verb pattern");
9189             }
9190
9191             switch ( *start_verb ) {
9192             case 'A':  /* (*ACCEPT) */
9193                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9194                     op = ACCEPT;
9195                     internal_argval = RExC_nestroot;
9196                 }
9197                 break;
9198             case 'C':  /* (*COMMIT) */
9199                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9200                     op = COMMIT;
9201                 break;
9202             case 'F':  /* (*FAIL) */
9203                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9204                     op = OPFAIL;
9205                     argok = 0;
9206                 }
9207                 break;
9208             case ':':  /* (*:NAME) */
9209             case 'M':  /* (*MARK:NAME) */
9210                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9211                     op = MARKPOINT;
9212                     argok = -1;
9213                 }
9214                 break;
9215             case 'P':  /* (*PRUNE) */
9216                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9217                     op = PRUNE;
9218                 break;
9219             case 'S':   /* (*SKIP) */
9220                 if ( memEQs(start_verb,verb_len,"SKIP") )
9221                     op = SKIP;
9222                 break;
9223             case 'T':  /* (*THEN) */
9224                 /* [19:06] <TimToady> :: is then */
9225                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9226                     op = CUTGROUP;
9227                     RExC_seen |= REG_SEEN_CUTGROUP;
9228                 }
9229                 break;
9230             }
9231             if ( ! op ) {
9232                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9233                 vFAIL2utf8f(
9234                     "Unknown verb pattern '%"UTF8f"'",
9235                     UTF8fARG(UTF, verb_len, start_verb));
9236             }
9237             if ( argok ) {
9238                 if ( start_arg && internal_argval ) {
9239                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9240                         verb_len, start_verb);
9241                 } else if ( argok < 0 && !start_arg ) {
9242                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9243                         verb_len, start_verb);
9244                 } else {
9245                     ret = reganode(pRExC_state, op, internal_argval);
9246                     if ( ! internal_argval && ! SIZE_ONLY ) {
9247                         if (start_arg) {
9248                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
9249                             ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
9250                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9251                             ret->flags = 0;
9252                         } else {
9253                             ret->flags = 1;
9254                         }
9255                     }
9256                 }
9257                 if (!internal_argval)
9258                     RExC_seen |= REG_SEEN_VERBARG;
9259             } else if ( start_arg ) {
9260                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9261                         verb_len, start_verb);
9262             } else {
9263                 ret = reg_node(pRExC_state, op);
9264             }
9265             nextchar(pRExC_state);
9266             return ret;
9267         }
9268         else if (*RExC_parse == '?') { /* (?...) */
9269             bool is_logical = 0;
9270             const char * const seqstart = RExC_parse;
9271             if (has_intervening_patws && SIZE_ONLY) {
9272                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9273             }
9274
9275             RExC_parse++;
9276             paren = *RExC_parse++;
9277             ret = NULL;                 /* For look-ahead/behind. */
9278             switch (paren) {
9279
9280             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9281                 paren = *RExC_parse++;
9282                 if ( paren == '<')         /* (?P<...>) named capture */
9283                     goto named_capture;
9284                 else if (paren == '>') {   /* (?P>name) named recursion */
9285                     goto named_recursion;
9286                 }
9287                 else if (paren == '=') {   /* (?P=...)  named backref */
9288                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
9289                        you change this make sure you change that */
9290                     char* name_start = RExC_parse;
9291                     U32 num = 0;
9292                     SV *sv_dat = reg_scan_name(pRExC_state,
9293                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9294                     if (RExC_parse == name_start || *RExC_parse != ')')
9295                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9296                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9297
9298                     if (!SIZE_ONLY) {
9299                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9300                         RExC_rxi->data->data[num]=(void*)sv_dat;
9301                         SvREFCNT_inc_simple_void(sv_dat);
9302                     }
9303                     RExC_sawback = 1;
9304                     ret = reganode(pRExC_state,
9305                                    ((! FOLD)
9306                                      ? NREF
9307                                      : (ASCII_FOLD_RESTRICTED)
9308                                        ? NREFFA
9309                                        : (AT_LEAST_UNI_SEMANTICS)
9310                                          ? NREFFU
9311                                          : (LOC)
9312                                            ? NREFFL
9313                                            : NREFF),
9314                                     num);
9315                     *flagp |= HASWIDTH;
9316
9317                     Set_Node_Offset(ret, parse_start+1);
9318                     Set_Node_Cur_Length(ret, parse_start);
9319
9320                     nextchar(pRExC_state);
9321                     return ret;
9322                 }
9323                 RExC_parse++;
9324                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9325                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9326                 /*NOTREACHED*/
9327             case '<':           /* (?<...) */
9328                 if (*RExC_parse == '!')
9329                     paren = ',';
9330                 else if (*RExC_parse != '=')
9331               named_capture:
9332                 {               /* (?<...>) */
9333                     char *name_start;
9334                     SV *svname;
9335                     paren= '>';
9336             case '\'':          /* (?'...') */
9337                     name_start= RExC_parse;
9338                     svname = reg_scan_name(pRExC_state,
9339                         SIZE_ONLY    /* reverse test from the others */
9340                         ? REG_RSN_RETURN_NAME
9341                         : REG_RSN_RETURN_NULL);
9342                     if (RExC_parse == name_start || *RExC_parse != paren)
9343                         vFAIL2("Sequence (?%c... not terminated",
9344                             paren=='>' ? '<' : paren);
9345                     if (SIZE_ONLY) {
9346                         HE *he_str;
9347                         SV *sv_dat = NULL;
9348                         if (!svname) /* shouldn't happen */
9349                             Perl_croak(aTHX_
9350                                 "panic: reg_scan_name returned NULL");
9351                         if (!RExC_paren_names) {
9352                             RExC_paren_names= newHV();
9353                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9354 #ifdef DEBUGGING
9355                             RExC_paren_name_list= newAV();
9356                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9357 #endif
9358                         }
9359                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9360                         if ( he_str )
9361                             sv_dat = HeVAL(he_str);
9362                         if ( ! sv_dat ) {
9363                             /* croak baby croak */
9364                             Perl_croak(aTHX_
9365                                 "panic: paren_name hash element allocation failed");
9366                         } else if ( SvPOK(sv_dat) ) {
9367                             /* (?|...) can mean we have dupes so scan to check
9368                                its already been stored. Maybe a flag indicating
9369                                we are inside such a construct would be useful,
9370                                but the arrays are likely to be quite small, so
9371                                for now we punt -- dmq */
9372                             IV count = SvIV(sv_dat);
9373                             I32 *pv = (I32*)SvPVX(sv_dat);
9374                             IV i;
9375                             for ( i = 0 ; i < count ; i++ ) {
9376                                 if ( pv[i] == RExC_npar ) {
9377                                     count = 0;
9378                                     break;
9379                                 }
9380                             }
9381                             if ( count ) {
9382                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
9383                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9384                                 pv[count] = RExC_npar;
9385                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9386                             }
9387                         } else {
9388                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9389                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
9390                             SvIOK_on(sv_dat);
9391                             SvIV_set(sv_dat, 1);
9392                         }
9393 #ifdef DEBUGGING
9394                         /* Yes this does cause a memory leak in debugging Perls */
9395                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
9396                             SvREFCNT_dec_NN(svname);
9397 #endif
9398
9399                         /*sv_dump(sv_dat);*/
9400                     }
9401                     nextchar(pRExC_state);
9402                     paren = 1;
9403                     goto capturing_parens;
9404                 }
9405                 RExC_seen |= REG_SEEN_LOOKBEHIND;
9406                 RExC_in_lookbehind++;
9407                 RExC_parse++;
9408             case '=':           /* (?=...) */
9409                 RExC_seen_zerolen++;
9410                 break;
9411             case '!':           /* (?!...) */
9412                 RExC_seen_zerolen++;
9413                 if (*RExC_parse == ')') {
9414                     ret=reg_node(pRExC_state, OPFAIL);
9415                     nextchar(pRExC_state);
9416                     return ret;
9417                 }
9418                 break;
9419             case '|':           /* (?|...) */
9420                 /* branch reset, behave like a (?:...) except that
9421                    buffers in alternations share the same numbers */
9422                 paren = ':';
9423                 after_freeze = freeze_paren = RExC_npar;
9424                 break;
9425             case ':':           /* (?:...) */
9426             case '>':           /* (?>...) */
9427                 break;
9428             case '$':           /* (?$...) */
9429             case '@':           /* (?@...) */
9430                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9431                 break;
9432             case '#':           /* (?#...) */
9433                 /* XXX As soon as we disallow separating the '?' and '*' (by
9434                  * spaces or (?#...) comment), it is believed that this case
9435                  * will be unreachable and can be removed.  See
9436                  * [perl #117327] */
9437                 while (*RExC_parse && *RExC_parse != ')')
9438                     RExC_parse++;
9439                 if (*RExC_parse != ')')
9440                     FAIL("Sequence (?#... not terminated");
9441                 nextchar(pRExC_state);
9442                 *flagp = TRYAGAIN;
9443                 return NULL;
9444             case '0' :           /* (?0) */
9445             case 'R' :           /* (?R) */
9446                 if (*RExC_parse != ')')
9447                     FAIL("Sequence (?R) not terminated");
9448                 ret = reg_node(pRExC_state, GOSTART);
9449                     RExC_seen |= REG_SEEN_GOSTART;
9450                 *flagp |= POSTPONED;
9451                 nextchar(pRExC_state);
9452                 return ret;
9453                 /*notreached*/
9454             { /* named and numeric backreferences */
9455                 I32 num;
9456             case '&':            /* (?&NAME) */
9457                 parse_start = RExC_parse - 1;
9458               named_recursion:
9459                 {
9460                     SV *sv_dat = reg_scan_name(pRExC_state,
9461                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9462                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9463                 }
9464                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9465                     vFAIL("Sequence (?&... not terminated");
9466                 goto gen_recurse_regop;
9467                 assert(0); /* NOT REACHED */
9468             case '+':
9469                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9470                     RExC_parse++;
9471                     vFAIL("Illegal pattern");
9472                 }
9473                 goto parse_recursion;
9474                 /* NOT REACHED*/
9475             case '-': /* (?-1) */
9476                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9477                     RExC_parse--; /* rewind to let it be handled later */
9478                     goto parse_flags;
9479                 }
9480                 /*FALLTHROUGH */
9481             case '1': case '2': case '3': case '4': /* (?1) */
9482             case '5': case '6': case '7': case '8': case '9':
9483                 RExC_parse--;
9484               parse_recursion:
9485                 num = atoi(RExC_parse);
9486                 parse_start = RExC_parse - 1; /* MJD */
9487                 if (*RExC_parse == '-')
9488                     RExC_parse++;
9489                 while (isDIGIT(*RExC_parse))
9490                         RExC_parse++;
9491                 if (*RExC_parse!=')')
9492                     vFAIL("Expecting close bracket");
9493
9494               gen_recurse_regop:
9495                 if ( paren == '-' ) {
9496                     /*
9497                     Diagram of capture buffer numbering.
9498                     Top line is the normal capture buffer numbers
9499                     Bottom line is the negative indexing as from
9500                     the X (the (?-2))
9501
9502                     +   1 2    3 4 5 X          6 7
9503                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9504                     -   5 4    3 2 1 X          x x
9505
9506                     */
9507                     num = RExC_npar + num;
9508                     if (num < 1)  {
9509                         RExC_parse++;
9510                         vFAIL("Reference to nonexistent group");
9511                     }
9512                 } else if ( paren == '+' ) {
9513                     num = RExC_npar + num - 1;
9514                 }
9515
9516                 ret = reganode(pRExC_state, GOSUB, num);
9517                 if (!SIZE_ONLY) {
9518                     if (num > (I32)RExC_rx->nparens) {
9519                         RExC_parse++;
9520                         vFAIL("Reference to nonexistent group");
9521                     }
9522                     ARG2L_SET( ret, RExC_recurse_count++);
9523                     RExC_emit++;
9524                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9525                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9526                 } else {
9527                     RExC_size++;
9528                 }
9529                 RExC_seen |= REG_SEEN_RECURSE;
9530                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9531                 Set_Node_Offset(ret, parse_start); /* MJD */
9532
9533                 *flagp |= POSTPONED;
9534                 nextchar(pRExC_state);
9535                 return ret;
9536             } /* named and numeric backreferences */
9537             assert(0); /* NOT REACHED */
9538
9539             case '?':           /* (??...) */
9540                 is_logical = 1;
9541                 if (*RExC_parse != '{') {
9542                     RExC_parse++;
9543                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9544                     vFAIL2utf8f(
9545                         "Sequence (%"UTF8f"...) not recognized",
9546                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9547                     /*NOTREACHED*/
9548                 }
9549                 *flagp |= POSTPONED;
9550                 paren = *RExC_parse++;
9551                 /* FALL THROUGH */
9552             case '{':           /* (?{...}) */
9553             {
9554                 U32 n = 0;
9555                 struct reg_code_block *cb;
9556
9557                 RExC_seen_zerolen++;
9558
9559                 if (   !pRExC_state->num_code_blocks
9560                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9561                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9562                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9563                             - RExC_start)
9564                 ) {
9565                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9566                         FAIL("panic: Sequence (?{...}): no code block found\n");
9567                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9568                 }
9569                 /* this is a pre-compiled code block (?{...}) */
9570                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9571                 RExC_parse = RExC_start + cb->end;
9572                 if (!SIZE_ONLY) {
9573                     OP *o = cb->block;
9574                     if (cb->src_regex) {
9575                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9576                         RExC_rxi->data->data[n] =
9577                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9578                         RExC_rxi->data->data[n+1] = (void*)o;
9579                     }
9580                     else {
9581                         n = add_data(pRExC_state,
9582                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9583                         RExC_rxi->data->data[n] = (void*)o;
9584                     }
9585                 }
9586                 pRExC_state->code_index++;
9587                 nextchar(pRExC_state);
9588
9589                 if (is_logical) {
9590                     regnode *eval;
9591                     ret = reg_node(pRExC_state, LOGICAL);
9592                     eval = reganode(pRExC_state, EVAL, n);
9593                     if (!SIZE_ONLY) {
9594                         ret->flags = 2;
9595                         /* for later propagation into (??{}) return value */
9596                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9597                     }
9598                     REGTAIL(pRExC_state, ret, eval);
9599                     /* deal with the length of this later - MJD */
9600                     return ret;
9601                 }
9602                 ret = reganode(pRExC_state, EVAL, n);
9603                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9604                 Set_Node_Offset(ret, parse_start);
9605                 return ret;
9606             }
9607             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9608             {
9609                 int is_define= 0;
9610                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9611                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9612                         || RExC_parse[1] == '<'
9613                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9614                         I32 flag;
9615                         regnode *tail;
9616
9617                         ret = reg_node(pRExC_state, LOGICAL);
9618                         if (!SIZE_ONLY)
9619                             ret->flags = 1;
9620
9621                         tail = reg(pRExC_state, 1, &flag, depth+1);
9622                         if (flag & RESTART_UTF8) {
9623                             *flagp = RESTART_UTF8;
9624                             return NULL;
9625                         }
9626                         REGTAIL(pRExC_state, ret, tail);
9627                         goto insert_if;
9628                     }
9629                 }
9630                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9631                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9632                 {
9633                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9634                     char *name_start= RExC_parse++;
9635                     U32 num = 0;
9636                     SV *sv_dat=reg_scan_name(pRExC_state,
9637                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9638                     if (RExC_parse == name_start || *RExC_parse != ch)
9639                         vFAIL2("Sequence (?(%c... not terminated",
9640                             (ch == '>' ? '<' : ch));
9641                     RExC_parse++;
9642                     if (!SIZE_ONLY) {
9643                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9644                         RExC_rxi->data->data[num]=(void*)sv_dat;
9645                         SvREFCNT_inc_simple_void(sv_dat);
9646                     }
9647                     ret = reganode(pRExC_state,NGROUPP,num);
9648                     goto insert_if_check_paren;
9649                 }
9650                 else if (RExC_parse[0] == 'D' &&
9651                          RExC_parse[1] == 'E' &&
9652                          RExC_parse[2] == 'F' &&
9653                          RExC_parse[3] == 'I' &&
9654                          RExC_parse[4] == 'N' &&
9655                          RExC_parse[5] == 'E')
9656                 {
9657                     ret = reganode(pRExC_state,DEFINEP,0);
9658                     RExC_parse +=6 ;
9659                     is_define = 1;
9660                     goto insert_if_check_paren;
9661                 }
9662                 else if (RExC_parse[0] == 'R') {
9663                     RExC_parse++;
9664                     parno = 0;
9665                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9666                         parno = atoi(RExC_parse++);
9667                         while (isDIGIT(*RExC_parse))
9668                             RExC_parse++;
9669                     } else if (RExC_parse[0] == '&') {
9670                         SV *sv_dat;
9671                         RExC_parse++;
9672                         sv_dat = reg_scan_name(pRExC_state,
9673                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9674                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9675                     }
9676                     ret = reganode(pRExC_state,INSUBP,parno);
9677                     goto insert_if_check_paren;
9678                 }
9679                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9680                     /* (?(1)...) */
9681                     char c;
9682                     char *tmp;
9683                     parno = atoi(RExC_parse++);
9684
9685                     while (isDIGIT(*RExC_parse))
9686                         RExC_parse++;
9687                     ret = reganode(pRExC_state, GROUPP, parno);
9688
9689                  insert_if_check_paren:
9690                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9691                         /* nextchar also skips comments, so undo its work
9692                          * and skip over the the next character.
9693                          */
9694                         RExC_parse = tmp;
9695                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9696                         vFAIL("Switch condition not recognized");
9697                     }
9698                   insert_if:
9699                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9700                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9701                     if (br == NULL) {
9702                         if (flags & RESTART_UTF8) {
9703                             *flagp = RESTART_UTF8;
9704                             return NULL;
9705                         }
9706                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9707                               (UV) flags);
9708                     } else
9709                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9710                     c = *nextchar(pRExC_state);
9711                     if (flags&HASWIDTH)
9712                         *flagp |= HASWIDTH;
9713                     if (c == '|') {
9714                         if (is_define)
9715                             vFAIL("(?(DEFINE)....) does not allow branches");
9716                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9717                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9718                             if (flags & RESTART_UTF8) {
9719                                 *flagp = RESTART_UTF8;
9720                                 return NULL;
9721                             }
9722                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9723                                   (UV) flags);
9724                         }
9725                         REGTAIL(pRExC_state, ret, lastbr);
9726                         if (flags&HASWIDTH)
9727                             *flagp |= HASWIDTH;
9728                         c = *nextchar(pRExC_state);
9729                     }
9730                     else
9731                         lastbr = NULL;
9732                     if (c != ')')
9733                         vFAIL("Switch (?(condition)... contains too many branches");
9734                     ender = reg_node(pRExC_state, TAIL);
9735                     REGTAIL(pRExC_state, br, ender);
9736                     if (lastbr) {
9737                         REGTAIL(pRExC_state, lastbr, ender);
9738                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9739                     }
9740                     else
9741                         REGTAIL(pRExC_state, ret, ender);
9742                     RExC_size++; /* XXX WHY do we need this?!!
9743                                     For large programs it seems to be required
9744                                     but I can't figure out why. -- dmq*/
9745                     return ret;
9746                 }
9747                 else {
9748                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9749                     vFAIL("Unknown switch condition (?(...))");
9750                 }
9751             }
9752             case '[':           /* (?[ ... ]) */
9753                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9754                                          oregcomp_parse);
9755             case 0:
9756                 RExC_parse--; /* for vFAIL to print correctly */
9757                 vFAIL("Sequence (? incomplete");
9758                 break;
9759             default: /* e.g., (?i) */
9760                 --RExC_parse;
9761               parse_flags:
9762                 parse_lparen_question_flags(pRExC_state);
9763                 if (UCHARAT(RExC_parse) != ':') {
9764                     nextchar(pRExC_state);
9765                     *flagp = TRYAGAIN;
9766                     return NULL;
9767                 }
9768                 paren = ':';
9769                 nextchar(pRExC_state);
9770                 ret = NULL;
9771                 goto parse_rest;
9772             } /* end switch */
9773         }
9774         else {                  /* (...) */
9775           capturing_parens:
9776             parno = RExC_npar;
9777             RExC_npar++;
9778
9779             ret = reganode(pRExC_state, OPEN, parno);
9780             if (!SIZE_ONLY ){
9781                 if (!RExC_nestroot)
9782                     RExC_nestroot = parno;
9783                 if (RExC_seen & REG_SEEN_RECURSE
9784                     && !RExC_open_parens[parno-1])
9785                 {
9786                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9787                         "Setting open paren #%"IVdf" to %d\n",
9788                         (IV)parno, REG_NODE_NUM(ret)));
9789                     RExC_open_parens[parno-1]= ret;
9790                 }
9791             }
9792             Set_Node_Length(ret, 1); /* MJD */
9793             Set_Node_Offset(ret, RExC_parse); /* MJD */
9794             is_open = 1;
9795         }
9796     }
9797     else                        /* ! paren */
9798         ret = NULL;
9799
9800    parse_rest:
9801     /* Pick up the branches, linking them together. */
9802     parse_start = RExC_parse;   /* MJD */
9803     br = regbranch(pRExC_state, &flags, 1,depth+1);
9804
9805     /*     branch_len = (paren != 0); */
9806
9807     if (br == NULL) {
9808         if (flags & RESTART_UTF8) {
9809             *flagp = RESTART_UTF8;
9810             return NULL;
9811         }
9812         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9813     }
9814     if (*RExC_parse == '|') {
9815         if (!SIZE_ONLY && RExC_extralen) {
9816             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9817         }
9818         else {                  /* MJD */
9819             reginsert(pRExC_state, BRANCH, br, depth+1);
9820             Set_Node_Length(br, paren != 0);
9821             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9822         }
9823         have_branch = 1;
9824         if (SIZE_ONLY)
9825             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9826     }
9827     else if (paren == ':') {
9828         *flagp |= flags&SIMPLE;
9829     }
9830     if (is_open) {                              /* Starts with OPEN. */
9831         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9832     }
9833     else if (paren != '?')              /* Not Conditional */
9834         ret = br;
9835     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9836     lastbr = br;
9837     while (*RExC_parse == '|') {
9838         if (!SIZE_ONLY && RExC_extralen) {
9839             ender = reganode(pRExC_state, LONGJMP,0);
9840             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9841         }
9842         if (SIZE_ONLY)
9843             RExC_extralen += 2;         /* Account for LONGJMP. */
9844         nextchar(pRExC_state);
9845         if (freeze_paren) {
9846             if (RExC_npar > after_freeze)
9847                 after_freeze = RExC_npar;
9848             RExC_npar = freeze_paren;
9849         }
9850         br = regbranch(pRExC_state, &flags, 0, depth+1);
9851
9852         if (br == NULL) {
9853             if (flags & RESTART_UTF8) {
9854                 *flagp = RESTART_UTF8;
9855                 return NULL;
9856             }
9857             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9858         }
9859         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9860         lastbr = br;
9861         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9862     }
9863
9864     if (have_branch || paren != ':') {
9865         /* Make a closing node, and hook it on the end. */
9866         switch (paren) {
9867         case ':':
9868             ender = reg_node(pRExC_state, TAIL);
9869             break;
9870         case 1: case 2:
9871             ender = reganode(pRExC_state, CLOSE, parno);
9872             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9873                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9874                         "Setting close paren #%"IVdf" to %d\n",
9875                         (IV)parno, REG_NODE_NUM(ender)));
9876                 RExC_close_parens[parno-1]= ender;
9877                 if (RExC_nestroot == parno)
9878                     RExC_nestroot = 0;
9879             }
9880             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9881             Set_Node_Length(ender,1); /* MJD */
9882             break;
9883         case '<':
9884         case ',':
9885         case '=':
9886         case '!':
9887             *flagp &= ~HASWIDTH;
9888             /* FALL THROUGH */
9889         case '>':
9890             ender = reg_node(pRExC_state, SUCCEED);
9891             break;
9892         case 0:
9893             ender = reg_node(pRExC_state, END);
9894             if (!SIZE_ONLY) {
9895                 assert(!RExC_opend); /* there can only be one! */
9896                 RExC_opend = ender;
9897             }
9898             break;
9899         }
9900         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9901             SV * const mysv_val1=sv_newmortal();
9902             SV * const mysv_val2=sv_newmortal();
9903             DEBUG_PARSE_MSG("lsbr");
9904             regprop(RExC_rx, mysv_val1, lastbr);
9905             regprop(RExC_rx, mysv_val2, ender);
9906             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9907                           SvPV_nolen_const(mysv_val1),
9908                           (IV)REG_NODE_NUM(lastbr),
9909                           SvPV_nolen_const(mysv_val2),
9910                           (IV)REG_NODE_NUM(ender),
9911                           (IV)(ender - lastbr)
9912             );
9913         });
9914         REGTAIL(pRExC_state, lastbr, ender);
9915
9916         if (have_branch && !SIZE_ONLY) {
9917             char is_nothing= 1;
9918             if (depth==1)
9919                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9920
9921             /* Hook the tails of the branches to the closing node. */
9922             for (br = ret; br; br = regnext(br)) {
9923                 const U8 op = PL_regkind[OP(br)];
9924                 if (op == BRANCH) {
9925                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9926                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9927                         is_nothing= 0;
9928                 }
9929                 else if (op == BRANCHJ) {
9930                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9931                     /* for now we always disable this optimisation * /
9932                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9933                     */
9934                         is_nothing= 0;
9935                 }
9936             }
9937             if (is_nothing) {
9938                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9939                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9940                     SV * const mysv_val1=sv_newmortal();
9941                     SV * const mysv_val2=sv_newmortal();
9942                     DEBUG_PARSE_MSG("NADA");
9943                     regprop(RExC_rx, mysv_val1, ret);
9944                     regprop(RExC_rx, mysv_val2, ender);
9945                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9946                                   SvPV_nolen_const(mysv_val1),
9947                                   (IV)REG_NODE_NUM(ret),
9948                                   SvPV_nolen_const(mysv_val2),
9949                                   (IV)REG_NODE_NUM(ender),
9950                                   (IV)(ender - ret)
9951                     );
9952                 });
9953                 OP(br)= NOTHING;
9954                 if (OP(ender) == TAIL) {
9955                     NEXT_OFF(br)= 0;
9956                     RExC_emit= br + 1;
9957                 } else {
9958                     regnode *opt;
9959                     for ( opt= br + 1; opt < ender ; opt++ )
9960                         OP(opt)= OPTIMIZED;
9961                     NEXT_OFF(br)= ender - br;
9962                 }
9963             }
9964         }
9965     }
9966
9967     {
9968         const char *p;
9969         static const char parens[] = "=!<,>";
9970
9971         if (paren && (p = strchr(parens, paren))) {
9972             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9973             int flag = (p - parens) > 1;
9974
9975             if (paren == '>')
9976                 node = SUSPEND, flag = 0;
9977             reginsert(pRExC_state, node,ret, depth+1);
9978             Set_Node_Cur_Length(ret, parse_start);
9979             Set_Node_Offset(ret, parse_start + 1);
9980             ret->flags = flag;
9981             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9982         }
9983     }
9984
9985     /* Check for proper termination. */
9986     if (paren) {
9987         /* restore original flags, but keep (?p) */
9988         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9989         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9990             RExC_parse = oregcomp_parse;
9991             vFAIL("Unmatched (");
9992         }
9993     }
9994     else if (!paren && RExC_parse < RExC_end) {
9995         if (*RExC_parse == ')') {
9996             RExC_parse++;
9997             vFAIL("Unmatched )");
9998         }
9999         else
10000             FAIL("Junk on end of regexp");      /* "Can't happen". */
10001         assert(0); /* NOTREACHED */
10002     }
10003
10004     if (RExC_in_lookbehind) {
10005         RExC_in_lookbehind--;
10006     }
10007     if (after_freeze > RExC_npar)
10008         RExC_npar = after_freeze;
10009     return(ret);
10010 }
10011
10012 /*
10013  - regbranch - one alternative of an | operator
10014  *
10015  * Implements the concatenation operator.
10016  *
10017  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10018  * restarted.
10019  */
10020 STATIC regnode *
10021 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10022 {
10023     dVAR;
10024     regnode *ret;
10025     regnode *chain = NULL;
10026     regnode *latest;
10027     I32 flags = 0, c = 0;
10028     GET_RE_DEBUG_FLAGS_DECL;
10029
10030     PERL_ARGS_ASSERT_REGBRANCH;
10031
10032     DEBUG_PARSE("brnc");
10033
10034     if (first)
10035         ret = NULL;
10036     else {
10037         if (!SIZE_ONLY && RExC_extralen)
10038             ret = reganode(pRExC_state, BRANCHJ,0);
10039         else {
10040             ret = reg_node(pRExC_state, BRANCH);
10041             Set_Node_Length(ret, 1);
10042         }
10043     }
10044
10045     if (!first && SIZE_ONLY)
10046         RExC_extralen += 1;                     /* BRANCHJ */
10047
10048     *flagp = WORST;                     /* Tentatively. */
10049
10050     RExC_parse--;
10051     nextchar(pRExC_state);
10052     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10053         flags &= ~TRYAGAIN;
10054         latest = regpiece(pRExC_state, &flags,depth+1);
10055         if (latest == NULL) {
10056             if (flags & TRYAGAIN)
10057                 continue;
10058             if (flags & RESTART_UTF8) {
10059                 *flagp = RESTART_UTF8;
10060                 return NULL;
10061             }
10062             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10063         }
10064         else if (ret == NULL)
10065             ret = latest;
10066         *flagp |= flags&(HASWIDTH|POSTPONED);
10067         if (chain == NULL)      /* First piece. */
10068             *flagp |= flags&SPSTART;
10069         else {
10070             RExC_naughty++;
10071             REGTAIL(pRExC_state, chain, latest);
10072         }
10073         chain = latest;
10074         c++;
10075     }
10076     if (chain == NULL) {        /* Loop ran zero times. */
10077         chain = reg_node(pRExC_state, NOTHING);
10078         if (ret == NULL)
10079             ret = chain;
10080     }
10081     if (c == 1) {
10082         *flagp |= flags&SIMPLE;
10083     }
10084
10085     return ret;
10086 }
10087
10088 /*
10089  - regpiece - something followed by possible [*+?]
10090  *
10091  * Note that the branching code sequences used for ? and the general cases
10092  * of * and + are somewhat optimized:  they use the same NOTHING node as
10093  * both the endmarker for their branch list and the body of the last branch.
10094  * It might seem that this node could be dispensed with entirely, but the
10095  * endmarker role is not redundant.
10096  *
10097  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10098  * TRYAGAIN.
10099  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10100  * restarted.
10101  */
10102 STATIC regnode *
10103 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10104 {
10105     dVAR;
10106     regnode *ret;
10107     char op;
10108     char *next;
10109     I32 flags;
10110     const char * const origparse = RExC_parse;
10111     I32 min;
10112     I32 max = REG_INFTY;
10113 #ifdef RE_TRACK_PATTERN_OFFSETS
10114     char *parse_start;
10115 #endif
10116     const char *maxpos = NULL;
10117
10118     /* Save the original in case we change the emitted regop to a FAIL. */
10119     regnode * const orig_emit = RExC_emit;
10120
10121     GET_RE_DEBUG_FLAGS_DECL;
10122
10123     PERL_ARGS_ASSERT_REGPIECE;
10124
10125     DEBUG_PARSE("piec");
10126
10127     ret = regatom(pRExC_state, &flags,depth+1);
10128     if (ret == NULL) {
10129         if (flags & (TRYAGAIN|RESTART_UTF8))
10130             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10131         else
10132             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10133         return(NULL);
10134     }
10135
10136     op = *RExC_parse;
10137
10138     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10139         maxpos = NULL;
10140 #ifdef RE_TRACK_PATTERN_OFFSETS
10141         parse_start = RExC_parse; /* MJD */
10142 #endif
10143         next = RExC_parse + 1;
10144         while (isDIGIT(*next) || *next == ',') {
10145             if (*next == ',') {
10146                 if (maxpos)
10147                     break;
10148                 else
10149                     maxpos = next;
10150             }
10151             next++;
10152         }
10153         if (*next == '}') {             /* got one */
10154             if (!maxpos)
10155                 maxpos = next;
10156             RExC_parse++;
10157             min = atoi(RExC_parse);
10158             if (*maxpos == ',')
10159                 maxpos++;
10160             else
10161                 maxpos = RExC_parse;
10162             max = atoi(maxpos);
10163             if (!max && *maxpos != '0')
10164                 max = REG_INFTY;                /* meaning "infinity" */
10165             else if (max >= REG_INFTY)
10166                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10167             RExC_parse = next;
10168             nextchar(pRExC_state);
10169             if (max < min) {    /* If can't match, warn and optimize to fail
10170                                    unconditionally */
10171                 if (SIZE_ONLY) {
10172                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10173
10174                     /* We can't back off the size because we have to reserve
10175                      * enough space for all the things we are about to throw
10176                      * away, but we can shrink it by the ammount we are about
10177                      * to re-use here */
10178                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10179                 }
10180                 else {
10181                     RExC_emit = orig_emit;
10182                 }
10183                 ret = reg_node(pRExC_state, OPFAIL);
10184                 return ret;
10185             }
10186             else if (min == max
10187                      && RExC_parse < RExC_end
10188                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10189             {
10190                 if (SIZE_ONLY) {
10191                     ckWARN2reg(RExC_parse + 1,
10192                                "Useless use of greediness modifier '%c'",
10193                                *RExC_parse);
10194                 }
10195                 /* Absorb the modifier, so later code doesn't see nor use
10196                     * it */
10197                 nextchar(pRExC_state);
10198             }
10199
10200         do_curly:
10201             if ((flags&SIMPLE)) {
10202                 RExC_naughty += 2 + RExC_naughty / 2;
10203                 reginsert(pRExC_state, CURLY, ret, depth+1);
10204                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10205                 Set_Node_Cur_Length(ret, parse_start);
10206             }
10207             else {
10208                 regnode * const w = reg_node(pRExC_state, WHILEM);
10209
10210                 w->flags = 0;
10211                 REGTAIL(pRExC_state, ret, w);
10212                 if (!SIZE_ONLY && RExC_extralen) {
10213                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10214                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10215                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10216                 }
10217                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10218                                 /* MJD hk */
10219                 Set_Node_Offset(ret, parse_start+1);
10220                 Set_Node_Length(ret,
10221                                 op == '{' ? (RExC_parse - parse_start) : 1);
10222
10223                 if (!SIZE_ONLY && RExC_extralen)
10224                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10225                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10226                 if (SIZE_ONLY)
10227                     RExC_whilem_seen++, RExC_extralen += 3;
10228                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10229             }
10230             ret->flags = 0;
10231
10232             if (min > 0)
10233                 *flagp = WORST;
10234             if (max > 0)
10235                 *flagp |= HASWIDTH;
10236             if (!SIZE_ONLY) {
10237                 ARG1_SET(ret, (U16)min);
10238                 ARG2_SET(ret, (U16)max);
10239             }
10240
10241             goto nest_check;
10242         }
10243     }
10244
10245     if (!ISMULT1(op)) {
10246         *flagp = flags;
10247         return(ret);
10248     }
10249
10250 #if 0                           /* Now runtime fix should be reliable. */
10251
10252     /* if this is reinstated, don't forget to put this back into perldiag:
10253
10254             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10255
10256            (F) The part of the regexp subject to either the * or + quantifier
10257            could match an empty string. The {#} shows in the regular
10258            expression about where the problem was discovered.
10259
10260     */
10261
10262     if (!(flags&HASWIDTH) && op != '?')
10263       vFAIL("Regexp *+ operand could be empty");
10264 #endif
10265
10266 #ifdef RE_TRACK_PATTERN_OFFSETS
10267     parse_start = RExC_parse;
10268 #endif
10269     nextchar(pRExC_state);
10270
10271     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10272
10273     if (op == '*' && (flags&SIMPLE)) {
10274         reginsert(pRExC_state, STAR, ret, depth+1);
10275         ret->flags = 0;
10276         RExC_naughty += 4;
10277     }
10278     else if (op == '*') {
10279         min = 0;
10280         goto do_curly;
10281     }
10282     else if (op == '+' && (flags&SIMPLE)) {
10283         reginsert(pRExC_state, PLUS, ret, depth+1);
10284         ret->flags = 0;
10285         RExC_naughty += 3;
10286     }
10287     else if (op == '+') {
10288         min = 1;
10289         goto do_curly;
10290     }
10291     else if (op == '?') {
10292         min = 0; max = 1;
10293         goto do_curly;
10294     }
10295   nest_check:
10296     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10297         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10298         ckWARN2reg(RExC_parse,
10299                    "%"UTF8f" matches null string many times",
10300                    UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
10301                    origparse));
10302         (void)ReREFCNT_inc(RExC_rx_sv);
10303     }
10304
10305     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10306         nextchar(pRExC_state);
10307         reginsert(pRExC_state, MINMOD, ret, depth+1);
10308         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10309     }
10310     else
10311     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10312         regnode *ender;
10313         nextchar(pRExC_state);
10314         ender = reg_node(pRExC_state, SUCCEED);
10315         REGTAIL(pRExC_state, ret, ender);
10316         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10317         ret->flags = 0;
10318         ender = reg_node(pRExC_state, TAIL);
10319         REGTAIL(pRExC_state, ret, ender);
10320     }
10321
10322     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10323         RExC_parse++;
10324         vFAIL("Nested quantifiers");
10325     }
10326
10327     return(ret);
10328 }
10329
10330 STATIC bool
10331 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10332         const bool strict   /* Apply stricter parsing rules? */
10333     )
10334 {
10335
10336  /* This is expected to be called by a parser routine that has recognized '\N'
10337    and needs to handle the rest. RExC_parse is expected to point at the first
10338    char following the N at the time of the call.  On successful return,
10339    RExC_parse has been updated to point to just after the sequence identified
10340    by this routine, and <*flagp> has been updated.
10341
10342    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10343    character class.
10344
10345    \N may begin either a named sequence, or if outside a character class, mean
10346    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10347    attempted to decide which, and in the case of a named sequence, converted it
10348    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10349    where c1... are the characters in the sequence.  For single-quoted regexes,
10350    the tokenizer passes the \N sequence through unchanged; this code will not
10351    attempt to determine this nor expand those, instead raising a syntax error.
10352    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10353    or there is no '}', it signals that this \N occurrence means to match a
10354    non-newline.
10355
10356    Only the \N{U+...} form should occur in a character class, for the same
10357    reason that '.' inside a character class means to just match a period: it
10358    just doesn't make sense.
10359
10360    The function raises an error (via vFAIL), and doesn't return for various
10361    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10362    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10363    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10364    only possible if node_p is non-NULL.
10365
10366
10367    If <valuep> is non-null, it means the caller can accept an input sequence
10368    consisting of a just a single code point; <*valuep> is set to that value
10369    if the input is such.
10370
10371    If <node_p> is non-null it signifies that the caller can accept any other
10372    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10373    is set as follows:
10374     1) \N means not-a-NL: points to a newly created REG_ANY node;
10375     2) \N{}:              points to a new NOTHING node;
10376     3) otherwise:         points to a new EXACT node containing the resolved
10377                           string.
10378    Note that FALSE is returned for single code point sequences if <valuep> is
10379    null.
10380  */
10381
10382     char * endbrace;    /* '}' following the name */
10383     char* p;
10384     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10385                            stream */
10386     bool has_multiple_chars; /* true if the input stream contains a sequence of
10387                                 more than one character */
10388
10389     GET_RE_DEBUG_FLAGS_DECL;
10390
10391     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10392
10393     GET_RE_DEBUG_FLAGS;
10394
10395     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10396
10397     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10398      * modifier.  The other meaning does not, so use a temporary until we find
10399      * out which we are being called with */
10400     p = (RExC_flags & RXf_PMf_EXTENDED)
10401         ? regwhite( pRExC_state, RExC_parse )
10402         : RExC_parse;
10403
10404     /* Disambiguate between \N meaning a named character versus \N meaning
10405      * [^\n].  The former is assumed when it can't be the latter. */
10406     if (*p != '{' || regcurly(p, FALSE)) {
10407         RExC_parse = p;
10408         if (! node_p) {
10409             /* no bare \N allowed in a charclass */
10410             if (in_char_class) {
10411                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10412             }
10413             return FALSE;
10414         }
10415         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10416                            current char */
10417         nextchar(pRExC_state);
10418         *node_p = reg_node(pRExC_state, REG_ANY);
10419         *flagp |= HASWIDTH|SIMPLE;
10420         RExC_naughty++;
10421         Set_Node_Length(*node_p, 1); /* MJD */
10422         return TRUE;
10423     }
10424
10425     /* Here, we have decided it should be a named character or sequence */
10426
10427     /* The test above made sure that the next real character is a '{', but
10428      * under the /x modifier, it could be separated by space (or a comment and
10429      * \n) and this is not allowed (for consistency with \x{...} and the
10430      * tokenizer handling of \N{NAME}). */
10431     if (*RExC_parse != '{') {
10432         vFAIL("Missing braces on \\N{}");
10433     }
10434
10435     RExC_parse++;       /* Skip past the '{' */
10436
10437     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10438         || ! (endbrace == RExC_parse            /* nothing between the {} */
10439               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
10440                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
10441     {
10442         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10443         vFAIL("\\N{NAME} must be resolved by the lexer");
10444     }
10445
10446     if (endbrace == RExC_parse) {   /* empty: \N{} */
10447         bool ret = TRUE;
10448         if (node_p) {
10449             *node_p = reg_node(pRExC_state,NOTHING);
10450         }
10451         else if (in_char_class) {
10452             if (SIZE_ONLY && in_char_class) {
10453                 if (strict) {
10454                     RExC_parse++;   /* Position after the "}" */
10455                     vFAIL("Zero length \\N{}");
10456                 }
10457                 else {
10458                     ckWARNreg(RExC_parse,
10459                               "Ignoring zero length \\N{} in character class");
10460                 }
10461             }
10462             ret = FALSE;
10463         }
10464         else {
10465             return FALSE;
10466         }
10467         nextchar(pRExC_state);
10468         return ret;
10469     }
10470
10471     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10472     RExC_parse += 2;    /* Skip past the 'U+' */
10473
10474     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10475
10476     /* Code points are separated by dots.  If none, there is only one code
10477      * point, and is terminated by the brace */
10478     has_multiple_chars = (endchar < endbrace);
10479
10480     if (valuep && (! has_multiple_chars || in_char_class)) {
10481         /* We only pay attention to the first char of
10482         multichar strings being returned in char classes. I kinda wonder
10483         if this makes sense as it does change the behaviour
10484         from earlier versions, OTOH that behaviour was broken
10485         as well. XXX Solution is to recharacterize as
10486         [rest-of-class]|multi1|multi2... */
10487
10488         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10489         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10490             | PERL_SCAN_DISALLOW_PREFIX
10491             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10492
10493         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10494
10495         /* The tokenizer should have guaranteed validity, but it's possible to
10496          * bypass it by using single quoting, so check */
10497         if (length_of_hex == 0
10498             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10499         {
10500             RExC_parse += length_of_hex;        /* Includes all the valid */
10501             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10502                             ? UTF8SKIP(RExC_parse)
10503                             : 1;
10504             /* Guard against malformed utf8 */
10505             if (RExC_parse >= endchar) {
10506                 RExC_parse = endchar;
10507             }
10508             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10509         }
10510
10511         if (in_char_class && has_multiple_chars) {
10512             if (strict) {
10513                 RExC_parse = endbrace;
10514                 vFAIL("\\N{} in character class restricted to one character");
10515             }
10516             else {
10517                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10518             }
10519         }
10520
10521         RExC_parse = endbrace + 1;
10522     }
10523     else if (! node_p || ! has_multiple_chars) {
10524
10525         /* Here, the input is legal, but not according to the caller's
10526          * options.  We fail without advancing the parse, so that the
10527          * caller can try again */
10528         RExC_parse = p;
10529         return FALSE;
10530     }
10531     else {
10532
10533         /* What is done here is to convert this to a sub-pattern of the form
10534          * (?:\x{char1}\x{char2}...)
10535          * and then call reg recursively.  That way, it retains its atomicness,
10536          * while not having to worry about special handling that some code
10537          * points may have.  toke.c has converted the original Unicode values
10538          * to native, so that we can just pass on the hex values unchanged.  We
10539          * do have to set a flag to keep recoding from happening in the
10540          * recursion */
10541
10542         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10543         STRLEN len;
10544         char *orig_end = RExC_end;
10545         I32 flags;
10546
10547         while (RExC_parse < endbrace) {
10548
10549             /* Convert to notation the rest of the code understands */
10550             sv_catpv(substitute_parse, "\\x{");
10551             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10552             sv_catpv(substitute_parse, "}");
10553
10554             /* Point to the beginning of the next character in the sequence. */
10555             RExC_parse = endchar + 1;
10556             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10557         }
10558         sv_catpv(substitute_parse, ")");
10559
10560         RExC_parse = SvPV(substitute_parse, len);
10561
10562         /* Don't allow empty number */
10563         if (len < 8) {
10564             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10565         }
10566         RExC_end = RExC_parse + len;
10567
10568         /* The values are Unicode, and therefore not subject to recoding */
10569         RExC_override_recoding = 1;
10570
10571         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10572             if (flags & RESTART_UTF8) {
10573                 *flagp = RESTART_UTF8;
10574                 return FALSE;
10575             }
10576             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10577                   (UV) flags);
10578         }
10579         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10580
10581         RExC_parse = endbrace;
10582         RExC_end = orig_end;
10583         RExC_override_recoding = 0;
10584
10585         nextchar(pRExC_state);
10586     }
10587
10588     return TRUE;
10589 }
10590
10591
10592 /*
10593  * reg_recode
10594  *
10595  * It returns the code point in utf8 for the value in *encp.
10596  *    value: a code value in the source encoding
10597  *    encp:  a pointer to an Encode object
10598  *
10599  * If the result from Encode is not a single character,
10600  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10601  */
10602 STATIC UV
10603 S_reg_recode(pTHX_ const char value, SV **encp)
10604 {
10605     STRLEN numlen = 1;
10606     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10607     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10608     const STRLEN newlen = SvCUR(sv);
10609     UV uv = UNICODE_REPLACEMENT;
10610
10611     PERL_ARGS_ASSERT_REG_RECODE;
10612
10613     if (newlen)
10614         uv = SvUTF8(sv)
10615              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10616              : *(U8*)s;
10617
10618     if (!newlen || numlen != newlen) {
10619         uv = UNICODE_REPLACEMENT;
10620         *encp = NULL;
10621     }
10622     return uv;
10623 }
10624
10625 PERL_STATIC_INLINE U8
10626 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10627 {
10628     U8 op;
10629
10630     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10631
10632     if (! FOLD) {
10633         return EXACT;
10634     }
10635
10636     op = get_regex_charset(RExC_flags);
10637     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10638         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10639                  been, so there is no hole */
10640     }
10641
10642     return op + EXACTF;
10643 }
10644
10645 PERL_STATIC_INLINE void
10646 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10647 {
10648     /* This knows the details about sizing an EXACTish node, setting flags for
10649      * it (by setting <*flagp>, and potentially populating it with a single
10650      * character.
10651      *
10652      * If <len> (the length in bytes) is non-zero, this function assumes that
10653      * the node has already been populated, and just does the sizing.  In this
10654      * case <code_point> should be the final code point that has already been
10655      * placed into the node.  This value will be ignored except that under some
10656      * circumstances <*flagp> is set based on it.
10657      *
10658      * If <len> is zero, the function assumes that the node is to contain only
10659      * the single character given by <code_point> and calculates what <len>
10660      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10661      * additionally will populate the node's STRING with <code_point>, if <len>
10662      * is 0.  In both cases <*flagp> is appropriately set
10663      *
10664      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10665      * 255, must be folded (the former only when the rules indicate it can
10666      * match 'ss') */
10667
10668     bool len_passed_in = cBOOL(len != 0);
10669     U8 character[UTF8_MAXBYTES_CASE+1];
10670
10671     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10672
10673     if (! len_passed_in) {
10674         if (UTF) {
10675             if (FOLD && (! LOC || code_point > 255)) {
10676                 _to_uni_fold_flags(code_point,
10677                                    character,
10678                                    &len,
10679                                    FOLD_FLAGS_FULL | ((LOC)
10680                                                      ? FOLD_FLAGS_LOCALE
10681                                                      : (ASCII_FOLD_RESTRICTED)
10682                                                        ? FOLD_FLAGS_NOMIX_ASCII
10683                                                        : 0));
10684             }
10685             else {
10686                 uvchr_to_utf8( character, code_point);
10687                 len = UTF8SKIP(character);
10688             }
10689         }
10690         else if (! FOLD
10691                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10692                  || ASCII_FOLD_RESTRICTED
10693                  || ! AT_LEAST_UNI_SEMANTICS)
10694         {
10695             *character = (U8) code_point;
10696             len = 1;
10697         }
10698         else {
10699             *character = 's';
10700             *(character + 1) = 's';
10701             len = 2;
10702         }
10703     }
10704
10705     if (SIZE_ONLY) {
10706         RExC_size += STR_SZ(len);
10707     }
10708     else {
10709         RExC_emit += STR_SZ(len);
10710         STR_LEN(node) = len;
10711         if (! len_passed_in) {
10712             Copy((char *) character, STRING(node), len, char);
10713         }
10714     }
10715
10716     *flagp |= HASWIDTH;
10717
10718     /* A single character node is SIMPLE, except for the special-cased SHARP S
10719      * under /di. */
10720     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10721         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10722             || ! FOLD || ! DEPENDS_SEMANTICS))
10723     {
10724         *flagp |= SIMPLE;
10725     }
10726 }
10727
10728
10729 /* return atoi(p), unless it's too big to sensibly be a backref,
10730  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10731
10732 static I32
10733 S_backref_value(char *p)
10734 {
10735     char *q = p;
10736
10737     for (;isDIGIT(*q); q++); /* calculate length of num */
10738     if (q - p == 0 || q - p > 9)
10739         return I32_MAX;
10740     return atoi(p);
10741 }
10742
10743
10744 /*
10745  - regatom - the lowest level
10746
10747    Try to identify anything special at the start of the pattern. If there
10748    is, then handle it as required. This may involve generating a single regop,
10749    such as for an assertion; or it may involve recursing, such as to
10750    handle a () structure.
10751
10752    If the string doesn't start with something special then we gobble up
10753    as much literal text as we can.
10754
10755    Once we have been able to handle whatever type of thing started the
10756    sequence, we return.
10757
10758    Note: we have to be careful with escapes, as they can be both literal
10759    and special, and in the case of \10 and friends, context determines which.
10760
10761    A summary of the code structure is:
10762
10763    switch (first_byte) {
10764         cases for each special:
10765             handle this special;
10766             break;
10767         case '\\':
10768             switch (2nd byte) {
10769                 cases for each unambiguous special:
10770                     handle this special;
10771                     break;
10772                 cases for each ambigous special/literal:
10773                     disambiguate;
10774                     if (special)  handle here
10775                     else goto defchar;
10776                 default: // unambiguously literal:
10777                     goto defchar;
10778             }
10779         default:  // is a literal char
10780             // FALL THROUGH
10781         defchar:
10782             create EXACTish node for literal;
10783             while (more input and node isn't full) {
10784                 switch (input_byte) {
10785                    cases for each special;
10786                        make sure parse pointer is set so that the next call to
10787                            regatom will see this special first
10788                        goto loopdone; // EXACTish node terminated by prev. char
10789                    default:
10790                        append char to EXACTISH node;
10791                 }
10792                 get next input byte;
10793             }
10794         loopdone:
10795    }
10796    return the generated node;
10797
10798    Specifically there are two separate switches for handling
10799    escape sequences, with the one for handling literal escapes requiring
10800    a dummy entry for all of the special escapes that are actually handled
10801    by the other.
10802
10803    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10804    TRYAGAIN.
10805    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10806    restarted.
10807    Otherwise does not return NULL.
10808 */
10809
10810 STATIC regnode *
10811 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10812 {
10813     dVAR;
10814     regnode *ret = NULL;
10815     I32 flags = 0;
10816     char *parse_start = RExC_parse;
10817     U8 op;
10818     int invert = 0;
10819
10820     GET_RE_DEBUG_FLAGS_DECL;
10821
10822     *flagp = WORST;             /* Tentatively. */
10823
10824     DEBUG_PARSE("atom");
10825
10826     PERL_ARGS_ASSERT_REGATOM;
10827
10828 tryagain:
10829     switch ((U8)*RExC_parse) {
10830     case '^':
10831         RExC_seen_zerolen++;
10832         nextchar(pRExC_state);
10833         if (RExC_flags & RXf_PMf_MULTILINE)
10834             ret = reg_node(pRExC_state, MBOL);
10835         else if (RExC_flags & RXf_PMf_SINGLELINE)
10836             ret = reg_node(pRExC_state, SBOL);
10837         else
10838             ret = reg_node(pRExC_state, BOL);
10839         Set_Node_Length(ret, 1); /* MJD */
10840         break;
10841     case '$':
10842         nextchar(pRExC_state);
10843         if (*RExC_parse)
10844             RExC_seen_zerolen++;
10845         if (RExC_flags & RXf_PMf_MULTILINE)
10846             ret = reg_node(pRExC_state, MEOL);
10847         else if (RExC_flags & RXf_PMf_SINGLELINE)
10848             ret = reg_node(pRExC_state, SEOL);
10849         else
10850             ret = reg_node(pRExC_state, EOL);
10851         Set_Node_Length(ret, 1); /* MJD */
10852         break;
10853     case '.':
10854         nextchar(pRExC_state);
10855         if (RExC_flags & RXf_PMf_SINGLELINE)
10856             ret = reg_node(pRExC_state, SANY);
10857         else
10858             ret = reg_node(pRExC_state, REG_ANY);
10859         *flagp |= HASWIDTH|SIMPLE;
10860         RExC_naughty++;
10861         Set_Node_Length(ret, 1); /* MJD */
10862         break;
10863     case '[':
10864     {
10865         char * const oregcomp_parse = ++RExC_parse;
10866         ret = regclass(pRExC_state, flagp,depth+1,
10867                        FALSE, /* means parse the whole char class */
10868                        TRUE, /* allow multi-char folds */
10869                        FALSE, /* don't silence non-portable warnings. */
10870                        NULL);
10871         if (*RExC_parse != ']') {
10872             RExC_parse = oregcomp_parse;
10873             vFAIL("Unmatched [");
10874         }
10875         if (ret == NULL) {
10876             if (*flagp & RESTART_UTF8)
10877                 return NULL;
10878             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10879                   (UV) *flagp);
10880         }
10881         nextchar(pRExC_state);
10882         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10883         break;
10884     }
10885     case '(':
10886         nextchar(pRExC_state);
10887         ret = reg(pRExC_state, 2, &flags,depth+1);
10888         if (ret == NULL) {
10889                 if (flags & TRYAGAIN) {
10890                     if (RExC_parse == RExC_end) {
10891                          /* Make parent create an empty node if needed. */
10892                         *flagp |= TRYAGAIN;
10893                         return(NULL);
10894                     }
10895                     goto tryagain;
10896                 }
10897                 if (flags & RESTART_UTF8) {
10898                     *flagp = RESTART_UTF8;
10899                     return NULL;
10900                 }
10901                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10902         }
10903         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10904         break;
10905     case '|':
10906     case ')':
10907         if (flags & TRYAGAIN) {
10908             *flagp |= TRYAGAIN;
10909             return NULL;
10910         }
10911         vFAIL("Internal urp");
10912                                 /* Supposed to be caught earlier. */
10913         break;
10914     case '{':
10915         if (!regcurly(RExC_parse, FALSE)) {
10916             RExC_parse++;
10917             goto defchar;
10918         }
10919         /* FALL THROUGH */
10920     case '?':
10921     case '+':
10922     case '*':
10923         RExC_parse++;
10924         vFAIL("Quantifier follows nothing");
10925         break;
10926     case '\\':
10927         /* Special Escapes
10928
10929            This switch handles escape sequences that resolve to some kind
10930            of special regop and not to literal text. Escape sequnces that
10931            resolve to literal text are handled below in the switch marked
10932            "Literal Escapes".
10933
10934            Every entry in this switch *must* have a corresponding entry
10935            in the literal escape switch. However, the opposite is not
10936            required, as the default for this switch is to jump to the
10937            literal text handling code.
10938         */
10939         switch ((U8)*++RExC_parse) {
10940             U8 arg;
10941         /* Special Escapes */
10942         case 'A':
10943             RExC_seen_zerolen++;
10944             ret = reg_node(pRExC_state, SBOL);
10945             *flagp |= SIMPLE;
10946             goto finish_meta_pat;
10947         case 'G':
10948             ret = reg_node(pRExC_state, GPOS);
10949             RExC_seen |= REG_SEEN_GPOS;
10950             *flagp |= SIMPLE;
10951             goto finish_meta_pat;
10952         case 'K':
10953             RExC_seen_zerolen++;
10954             ret = reg_node(pRExC_state, KEEPS);
10955             *flagp |= SIMPLE;
10956             /* XXX:dmq : disabling in-place substitution seems to
10957              * be necessary here to avoid cases of memory corruption, as
10958              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10959              */
10960             RExC_seen |= REG_SEEN_LOOKBEHIND;
10961             goto finish_meta_pat;
10962         case 'Z':
10963             ret = reg_node(pRExC_state, SEOL);
10964             *flagp |= SIMPLE;
10965             RExC_seen_zerolen++;                /* Do not optimize RE away */
10966             goto finish_meta_pat;
10967         case 'z':
10968             ret = reg_node(pRExC_state, EOS);
10969             *flagp |= SIMPLE;
10970             RExC_seen_zerolen++;                /* Do not optimize RE away */
10971             goto finish_meta_pat;
10972         case 'C':
10973             ret = reg_node(pRExC_state, CANY);
10974             RExC_seen |= REG_SEEN_CANY;
10975             *flagp |= HASWIDTH|SIMPLE;
10976             goto finish_meta_pat;
10977         case 'X':
10978             ret = reg_node(pRExC_state, CLUMP);
10979             *flagp |= HASWIDTH;
10980             goto finish_meta_pat;
10981
10982         case 'W':
10983             invert = 1;
10984             /* FALLTHROUGH */
10985         case 'w':
10986             arg = ANYOF_WORDCHAR;
10987             goto join_posix;
10988
10989         case 'b':
10990             RExC_seen_zerolen++;
10991             RExC_seen |= REG_SEEN_LOOKBEHIND;
10992             op = BOUND + get_regex_charset(RExC_flags);
10993             if (op > BOUNDA) {  /* /aa is same as /a */
10994                 op = BOUNDA;
10995             }
10996             ret = reg_node(pRExC_state, op);
10997             FLAGS(ret) = get_regex_charset(RExC_flags);
10998             *flagp |= SIMPLE;
10999             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11000                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
11001             }
11002             goto finish_meta_pat;
11003         case 'B':
11004             RExC_seen_zerolen++;
11005             RExC_seen |= REG_SEEN_LOOKBEHIND;
11006             op = NBOUND + get_regex_charset(RExC_flags);
11007             if (op > NBOUNDA) { /* /aa is same as /a */
11008                 op = NBOUNDA;
11009             }
11010             ret = reg_node(pRExC_state, op);
11011             FLAGS(ret) = get_regex_charset(RExC_flags);
11012             *flagp |= SIMPLE;
11013             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11014                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
11015             }
11016             goto finish_meta_pat;
11017
11018         case 'D':
11019             invert = 1;
11020             /* FALLTHROUGH */
11021         case 'd':
11022             arg = ANYOF_DIGIT;
11023             goto join_posix;
11024
11025         case 'R':
11026             ret = reg_node(pRExC_state, LNBREAK);
11027             *flagp |= HASWIDTH|SIMPLE;
11028             goto finish_meta_pat;
11029
11030         case 'H':
11031             invert = 1;
11032             /* FALLTHROUGH */
11033         case 'h':
11034             arg = ANYOF_BLANK;
11035             op = POSIXU;
11036             goto join_posix_op_known;
11037
11038         case 'V':
11039             invert = 1;
11040             /* FALLTHROUGH */
11041         case 'v':
11042             arg = ANYOF_VERTWS;
11043             op = POSIXU;
11044             goto join_posix_op_known;
11045
11046         case 'S':
11047             invert = 1;
11048             /* FALLTHROUGH */
11049         case 's':
11050             arg = ANYOF_SPACE;
11051
11052         join_posix:
11053
11054             op = POSIXD + get_regex_charset(RExC_flags);
11055             if (op > POSIXA) {  /* /aa is same as /a */
11056                 op = POSIXA;
11057             }
11058
11059         join_posix_op_known:
11060
11061             if (invert) {
11062                 op += NPOSIXD - POSIXD;
11063             }
11064
11065             ret = reg_node(pRExC_state, op);
11066             if (! SIZE_ONLY) {
11067                 FLAGS(ret) = namedclass_to_classnum(arg);
11068             }
11069
11070             *flagp |= HASWIDTH|SIMPLE;
11071             /* FALL THROUGH */
11072
11073          finish_meta_pat:
11074             nextchar(pRExC_state);
11075             Set_Node_Length(ret, 2); /* MJD */
11076             break;
11077         case 'p':
11078         case 'P':
11079             {
11080 #ifdef DEBUGGING
11081                 char* parse_start = RExC_parse - 2;
11082 #endif
11083
11084                 RExC_parse--;
11085
11086                 ret = regclass(pRExC_state, flagp,depth+1,
11087                                TRUE, /* means just parse this element */
11088                                FALSE, /* don't allow multi-char folds */
11089                                FALSE, /* don't silence non-portable warnings.
11090                                          It would be a bug if these returned
11091                                          non-portables */
11092                                NULL);
11093                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11094                    are allowed.  */
11095                 if (!ret)
11096                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11097                           (UV) *flagp);
11098
11099                 RExC_parse--;
11100
11101                 Set_Node_Offset(ret, parse_start + 2);
11102                 Set_Node_Cur_Length(ret, parse_start);
11103                 nextchar(pRExC_state);
11104             }
11105             break;
11106         case 'N':
11107             /* Handle \N and \N{NAME} with multiple code points here and not
11108              * below because it can be multicharacter. join_exact() will join
11109              * them up later on.  Also this makes sure that things like
11110              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11111              * The options to the grok function call causes it to fail if the
11112              * sequence is just a single code point.  We then go treat it as
11113              * just another character in the current EXACT node, and hence it
11114              * gets uniform treatment with all the other characters.  The
11115              * special treatment for quantifiers is not needed for such single
11116              * character sequences */
11117             ++RExC_parse;
11118             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11119                                 FALSE /* not strict */ )) {
11120                 if (*flagp & RESTART_UTF8)
11121                     return NULL;
11122                 RExC_parse--;
11123                 goto defchar;
11124             }
11125             break;
11126         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11127         parse_named_seq:
11128         {
11129             char ch= RExC_parse[1];
11130             if (ch != '<' && ch != '\'' && ch != '{') {
11131                 RExC_parse++;
11132                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11133                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11134             } else {
11135                 /* this pretty much dupes the code for (?P=...) in reg(), if
11136                    you change this make sure you change that */
11137                 char* name_start = (RExC_parse += 2);
11138                 U32 num = 0;
11139                 SV *sv_dat = reg_scan_name(pRExC_state,
11140                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11141                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11142                 if (RExC_parse == name_start || *RExC_parse != ch)
11143                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11144                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11145
11146                 if (!SIZE_ONLY) {
11147                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11148                     RExC_rxi->data->data[num]=(void*)sv_dat;
11149                     SvREFCNT_inc_simple_void(sv_dat);
11150                 }
11151
11152                 RExC_sawback = 1;
11153                 ret = reganode(pRExC_state,
11154                                ((! FOLD)
11155                                  ? NREF
11156                                  : (ASCII_FOLD_RESTRICTED)
11157                                    ? NREFFA
11158                                    : (AT_LEAST_UNI_SEMANTICS)
11159                                      ? NREFFU
11160                                      : (LOC)
11161                                        ? NREFFL
11162                                        : NREFF),
11163                                 num);
11164                 *flagp |= HASWIDTH;
11165
11166                 /* override incorrect value set in reganode MJD */
11167                 Set_Node_Offset(ret, parse_start+1);
11168                 Set_Node_Cur_Length(ret, parse_start);
11169                 nextchar(pRExC_state);
11170
11171             }
11172             break;
11173         }
11174         case 'g':
11175         case '1': case '2': case '3': case '4':
11176         case '5': case '6': case '7': case '8': case '9':
11177             {
11178                 I32 num;
11179                 bool hasbrace = 0;
11180
11181                 if (*RExC_parse == 'g') {
11182                     bool isrel = 0;
11183
11184                     RExC_parse++;
11185                     if (*RExC_parse == '{') {
11186                         RExC_parse++;
11187                         hasbrace = 1;
11188                     }
11189                     if (*RExC_parse == '-') {
11190                         RExC_parse++;
11191                         isrel = 1;
11192                     }
11193                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11194                         if (isrel) RExC_parse--;
11195                         RExC_parse -= 2;
11196                         goto parse_named_seq;
11197                     }
11198
11199                     num = S_backref_value(RExC_parse);
11200                     if (num == 0)
11201                         vFAIL("Reference to invalid group 0");
11202                     else if (num == I32_MAX) {
11203                          if (isDIGIT(*RExC_parse))
11204                             vFAIL("Reference to nonexistent group");
11205                         else
11206                             vFAIL("Unterminated \\g... pattern");
11207                     }
11208
11209                     if (isrel) {
11210                         num = RExC_npar - num;
11211                         if (num < 1)
11212                             vFAIL("Reference to nonexistent or unclosed group");
11213                     }
11214                 }
11215                 else {
11216                     num = S_backref_value(RExC_parse);
11217                     /* bare \NNN might be backref or octal */
11218                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11219                             && *RExC_parse != '8' && *RExC_parse != '9'))
11220                         /* Probably a character specified in octal, e.g. \35 */
11221                         goto defchar;
11222                 }
11223
11224                 /* at this point RExC_parse definitely points to a backref
11225                  * number */
11226                 {
11227 #ifdef RE_TRACK_PATTERN_OFFSETS
11228                     char * const parse_start = RExC_parse - 1; /* MJD */
11229 #endif
11230                     while (isDIGIT(*RExC_parse))
11231                         RExC_parse++;
11232                     if (hasbrace) {
11233                         if (*RExC_parse != '}')
11234                             vFAIL("Unterminated \\g{...} pattern");
11235                         RExC_parse++;
11236                     }
11237                     if (!SIZE_ONLY) {
11238                         if (num > (I32)RExC_rx->nparens)
11239                             vFAIL("Reference to nonexistent group");
11240                     }
11241                     RExC_sawback = 1;
11242                     ret = reganode(pRExC_state,
11243                                    ((! FOLD)
11244                                      ? REF
11245                                      : (ASCII_FOLD_RESTRICTED)
11246                                        ? REFFA
11247                                        : (AT_LEAST_UNI_SEMANTICS)
11248                                          ? REFFU
11249                                          : (LOC)
11250                                            ? REFFL
11251                                            : REFF),
11252                                     num);
11253                     *flagp |= HASWIDTH;
11254
11255                     /* override incorrect value set in reganode MJD */
11256                     Set_Node_Offset(ret, parse_start+1);
11257                     Set_Node_Cur_Length(ret, parse_start);
11258                     RExC_parse--;
11259                     nextchar(pRExC_state);
11260                 }
11261             }
11262             break;
11263         case '\0':
11264             if (RExC_parse >= RExC_end)
11265                 FAIL("Trailing \\");
11266             /* FALL THROUGH */
11267         default:
11268             /* Do not generate "unrecognized" warnings here, we fall
11269                back into the quick-grab loop below */
11270             parse_start--;
11271             goto defchar;
11272         }
11273         break;
11274
11275     case '#':
11276         if (RExC_flags & RXf_PMf_EXTENDED) {
11277             if ( reg_skipcomment( pRExC_state ) )
11278                 goto tryagain;
11279         }
11280         /* FALL THROUGH */
11281
11282     default:
11283
11284             parse_start = RExC_parse - 1;
11285
11286             RExC_parse++;
11287
11288         defchar: {
11289             STRLEN len = 0;
11290             UV ender = 0;
11291             char *p;
11292             char *s;
11293 #define MAX_NODE_STRING_SIZE 127
11294             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11295             char *s0;
11296             U8 upper_parse = MAX_NODE_STRING_SIZE;
11297             STRLEN foldlen;
11298             U8 node_type = compute_EXACTish(pRExC_state);
11299             bool next_is_quantifier;
11300             char * oldp = NULL;
11301
11302             /* We can convert EXACTF nodes to EXACTFU if they contain only
11303              * characters that match identically regardless of the target
11304              * string's UTF8ness.  The reason to do this is that EXACTF is not
11305              * trie-able, EXACTFU is.  (We don't need to figure this out until
11306              * pass 2) */
11307             bool maybe_exactfu = node_type == EXACTF && PASS2;
11308
11309             /* If a folding node contains only code points that don't
11310              * participate in folds, it can be changed into an EXACT node,
11311              * which allows the optimizer more things to look for */
11312             bool maybe_exact;
11313
11314             ret = reg_node(pRExC_state, node_type);
11315
11316             /* In pass1, folded, we use a temporary buffer instead of the
11317              * actual node, as the node doesn't exist yet */
11318             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11319
11320             s0 = s;
11321
11322         reparse:
11323
11324             /* We do the EXACTFish to EXACT node only if folding, and not if in
11325              * locale, as whether a character folds or not isn't known until
11326              * runtime.  (And we don't need to figure this out until pass 2) */
11327             maybe_exact = FOLD && ! LOC && PASS2;
11328
11329             /* XXX The node can hold up to 255 bytes, yet this only goes to
11330              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11331              * 255 allows us to not have to worry about overflow due to
11332              * converting to utf8 and fold expansion, but that value is
11333              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11334              * split up by this limit into a single one using the real max of
11335              * 255.  Even at 127, this breaks under rare circumstances.  If
11336              * folding, we do not want to split a node at a character that is a
11337              * non-final in a multi-char fold, as an input string could just
11338              * happen to want to match across the node boundary.  The join
11339              * would solve that problem if the join actually happens.  But a
11340              * series of more than two nodes in a row each of 127 would cause
11341              * the first join to succeed to get to 254, but then there wouldn't
11342              * be room for the next one, which could at be one of those split
11343              * multi-char folds.  I don't know of any fool-proof solution.  One
11344              * could back off to end with only a code point that isn't such a
11345              * non-final, but it is possible for there not to be any in the
11346              * entire node. */
11347             for (p = RExC_parse - 1;
11348                  len < upper_parse && p < RExC_end;
11349                  len++)
11350             {
11351                 oldp = p;
11352
11353                 if (RExC_flags & RXf_PMf_EXTENDED)
11354                     p = regwhite( pRExC_state, p );
11355                 switch ((U8)*p) {
11356                 case '^':
11357                 case '$':
11358                 case '.':
11359                 case '[':
11360                 case '(':
11361                 case ')':
11362                 case '|':
11363                     goto loopdone;
11364                 case '\\':
11365                     /* Literal Escapes Switch
11366
11367                        This switch is meant to handle escape sequences that
11368                        resolve to a literal character.
11369
11370                        Every escape sequence that represents something
11371                        else, like an assertion or a char class, is handled
11372                        in the switch marked 'Special Escapes' above in this
11373                        routine, but also has an entry here as anything that
11374                        isn't explicitly mentioned here will be treated as
11375                        an unescaped equivalent literal.
11376                     */
11377
11378                     switch ((U8)*++p) {
11379                     /* These are all the special escapes. */
11380                     case 'A':             /* Start assertion */
11381                     case 'b': case 'B':   /* Word-boundary assertion*/
11382                     case 'C':             /* Single char !DANGEROUS! */
11383                     case 'd': case 'D':   /* digit class */
11384                     case 'g': case 'G':   /* generic-backref, pos assertion */
11385                     case 'h': case 'H':   /* HORIZWS */
11386                     case 'k': case 'K':   /* named backref, keep marker */
11387                     case 'p': case 'P':   /* Unicode property */
11388                               case 'R':   /* LNBREAK */
11389                     case 's': case 'S':   /* space class */
11390                     case 'v': case 'V':   /* VERTWS */
11391                     case 'w': case 'W':   /* word class */
11392                     case 'X':             /* eXtended Unicode "combining character sequence" */
11393                     case 'z': case 'Z':   /* End of line/string assertion */
11394                         --p;
11395                         goto loopdone;
11396
11397                     /* Anything after here is an escape that resolves to a
11398                        literal. (Except digits, which may or may not)
11399                      */
11400                     case 'n':
11401                         ender = '\n';
11402                         p++;
11403                         break;
11404                     case 'N': /* Handle a single-code point named character. */
11405                         /* The options cause it to fail if a multiple code
11406                          * point sequence.  Handle those in the switch() above
11407                          * */
11408                         RExC_parse = p + 1;
11409                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11410                                             flagp, depth, FALSE,
11411                                             FALSE /* not strict */ ))
11412                         {
11413                             if (*flagp & RESTART_UTF8)
11414                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11415                             RExC_parse = p = oldp;
11416                             goto loopdone;
11417                         }
11418                         p = RExC_parse;
11419                         if (ender > 0xff) {
11420                             REQUIRE_UTF8;
11421                         }
11422                         break;
11423                     case 'r':
11424                         ender = '\r';
11425                         p++;
11426                         break;
11427                     case 't':
11428                         ender = '\t';
11429                         p++;
11430                         break;
11431                     case 'f':
11432                         ender = '\f';
11433                         p++;
11434                         break;
11435                     case 'e':
11436                           ender = ASCII_TO_NATIVE('\033');
11437                         p++;
11438                         break;
11439                     case 'a':
11440                           ender = '\a';
11441                         p++;
11442                         break;
11443                     case 'o':
11444                         {
11445                             UV result;
11446                             const char* error_msg;
11447
11448                             bool valid = grok_bslash_o(&p,
11449                                                        &result,
11450                                                        &error_msg,
11451                                                        TRUE, /* out warnings */
11452                                                        FALSE, /* not strict */
11453                                                        TRUE, /* Output warnings
11454                                                                 for non-
11455                                                                 portables */
11456                                                        UTF);
11457                             if (! valid) {
11458                                 RExC_parse = p; /* going to die anyway; point
11459                                                    to exact spot of failure */
11460                                 vFAIL(error_msg);
11461                             }
11462                             ender = result;
11463                             if (PL_encoding && ender < 0x100) {
11464                                 goto recode_encoding;
11465                             }
11466                             if (ender > 0xff) {
11467                                 REQUIRE_UTF8;
11468                             }
11469                             break;
11470                         }
11471                     case 'x':
11472                         {
11473                             UV result = UV_MAX; /* initialize to erroneous
11474                                                    value */
11475                             const char* error_msg;
11476
11477                             bool valid = grok_bslash_x(&p,
11478                                                        &result,
11479                                                        &error_msg,
11480                                                        TRUE, /* out warnings */
11481                                                        FALSE, /* not strict */
11482                                                        TRUE, /* Output warnings
11483                                                                 for non-
11484                                                                 portables */
11485                                                        UTF);
11486                             if (! valid) {
11487                                 RExC_parse = p; /* going to die anyway; point
11488                                                    to exact spot of failure */
11489                                 vFAIL(error_msg);
11490                             }
11491                             ender = result;
11492
11493                             if (PL_encoding && ender < 0x100) {
11494                                 goto recode_encoding;
11495                             }
11496                             if (ender > 0xff) {
11497                                 REQUIRE_UTF8;
11498                             }
11499                             break;
11500                         }
11501                     case 'c':
11502                         p++;
11503                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11504                         break;
11505                     case '8': case '9': /* must be a backreference */
11506                         --p;
11507                         goto loopdone;
11508                     case '1': case '2': case '3':case '4':
11509                     case '5': case '6': case '7':
11510                         /* When we parse backslash escapes there is ambiguity
11511                          * between backreferences and octal escapes. Any escape
11512                          * from \1 - \9 is a backreference, any multi-digit
11513                          * escape which does not start with 0 and which when
11514                          * evaluated as decimal could refer to an already
11515                          * parsed capture buffer is a backslash. Anything else
11516                          * is octal.
11517                          *
11518                          * Note this implies that \118 could be interpreted as
11519                          * 118 OR as "\11" . "8" depending on whether there
11520                          * were 118 capture buffers defined already in the
11521                          * pattern.  */
11522                         if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11523                         {  /* Not to be treated as an octal constant, go
11524                                    find backref */
11525                             --p;
11526                             goto loopdone;
11527                         }
11528                     case '0':
11529                         {
11530                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11531                             STRLEN numlen = 3;
11532                             ender = grok_oct(p, &numlen, &flags, NULL);
11533                             if (ender > 0xff) {
11534                                 REQUIRE_UTF8;
11535                             }
11536                             p += numlen;
11537                             if (SIZE_ONLY   /* like \08, \178 */
11538                                 && numlen < 3
11539                                 && p < RExC_end
11540                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11541                             {
11542                                 reg_warn_non_literal_string(
11543                                          p + 1,
11544                                          form_short_octal_warning(p, numlen));
11545                             }
11546                         }
11547                         if (PL_encoding && ender < 0x100)
11548                             goto recode_encoding;
11549                         break;
11550                     recode_encoding:
11551                         if (! RExC_override_recoding) {
11552                             SV* enc = PL_encoding;
11553                             ender = reg_recode((const char)(U8)ender, &enc);
11554                             if (!enc && SIZE_ONLY)
11555                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11556                             REQUIRE_UTF8;
11557                         }
11558                         break;
11559                     case '\0':
11560                         if (p >= RExC_end)
11561                             FAIL("Trailing \\");
11562                         /* FALL THROUGH */
11563                     default:
11564                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11565                             /* Include any { following the alpha to emphasize
11566                              * that it could be part of an escape at some point
11567                              * in the future */
11568                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11569                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11570                         }
11571                         goto normal_default;
11572                     } /* End of switch on '\' */
11573                     break;
11574                 default:    /* A literal character */
11575
11576                     if (! SIZE_ONLY
11577                         && RExC_flags & RXf_PMf_EXTENDED
11578                         && ckWARN_d(WARN_DEPRECATED)
11579                         && is_PATWS_non_low(p, UTF))
11580                     {
11581                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11582                                 "Escape literal pattern white space under /x");
11583                     }
11584
11585                   normal_default:
11586                     if (UTF8_IS_START(*p) && UTF) {
11587                         STRLEN numlen;
11588                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11589                                                &numlen, UTF8_ALLOW_DEFAULT);
11590                         p += numlen;
11591                     }
11592                     else
11593                         ender = (U8) *p++;
11594                     break;
11595                 } /* End of switch on the literal */
11596
11597                 /* Here, have looked at the literal character and <ender>
11598                  * contains its ordinal, <p> points to the character after it
11599                  */
11600
11601                 if ( RExC_flags & RXf_PMf_EXTENDED)
11602                     p = regwhite( pRExC_state, p );
11603
11604                 /* If the next thing is a quantifier, it applies to this
11605                  * character only, which means that this character has to be in
11606                  * its own node and can't just be appended to the string in an
11607                  * existing node, so if there are already other characters in
11608                  * the node, close the node with just them, and set up to do
11609                  * this character again next time through, when it will be the
11610                  * only thing in its new node */
11611                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11612                 {
11613                     p = oldp;
11614                     goto loopdone;
11615                 }
11616
11617                 if (! FOLD) {
11618                     if (UTF) {
11619                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11620                         if (unilen > 0) {
11621                            s   += unilen;
11622                            len += unilen;
11623                         }
11624
11625                         /* The loop increments <len> each time, as all but this
11626                          * path (and one other) through it add a single byte to
11627                          * the EXACTish node.  But this one has changed len to
11628                          * be the correct final value, so subtract one to
11629                          * cancel out the increment that follows */
11630                         len--;
11631                     }
11632                     else {
11633                         REGC((char)ender, s++);
11634                     }
11635                 }
11636                 else /* FOLD */ if (! ( UTF
11637                         /* See comments for join_exact() as to why we fold this
11638                          * non-UTF at compile time */
11639                         || (node_type == EXACTFU
11640                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
11641                 {
11642                     if (IS_IN_SOME_FOLD_L1(ender)) {
11643                         maybe_exact = FALSE;
11644
11645                         /* See if the character's fold differs between /d and
11646                          * /u.  This includes the multi-char fold SHARP S to
11647                          * 'ss' */
11648                         if (maybe_exactfu
11649                             && (PL_fold[ender] != PL_fold_latin1[ender]
11650                                 || ender == LATIN_SMALL_LETTER_SHARP_S
11651                                 || (len > 0
11652                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
11653                                    && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11654                         {
11655                             maybe_exactfu = FALSE;
11656                         }
11657                     }
11658                     *(s++) = (char) ender;
11659                 }
11660                 else {  /* UTF */
11661
11662                     /* Prime the casefolded buffer.  Locale rules, which apply
11663                      * only to code points < 256, aren't known until execution,
11664                      * so for them, just output the original character using
11665                      * utf8.  If we start to fold non-UTF patterns, be sure to
11666                      * update join_exact() */
11667                     if (LOC && ender < 256) {
11668                         if (UVCHR_IS_INVARIANT(ender)) {
11669                             *s = (U8) ender;
11670                             foldlen = 1;
11671                         } else {
11672                             *s = UTF8_TWO_BYTE_HI(ender);
11673                             *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11674                             foldlen = 2;
11675                         }
11676                     }
11677                     else {
11678                         UV folded = _to_uni_fold_flags(
11679                                        ender,
11680                                        (U8 *) s,
11681                                        &foldlen,
11682                                        FOLD_FLAGS_FULL
11683                                        | ((LOC) ?  FOLD_FLAGS_LOCALE
11684                                                 : (ASCII_FOLD_RESTRICTED)
11685                                                   ? FOLD_FLAGS_NOMIX_ASCII
11686                                                   : 0)
11687                                         );
11688
11689                         /* If this node only contains non-folding code points
11690                          * so far, see if this new one is also non-folding */
11691                         if (maybe_exact) {
11692                             if (folded != ender) {
11693                                 maybe_exact = FALSE;
11694                             }
11695                             else {
11696                                 /* Here the fold is the original; we have
11697                                  * to check further to see if anything
11698                                  * folds to it */
11699                                 if (! PL_utf8_foldable) {
11700                                     SV* swash = swash_init("utf8",
11701                                                        "_Perl_Any_Folds",
11702                                                        &PL_sv_undef, 1, 0);
11703                                     PL_utf8_foldable =
11704                                                 _get_swash_invlist(swash);
11705                                     SvREFCNT_dec_NN(swash);
11706                                 }
11707                                 if (_invlist_contains_cp(PL_utf8_foldable,
11708                                                          ender))
11709                                 {
11710                                     maybe_exact = FALSE;
11711                                 }
11712                             }
11713                         }
11714                         ender = folded;
11715                     }
11716                     s += foldlen;
11717
11718                     /* The loop increments <len> each time, as all but this
11719                      * path (and one other) through it add a single byte to the
11720                      * EXACTish node.  But this one has changed len to be the
11721                      * correct final value, so subtract one to cancel out the
11722                      * increment that follows */
11723                     len += foldlen - 1;
11724                 }
11725
11726                 if (next_is_quantifier) {
11727
11728                     /* Here, the next input is a quantifier, and to get here,
11729                      * the current character is the only one in the node.
11730                      * Also, here <len> doesn't include the final byte for this
11731                      * character */
11732                     len++;
11733                     goto loopdone;
11734                 }
11735
11736             } /* End of loop through literal characters */
11737
11738             /* Here we have either exhausted the input or ran out of room in
11739              * the node.  (If we encountered a character that can't be in the
11740              * node, transfer is made directly to <loopdone>, and so we
11741              * wouldn't have fallen off the end of the loop.)  In the latter
11742              * case, we artificially have to split the node into two, because
11743              * we just don't have enough space to hold everything.  This
11744              * creates a problem if the final character participates in a
11745              * multi-character fold in the non-final position, as a match that
11746              * should have occurred won't, due to the way nodes are matched,
11747              * and our artificial boundary.  So back off until we find a non-
11748              * problematic character -- one that isn't at the beginning or
11749              * middle of such a fold.  (Either it doesn't participate in any
11750              * folds, or appears only in the final position of all the folds it
11751              * does participate in.)  A better solution with far fewer false
11752              * positives, and that would fill the nodes more completely, would
11753              * be to actually have available all the multi-character folds to
11754              * test against, and to back-off only far enough to be sure that
11755              * this node isn't ending with a partial one.  <upper_parse> is set
11756              * further below (if we need to reparse the node) to include just
11757              * up through that final non-problematic character that this code
11758              * identifies, so when it is set to less than the full node, we can
11759              * skip the rest of this */
11760             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11761
11762                 const STRLEN full_len = len;
11763
11764                 assert(len >= MAX_NODE_STRING_SIZE);
11765
11766                 /* Here, <s> points to the final byte of the final character.
11767                  * Look backwards through the string until find a non-
11768                  * problematic character */
11769
11770                 if (! UTF) {
11771
11772                     /* These two have no multi-char folds to non-UTF characters
11773                      */
11774                     if (ASCII_FOLD_RESTRICTED || LOC) {
11775                         goto loopdone;
11776                     }
11777
11778                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11779                     len = s - s0 + 1;
11780                 }
11781                 else {
11782                     if (!  PL_NonL1NonFinalFold) {
11783                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11784                                         NonL1_Perl_Non_Final_Folds_invlist);
11785                     }
11786
11787                     /* Point to the first byte of the final character */
11788                     s = (char *) utf8_hop((U8 *) s, -1);
11789
11790                     while (s >= s0) {   /* Search backwards until find
11791                                            non-problematic char */
11792                         if (UTF8_IS_INVARIANT(*s)) {
11793
11794                             /* There are no ascii characters that participate
11795                              * in multi-char folds under /aa.  In EBCDIC, the
11796                              * non-ascii invariants are all control characters,
11797                              * so don't ever participate in any folds. */
11798                             if (ASCII_FOLD_RESTRICTED
11799                                 || ! IS_NON_FINAL_FOLD(*s))
11800                             {
11801                                 break;
11802                             }
11803                         }
11804                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11805
11806                             /* No Latin1 characters participate in multi-char
11807                              * folds under /l */
11808                             if (LOC
11809                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11810                                                                   *s, *(s+1))))
11811                             {
11812                                 break;
11813                             }
11814                         }
11815                         else if (! _invlist_contains_cp(
11816                                         PL_NonL1NonFinalFold,
11817                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11818                         {
11819                             break;
11820                         }
11821
11822                         /* Here, the current character is problematic in that
11823                          * it does occur in the non-final position of some
11824                          * fold, so try the character before it, but have to
11825                          * special case the very first byte in the string, so
11826                          * we don't read outside the string */
11827                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11828                     } /* End of loop backwards through the string */
11829
11830                     /* If there were only problematic characters in the string,
11831                      * <s> will point to before s0, in which case the length
11832                      * should be 0, otherwise include the length of the
11833                      * non-problematic character just found */
11834                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11835                 }
11836
11837                 /* Here, have found the final character, if any, that is
11838                  * non-problematic as far as ending the node without splitting
11839                  * it across a potential multi-char fold.  <len> contains the
11840                  * number of bytes in the node up-to and including that
11841                  * character, or is 0 if there is no such character, meaning
11842                  * the whole node contains only problematic characters.  In
11843                  * this case, give up and just take the node as-is.  We can't
11844                  * do any better */
11845                 if (len == 0) {
11846                     len = full_len;
11847
11848                     /* If the node ends in an 's' we make sure it stays EXACTF,
11849                      * as if it turns into an EXACTFU, it could later get
11850                      * joined with another 's' that would then wrongly match
11851                      * the sharp s */
11852                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11853                     {
11854                         maybe_exactfu = FALSE;
11855                     }
11856                 } else {
11857
11858                     /* Here, the node does contain some characters that aren't
11859                      * problematic.  If one such is the final character in the
11860                      * node, we are done */
11861                     if (len == full_len) {
11862                         goto loopdone;
11863                     }
11864                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11865
11866                         /* If the final character is problematic, but the
11867                          * penultimate is not, back-off that last character to
11868                          * later start a new node with it */
11869                         p = oldp;
11870                         goto loopdone;
11871                     }
11872
11873                     /* Here, the final non-problematic character is earlier
11874                      * in the input than the penultimate character.  What we do
11875                      * is reparse from the beginning, going up only as far as
11876                      * this final ok one, thus guaranteeing that the node ends
11877                      * in an acceptable character.  The reason we reparse is
11878                      * that we know how far in the character is, but we don't
11879                      * know how to correlate its position with the input parse.
11880                      * An alternate implementation would be to build that
11881                      * correlation as we go along during the original parse,
11882                      * but that would entail extra work for every node, whereas
11883                      * this code gets executed only when the string is too
11884                      * large for the node, and the final two characters are
11885                      * problematic, an infrequent occurrence.  Yet another
11886                      * possible strategy would be to save the tail of the
11887                      * string, and the next time regatom is called, initialize
11888                      * with that.  The problem with this is that unless you
11889                      * back off one more character, you won't be guaranteed
11890                      * regatom will get called again, unless regbranch,
11891                      * regpiece ... are also changed.  If you do back off that
11892                      * extra character, so that there is input guaranteed to
11893                      * force calling regatom, you can't handle the case where
11894                      * just the first character in the node is acceptable.  I
11895                      * (khw) decided to try this method which doesn't have that
11896                      * pitfall; if performance issues are found, we can do a
11897                      * combination of the current approach plus that one */
11898                     upper_parse = len;
11899                     len = 0;
11900                     s = s0;
11901                     goto reparse;
11902                 }
11903             }   /* End of verifying node ends with an appropriate char */
11904
11905         loopdone:   /* Jumped to when encounters something that shouldn't be in
11906                        the node */
11907
11908             /* I (khw) don't know if you can get here with zero length, but the
11909              * old code handled this situation by creating a zero-length EXACT
11910              * node.  Might as well be NOTHING instead */
11911             if (len == 0) {
11912                 OP(ret) = NOTHING;
11913             }
11914             else {
11915                 if (FOLD) {
11916                     /* If 'maybe_exact' is still set here, means there are no
11917                      * code points in the node that participate in folds;
11918                      * similarly for 'maybe_exactfu' and code points that match
11919                      * differently depending on UTF8ness of the target string
11920                      * */
11921                     if (maybe_exact) {
11922                         OP(ret) = EXACT;
11923                     }
11924                     else if (maybe_exactfu) {
11925                         OP(ret) = EXACTFU;
11926                     }
11927                 }
11928                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11929             }
11930
11931             RExC_parse = p - 1;
11932             Set_Node_Cur_Length(ret, parse_start);
11933             nextchar(pRExC_state);
11934             {
11935                 /* len is STRLEN which is unsigned, need to copy to signed */
11936                 IV iv = len;
11937                 if (iv < 0)
11938                     vFAIL("Internal disaster");
11939             }
11940
11941         } /* End of label 'defchar:' */
11942         break;
11943     } /* End of giant switch on input character */
11944
11945     return(ret);
11946 }
11947
11948 STATIC char *
11949 S_regwhite( RExC_state_t *pRExC_state, char *p )
11950 {
11951     const char *e = RExC_end;
11952
11953     PERL_ARGS_ASSERT_REGWHITE;
11954
11955     while (p < e) {
11956         if (isSPACE(*p))
11957             ++p;
11958         else if (*p == '#') {
11959             bool ended = 0;
11960             do {
11961                 if (*p++ == '\n') {
11962                     ended = 1;
11963                     break;
11964                 }
11965             } while (p < e);
11966             if (!ended)
11967                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11968         }
11969         else
11970             break;
11971     }
11972     return p;
11973 }
11974
11975 STATIC char *
11976 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11977 {
11978     /* Returns the next non-pattern-white space, non-comment character (the
11979      * latter only if 'recognize_comment is true) in the string p, which is
11980      * ended by RExC_end.  If there is no line break ending a comment,
11981      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11982     const char *e = RExC_end;
11983
11984     PERL_ARGS_ASSERT_REGPATWS;
11985
11986     while (p < e) {
11987         STRLEN len;
11988         if ((len = is_PATWS_safe(p, e, UTF))) {
11989             p += len;
11990         }
11991         else if (recognize_comment && *p == '#') {
11992             bool ended = 0;
11993             do {
11994                 p++;
11995                 if (is_LNBREAK_safe(p, e, UTF)) {
11996                     ended = 1;
11997                     break;
11998                 }
11999             } while (p < e);
12000             if (!ended)
12001                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12002         }
12003         else
12004             break;
12005     }
12006     return p;
12007 }
12008
12009 STATIC void
12010 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12011 {
12012     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12013      * sets up the bitmap and any flags, removing those code points from the
12014      * inversion list, setting it to NULL should it become completely empty */
12015
12016     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12017     assert(PL_regkind[OP(node)] == ANYOF);
12018
12019     ANYOF_BITMAP_ZERO(node);
12020     if (*invlist_ptr) {
12021
12022         /* This gets set if we actually need to modify things */
12023         bool change_invlist = FALSE;
12024
12025         UV start, end;
12026
12027         /* Start looking through *invlist_ptr */
12028         invlist_iterinit(*invlist_ptr);
12029         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12030             UV high;
12031             int i;
12032
12033             if (end == UV_MAX && start <= 256) {
12034                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12035             }
12036
12037             /* Quit if are above what we should change */
12038             if (start > 255) {
12039                 break;
12040             }
12041
12042             change_invlist = TRUE;
12043
12044             /* Set all the bits in the range, up to the max that we are doing */
12045             high = (end < 255) ? end : 255;
12046             for (i = start; i <= (int) high; i++) {
12047                 if (! ANYOF_BITMAP_TEST(node, i)) {
12048                     ANYOF_BITMAP_SET(node, i);
12049                 }
12050             }
12051         }
12052         invlist_iterfinish(*invlist_ptr);
12053
12054         /* Done with loop; remove any code points that are in the bitmap from
12055          * *invlist_ptr; similarly for code points above latin1 if we have a flag
12056          * to match all of them anyways */
12057         if (change_invlist) {
12058             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12059         }
12060         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12061             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12062         }
12063
12064         /* If have completely emptied it, remove it completely */
12065         if (_invlist_len(*invlist_ptr) == 0) {
12066             SvREFCNT_dec_NN(*invlist_ptr);
12067             *invlist_ptr = NULL;
12068         }
12069     }
12070 }
12071
12072 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12073    Character classes ([:foo:]) can also be negated ([:^foo:]).
12074    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12075    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12076    but trigger failures because they are currently unimplemented. */
12077
12078 #define POSIXCC_DONE(c)   ((c) == ':')
12079 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12080 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12081
12082 PERL_STATIC_INLINE I32
12083 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12084 {
12085     dVAR;
12086     I32 namedclass = OOB_NAMEDCLASS;
12087
12088     PERL_ARGS_ASSERT_REGPPOSIXCC;
12089
12090     if (value == '[' && RExC_parse + 1 < RExC_end &&
12091         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12092         POSIXCC(UCHARAT(RExC_parse)))
12093     {
12094         const char c = UCHARAT(RExC_parse);
12095         char* const s = RExC_parse++;
12096
12097         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12098             RExC_parse++;
12099         if (RExC_parse == RExC_end) {
12100             if (strict) {
12101
12102                 /* Try to give a better location for the error (than the end of
12103                  * the string) by looking for the matching ']' */
12104                 RExC_parse = s;
12105                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12106                     RExC_parse++;
12107                 }
12108                 vFAIL2("Unmatched '%c' in POSIX class", c);
12109             }
12110             /* Grandfather lone [:, [=, [. */
12111             RExC_parse = s;
12112         }
12113         else {
12114             const char* const t = RExC_parse++; /* skip over the c */
12115             assert(*t == c);
12116
12117             if (UCHARAT(RExC_parse) == ']') {
12118                 const char *posixcc = s + 1;
12119                 RExC_parse++; /* skip over the ending ] */
12120
12121                 if (*s == ':') {
12122                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12123                     const I32 skip = t - posixcc;
12124
12125                     /* Initially switch on the length of the name.  */
12126                     switch (skip) {
12127                     case 4:
12128                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12129                                                           this is the Perl \w
12130                                                         */
12131                             namedclass = ANYOF_WORDCHAR;
12132                         break;
12133                     case 5:
12134                         /* Names all of length 5.  */
12135                         /* alnum alpha ascii blank cntrl digit graph lower
12136                            print punct space upper  */
12137                         /* Offset 4 gives the best switch position.  */
12138                         switch (posixcc[4]) {
12139                         case 'a':
12140                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12141                                 namedclass = ANYOF_ALPHA;
12142                             break;
12143                         case 'e':
12144                             if (memEQ(posixcc, "spac", 4)) /* space */
12145                                 namedclass = ANYOF_PSXSPC;
12146                             break;
12147                         case 'h':
12148                             if (memEQ(posixcc, "grap", 4)) /* graph */
12149                                 namedclass = ANYOF_GRAPH;
12150                             break;
12151                         case 'i':
12152                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12153                                 namedclass = ANYOF_ASCII;
12154                             break;
12155                         case 'k':
12156                             if (memEQ(posixcc, "blan", 4)) /* blank */
12157                                 namedclass = ANYOF_BLANK;
12158                             break;
12159                         case 'l':
12160                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12161                                 namedclass = ANYOF_CNTRL;
12162                             break;
12163                         case 'm':
12164                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12165                                 namedclass = ANYOF_ALPHANUMERIC;
12166                             break;
12167                         case 'r':
12168                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12169                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12170                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12171                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12172                             break;
12173                         case 't':
12174                             if (memEQ(posixcc, "digi", 4)) /* digit */
12175                                 namedclass = ANYOF_DIGIT;
12176                             else if (memEQ(posixcc, "prin", 4)) /* print */
12177                                 namedclass = ANYOF_PRINT;
12178                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12179                                 namedclass = ANYOF_PUNCT;
12180                             break;
12181                         }
12182                         break;
12183                     case 6:
12184                         if (memEQ(posixcc, "xdigit", 6))
12185                             namedclass = ANYOF_XDIGIT;
12186                         break;
12187                     }
12188
12189                     if (namedclass == OOB_NAMEDCLASS)
12190                         vFAIL2utf8f(
12191                             "POSIX class [:%"UTF8f":] unknown",
12192                             UTF8fARG(UTF, t - s - 1, s + 1));
12193
12194                     /* The #defines are structured so each complement is +1 to
12195                      * the normal one */
12196                     if (complement) {
12197                         namedclass++;
12198                     }
12199                     assert (posixcc[skip] == ':');
12200                     assert (posixcc[skip+1] == ']');
12201                 } else if (!SIZE_ONLY) {
12202                     /* [[=foo=]] and [[.foo.]] are still future. */
12203
12204                     /* adjust RExC_parse so the warning shows after
12205                        the class closes */
12206                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12207                         RExC_parse++;
12208                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12209                 }
12210             } else {
12211                 /* Maternal grandfather:
12212                  * "[:" ending in ":" but not in ":]" */
12213                 if (strict) {
12214                     vFAIL("Unmatched '[' in POSIX class");
12215                 }
12216
12217                 /* Grandfather lone [:, [=, [. */
12218                 RExC_parse = s;
12219             }
12220         }
12221     }
12222
12223     return namedclass;
12224 }
12225
12226 STATIC bool
12227 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12228 {
12229     /* This applies some heuristics at the current parse position (which should
12230      * be at a '[') to see if what follows might be intended to be a [:posix:]
12231      * class.  It returns true if it really is a posix class, of course, but it
12232      * also can return true if it thinks that what was intended was a posix
12233      * class that didn't quite make it.
12234      *
12235      * It will return true for
12236      *      [:alphanumerics:
12237      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12238      *                         ')' indicating the end of the (?[
12239      *      [:any garbage including %^&$ punctuation:]
12240      *
12241      * This is designed to be called only from S_handle_regex_sets; it could be
12242      * easily adapted to be called from the spot at the beginning of regclass()
12243      * that checks to see in a normal bracketed class if the surrounding []
12244      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12245      * change long-standing behavior, so I (khw) didn't do that */
12246     char* p = RExC_parse + 1;
12247     char first_char = *p;
12248
12249     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12250
12251     assert(*(p - 1) == '[');
12252
12253     if (! POSIXCC(first_char)) {
12254         return FALSE;
12255     }
12256
12257     p++;
12258     while (p < RExC_end && isWORDCHAR(*p)) p++;
12259
12260     if (p >= RExC_end) {
12261         return FALSE;
12262     }
12263
12264     if (p - RExC_parse > 2    /* Got at least 1 word character */
12265         && (*p == first_char
12266             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12267     {
12268         return TRUE;
12269     }
12270
12271     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12272
12273     return (p
12274             && p - RExC_parse > 2 /* [:] evaluates to colon;
12275                                       [::] is a bad posix class. */
12276             && first_char == *(p - 1));
12277 }
12278
12279 STATIC regnode *
12280 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
12281                    char * const oregcomp_parse)
12282 {
12283     /* Handle the (?[...]) construct to do set operations */
12284
12285     U8 curchar;
12286     UV start, end;      /* End points of code point ranges */
12287     SV* result_string;
12288     char *save_end, *save_parse;
12289     SV* final;
12290     STRLEN len;
12291     regnode* node;
12292     AV* stack;
12293     const bool save_fold = FOLD;
12294
12295     GET_RE_DEBUG_FLAGS_DECL;
12296
12297     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12298
12299     if (LOC) {
12300         vFAIL("(?[...]) not valid in locale");
12301     }
12302     RExC_uni_semantics = 1;
12303
12304     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12305      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12306      * call regclass to handle '[]' so as to not have to reinvent its parsing
12307      * rules here (throwing away the size it computes each time).  And, we exit
12308      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12309      * these things, we need to realize that something preceded by a backslash
12310      * is escaped, so we have to keep track of backslashes */
12311     if (SIZE_ONLY) {
12312         UV depth = 0; /* how many nested (?[...]) constructs */
12313
12314         Perl_ck_warner_d(aTHX_
12315             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12316             "The regex_sets feature is experimental" REPORT_LOCATION,
12317                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12318                 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
12319
12320         while (RExC_parse < RExC_end) {
12321             SV* current = NULL;
12322             RExC_parse = regpatws(pRExC_state, RExC_parse,
12323                                 TRUE); /* means recognize comments */
12324             switch (*RExC_parse) {
12325                 case '?':
12326                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12327                     /* FALL THROUGH */
12328                 default:
12329                     break;
12330                 case '\\':
12331                     /* Skip the next byte (which could cause us to end up in
12332                      * the middle of a UTF-8 character, but since none of those
12333                      * are confusable with anything we currently handle in this
12334                      * switch (invariants all), it's safe.  We'll just hit the
12335                      * default: case next time and keep on incrementing until
12336                      * we find one of the invariants we do handle. */
12337                     RExC_parse++;
12338                     break;
12339                 case '[':
12340                 {
12341                     /* If this looks like it is a [:posix:] class, leave the
12342                      * parse pointer at the '[' to fool regclass() into
12343                      * thinking it is part of a '[[:posix:]]'.  That function
12344                      * will use strict checking to force a syntax error if it
12345                      * doesn't work out to a legitimate class */
12346                     bool is_posix_class
12347                                     = could_it_be_a_POSIX_class(pRExC_state);
12348                     if (! is_posix_class) {
12349                         RExC_parse++;
12350                     }
12351
12352                     /* regclass() can only return RESTART_UTF8 if multi-char
12353                        folds are allowed.  */
12354                     if (!regclass(pRExC_state, flagp,depth+1,
12355                                   is_posix_class, /* parse the whole char
12356                                                      class only if not a
12357                                                      posix class */
12358                                   FALSE, /* don't allow multi-char folds */
12359                                   TRUE, /* silence non-portable warnings. */
12360                                   &current))
12361                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12362                               (UV) *flagp);
12363
12364                     /* function call leaves parse pointing to the ']', except
12365                      * if we faked it */
12366                     if (is_posix_class) {
12367                         RExC_parse--;
12368                     }
12369
12370                     SvREFCNT_dec(current);   /* In case it returned something */
12371                     break;
12372                 }
12373
12374                 case ']':
12375                     if (depth--) break;
12376                     RExC_parse++;
12377                     if (RExC_parse < RExC_end
12378                         && *RExC_parse == ')')
12379                     {
12380                         node = reganode(pRExC_state, ANYOF, 0);
12381                         RExC_size += ANYOF_SKIP;
12382                         nextchar(pRExC_state);
12383                         Set_Node_Length(node,
12384                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12385                         return node;
12386                     }
12387                     goto no_close;
12388             }
12389             RExC_parse++;
12390         }
12391
12392         no_close:
12393         FAIL("Syntax error in (?[...])");
12394     }
12395
12396     /* Pass 2 only after this.  Everything in this construct is a
12397      * metacharacter.  Operands begin with either a '\' (for an escape
12398      * sequence), or a '[' for a bracketed character class.  Any other
12399      * character should be an operator, or parenthesis for grouping.  Both
12400      * types of operands are handled by calling regclass() to parse them.  It
12401      * is called with a parameter to indicate to return the computed inversion
12402      * list.  The parsing here is implemented via a stack.  Each entry on the
12403      * stack is a single character representing one of the operators, or the
12404      * '('; or else a pointer to an operand inversion list. */
12405
12406 #define IS_OPERAND(a)  (! SvIOK(a))
12407
12408     /* The stack starts empty.  It is a syntax error if the first thing parsed
12409      * is a binary operator; everything else is pushed on the stack.  When an
12410      * operand is parsed, the top of the stack is examined.  If it is a binary
12411      * operator, the item before it should be an operand, and both are replaced
12412      * by the result of doing that operation on the new operand and the one on
12413      * the stack.   Thus a sequence of binary operands is reduced to a single
12414      * one before the next one is parsed.
12415      *
12416      * A unary operator may immediately follow a binary in the input, for
12417      * example
12418      *      [a] + ! [b]
12419      * When an operand is parsed and the top of the stack is a unary operator,
12420      * the operation is performed, and then the stack is rechecked to see if
12421      * this new operand is part of a binary operation; if so, it is handled as
12422      * above.
12423      *
12424      * A '(' is simply pushed on the stack; it is valid only if the stack is
12425      * empty, or the top element of the stack is an operator or another '('
12426      * (for which the parenthesized expression will become an operand).  By the
12427      * time the corresponding ')' is parsed everything in between should have
12428      * been parsed and evaluated to a single operand (or else is a syntax
12429      * error), and is handled as a regular operand */
12430
12431     sv_2mortal((SV *)(stack = newAV()));
12432
12433     while (RExC_parse < RExC_end) {
12434         I32 top_index = av_tindex(stack);
12435         SV** top_ptr;
12436         SV* current = NULL;
12437
12438         /* Skip white space */
12439         RExC_parse = regpatws(pRExC_state, RExC_parse,
12440                                 TRUE); /* means recognize comments */
12441         if (RExC_parse >= RExC_end) {
12442             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12443         }
12444         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12445             break;
12446         }
12447
12448         switch (curchar) {
12449
12450             case '?':
12451                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12452                                                safely subtract 1 from
12453                                                RExC_parse in the next clause.
12454                                                If we have something on the
12455                                                stack, we have parsed something
12456                                              */
12457                     && UCHARAT(RExC_parse - 1) == '('
12458                     && RExC_parse < RExC_end)
12459                 {
12460                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12461                      * This happens when we have some thing like
12462                      *
12463                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12464                      *   ...
12465                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12466                      *
12467                      * Here we would be handling the interpolated
12468                      * '$thai_or_lao'.  We handle this by a recursive call to
12469                      * ourselves which returns the inversion list the
12470                      * interpolated expression evaluates to.  We use the flags
12471                      * from the interpolated pattern. */
12472                     U32 save_flags = RExC_flags;
12473                     const char * const save_parse = ++RExC_parse;
12474
12475                     parse_lparen_question_flags(pRExC_state);
12476
12477                     if (RExC_parse == save_parse  /* Makes sure there was at
12478                                                      least one flag (or this
12479                                                      embedding wasn't compiled)
12480                                                    */
12481                         || RExC_parse >= RExC_end - 4
12482                         || UCHARAT(RExC_parse) != ':'
12483                         || UCHARAT(++RExC_parse) != '('
12484                         || UCHARAT(++RExC_parse) != '?'
12485                         || UCHARAT(++RExC_parse) != '[')
12486                     {
12487
12488                         /* In combination with the above, this moves the
12489                          * pointer to the point just after the first erroneous
12490                          * character (or if there are no flags, to where they
12491                          * should have been) */
12492                         if (RExC_parse >= RExC_end - 4) {
12493                             RExC_parse = RExC_end;
12494                         }
12495                         else if (RExC_parse != save_parse) {
12496                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12497                         }
12498                         vFAIL("Expecting '(?flags:(?[...'");
12499                     }
12500                     RExC_parse++;
12501                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12502                                                     depth+1, oregcomp_parse);
12503
12504                     /* Here, 'current' contains the embedded expression's
12505                      * inversion list, and RExC_parse points to the trailing
12506                      * ']'; the next character should be the ')' which will be
12507                      * paired with the '(' that has been put on the stack, so
12508                      * the whole embedded expression reduces to '(operand)' */
12509                     RExC_parse++;
12510
12511                     RExC_flags = save_flags;
12512                     goto handle_operand;
12513                 }
12514                 /* FALL THROUGH */
12515
12516             default:
12517                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12518                 vFAIL("Unexpected character");
12519
12520             case '\\':
12521                 /* regclass() can only return RESTART_UTF8 if multi-char
12522                    folds are allowed.  */
12523                 if (!regclass(pRExC_state, flagp,depth+1,
12524                               TRUE, /* means parse just the next thing */
12525                               FALSE, /* don't allow multi-char folds */
12526                               FALSE, /* don't silence non-portable warnings.  */
12527                               &current))
12528                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12529                           (UV) *flagp);
12530                 /* regclass() will return with parsing just the \ sequence,
12531                  * leaving the parse pointer at the next thing to parse */
12532                 RExC_parse--;
12533                 goto handle_operand;
12534
12535             case '[':   /* Is a bracketed character class */
12536             {
12537                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12538
12539                 if (! is_posix_class) {
12540                     RExC_parse++;
12541                 }
12542
12543                 /* regclass() can only return RESTART_UTF8 if multi-char
12544                    folds are allowed.  */
12545                 if(!regclass(pRExC_state, flagp,depth+1,
12546                              is_posix_class, /* parse the whole char class
12547                                                 only if not a posix class */
12548                              FALSE, /* don't allow multi-char folds */
12549                              FALSE, /* don't silence non-portable warnings.  */
12550                              &current))
12551                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12552                           (UV) *flagp);
12553                 /* function call leaves parse pointing to the ']', except if we
12554                  * faked it */
12555                 if (is_posix_class) {
12556                     RExC_parse--;
12557                 }
12558
12559                 goto handle_operand;
12560             }
12561
12562             case '&':
12563             case '|':
12564             case '+':
12565             case '-':
12566             case '^':
12567                 if (top_index < 0
12568                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12569                     || ! IS_OPERAND(*top_ptr))
12570                 {
12571                     RExC_parse++;
12572                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12573                 }
12574                 av_push(stack, newSVuv(curchar));
12575                 break;
12576
12577             case '!':
12578                 av_push(stack, newSVuv(curchar));
12579                 break;
12580
12581             case '(':
12582                 if (top_index >= 0) {
12583                     top_ptr = av_fetch(stack, top_index, FALSE);
12584                     assert(top_ptr);
12585                     if (IS_OPERAND(*top_ptr)) {
12586                         RExC_parse++;
12587                         vFAIL("Unexpected '(' with no preceding operator");
12588                     }
12589                 }
12590                 av_push(stack, newSVuv(curchar));
12591                 break;
12592
12593             case ')':
12594             {
12595                 SV* lparen;
12596                 if (top_index < 1
12597                     || ! (current = av_pop(stack))
12598                     || ! IS_OPERAND(current)
12599                     || ! (lparen = av_pop(stack))
12600                     || IS_OPERAND(lparen)
12601                     || SvUV(lparen) != '(')
12602                 {
12603                     SvREFCNT_dec(current);
12604                     RExC_parse++;
12605                     vFAIL("Unexpected ')'");
12606                 }
12607                 top_index -= 2;
12608                 SvREFCNT_dec_NN(lparen);
12609
12610                 /* FALL THROUGH */
12611             }
12612
12613               handle_operand:
12614
12615                 /* Here, we have an operand to process, in 'current' */
12616
12617                 if (top_index < 0) {    /* Just push if stack is empty */
12618                     av_push(stack, current);
12619                 }
12620                 else {
12621                     SV* top = av_pop(stack);
12622                     SV *prev = NULL;
12623                     char current_operator;
12624
12625                     if (IS_OPERAND(top)) {
12626                         SvREFCNT_dec_NN(top);
12627                         SvREFCNT_dec_NN(current);
12628                         vFAIL("Operand with no preceding operator");
12629                     }
12630                     current_operator = (char) SvUV(top);
12631                     switch (current_operator) {
12632                         case '(':   /* Push the '(' back on followed by the new
12633                                        operand */
12634                             av_push(stack, top);
12635                             av_push(stack, current);
12636                             SvREFCNT_inc(top);  /* Counters the '_dec' done
12637                                                    just after the 'break', so
12638                                                    it doesn't get wrongly freed
12639                                                  */
12640                             break;
12641
12642                         case '!':
12643                             _invlist_invert(current);
12644
12645                             /* Unlike binary operators, the top of the stack,
12646                              * now that this unary one has been popped off, may
12647                              * legally be an operator, and we now have operand
12648                              * for it. */
12649                             top_index--;
12650                             SvREFCNT_dec_NN(top);
12651                             goto handle_operand;
12652
12653                         case '&':
12654                             prev = av_pop(stack);
12655                             _invlist_intersection(prev,
12656                                                    current,
12657                                                    &current);
12658                             av_push(stack, current);
12659                             break;
12660
12661                         case '|':
12662                         case '+':
12663                             prev = av_pop(stack);
12664                             _invlist_union(prev, current, &current);
12665                             av_push(stack, current);
12666                             break;
12667
12668                         case '-':
12669                             prev = av_pop(stack);;
12670                             _invlist_subtract(prev, current, &current);
12671                             av_push(stack, current);
12672                             break;
12673
12674                         case '^':   /* The union minus the intersection */
12675                         {
12676                             SV* i = NULL;
12677                             SV* u = NULL;
12678                             SV* element;
12679
12680                             prev = av_pop(stack);
12681                             _invlist_union(prev, current, &u);
12682                             _invlist_intersection(prev, current, &i);
12683                             /* _invlist_subtract will overwrite current
12684                                 without freeing what it already contains */
12685                             element = current;
12686                             _invlist_subtract(u, i, &current);
12687                             av_push(stack, current);
12688                             SvREFCNT_dec_NN(i);
12689                             SvREFCNT_dec_NN(u);
12690                             SvREFCNT_dec_NN(element);
12691                             break;
12692                         }
12693
12694                         default:
12695                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12696                 }
12697                 SvREFCNT_dec_NN(top);
12698                 SvREFCNT_dec(prev);
12699             }
12700         }
12701
12702         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12703     }
12704
12705     if (av_tindex(stack) < 0   /* Was empty */
12706         || ((final = av_pop(stack)) == NULL)
12707         || ! IS_OPERAND(final)
12708         || av_tindex(stack) >= 0)  /* More left on stack */
12709     {
12710         vFAIL("Incomplete expression within '(?[ ])'");
12711     }
12712
12713     /* Here, 'final' is the resultant inversion list from evaluating the
12714      * expression.  Return it if so requested */
12715     if (return_invlist) {
12716         *return_invlist = final;
12717         return END;
12718     }
12719
12720     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12721      * expecting a string of ranges and individual code points */
12722     invlist_iterinit(final);
12723     result_string = newSVpvs("");
12724     while (invlist_iternext(final, &start, &end)) {
12725         if (start == end) {
12726             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12727         }
12728         else {
12729             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12730                                                      start,          end);
12731         }
12732     }
12733
12734     save_parse = RExC_parse;
12735     RExC_parse = SvPV(result_string, len);
12736     save_end = RExC_end;
12737     RExC_end = RExC_parse + len;
12738
12739     /* We turn off folding around the call, as the class we have constructed
12740      * already has all folding taken into consideration, and we don't want
12741      * regclass() to add to that */
12742     RExC_flags &= ~RXf_PMf_FOLD;
12743     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12744      */
12745     node = regclass(pRExC_state, flagp,depth+1,
12746                     FALSE, /* means parse the whole char class */
12747                     FALSE, /* don't allow multi-char folds */
12748                     TRUE, /* silence non-portable warnings.  The above may very
12749                              well have generated non-portable code points, but
12750                              they're valid on this machine */
12751                     NULL);
12752     if (!node)
12753         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12754                     PTR2UV(flagp));
12755     if (save_fold) {
12756         RExC_flags |= RXf_PMf_FOLD;
12757     }
12758     RExC_parse = save_parse + 1;
12759     RExC_end = save_end;
12760     SvREFCNT_dec_NN(final);
12761     SvREFCNT_dec_NN(result_string);
12762
12763     nextchar(pRExC_state);
12764     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12765     return node;
12766 }
12767 #undef IS_OPERAND
12768
12769 /* The names of properties whose definitions are not known at compile time are
12770  * stored in this SV, after a constant heading.  So if the length has been
12771  * changed since initialization, then there is a run-time definition. */
12772 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12773
12774 STATIC regnode *
12775 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12776                  const bool stop_at_1,  /* Just parse the next thing, don't
12777                                            look for a full character class */
12778                  bool allow_multi_folds,
12779                  const bool silence_non_portable,   /* Don't output warnings
12780                                                        about too large
12781                                                        characters */
12782                  SV** ret_invlist)  /* Return an inversion list, not a node */
12783 {
12784     /* parse a bracketed class specification.  Most of these will produce an
12785      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12786      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12787      * under /i with multi-character folds: it will be rewritten following the
12788      * paradigm of this example, where the <multi-fold>s are characters which
12789      * fold to multiple character sequences:
12790      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12791      * gets effectively rewritten as:
12792      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12793      * reg() gets called (recursively) on the rewritten version, and this
12794      * function will return what it constructs.  (Actually the <multi-fold>s
12795      * aren't physically removed from the [abcdefghi], it's just that they are
12796      * ignored in the recursion by means of a flag:
12797      * <RExC_in_multi_char_class>.)
12798      *
12799      * ANYOF nodes contain a bit map for the first 256 characters, with the
12800      * corresponding bit set if that character is in the list.  For characters
12801      * above 255, a range list or swash is used.  There are extra bits for \w,
12802      * etc. in locale ANYOFs, as what these match is not determinable at
12803      * compile time
12804      *
12805      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12806      * to be restarted.  This can only happen if ret_invlist is non-NULL.
12807      */
12808
12809     dVAR;
12810     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12811     IV range = 0;
12812     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12813     regnode *ret;
12814     STRLEN numlen;
12815     IV namedclass = OOB_NAMEDCLASS;
12816     char *rangebegin = NULL;
12817     bool need_class = 0;
12818     SV *listsv = NULL;
12819     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12820                                       than just initialized.  */
12821     SV* properties = NULL;    /* Code points that match \p{} \P{} */
12822     SV* posixes = NULL;     /* Code points that match classes like [:word:],
12823                                extended beyond the Latin1 range.  These have to
12824                                be kept separate from other code points for much
12825                                of this function because their handling  is
12826                                different under /i, and for most classes under
12827                                /d as well */
12828     UV element_count = 0;   /* Number of distinct elements in the class.
12829                                Optimizations may be possible if this is tiny */
12830     AV * multi_char_matches = NULL; /* Code points that fold to more than one
12831                                        character; used under /i */
12832     UV n;
12833     char * stop_ptr = RExC_end;    /* where to stop parsing */
12834     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12835                                                    space? */
12836     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12837
12838     /* Unicode properties are stored in a swash; this holds the current one
12839      * being parsed.  If this swash is the only above-latin1 component of the
12840      * character class, an optimization is to pass it directly on to the
12841      * execution engine.  Otherwise, it is set to NULL to indicate that there
12842      * are other things in the class that have to be dealt with at execution
12843      * time */
12844     SV* swash = NULL;           /* Code points that match \p{} \P{} */
12845
12846     /* Set if a component of this character class is user-defined; just passed
12847      * on to the engine */
12848     bool has_user_defined_property = FALSE;
12849
12850     /* inversion list of code points this node matches only when the target
12851      * string is in UTF-8.  (Because is under /d) */
12852     SV* depends_list = NULL;
12853
12854     /* inversion list of code points this node matches.  For much of the
12855      * function, it includes only those that match regardless of the utf8ness
12856      * of the target string */
12857     SV* cp_list = NULL;
12858
12859 #ifdef EBCDIC
12860     /* In a range, counts how many 0-2 of the ends of it came from literals,
12861      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12862     UV literal_endpoint = 0;
12863 #endif
12864     bool invert = FALSE;    /* Is this class to be complemented */
12865
12866     /* Is there any thing like \W or [:^digit:] that matches above the legal
12867      * Unicode range? */
12868     bool runtime_posix_matches_above_Unicode = FALSE;
12869
12870     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12871         case we need to change the emitted regop to an EXACT. */
12872     const char * orig_parse = RExC_parse;
12873     const SSize_t orig_size = RExC_size;
12874     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
12875     GET_RE_DEBUG_FLAGS_DECL;
12876
12877     PERL_ARGS_ASSERT_REGCLASS;
12878 #ifndef DEBUGGING
12879     PERL_UNUSED_ARG(depth);
12880 #endif
12881
12882     DEBUG_PARSE("clas");
12883
12884     /* Assume we are going to generate an ANYOF node. */
12885     ret = reganode(pRExC_state, ANYOF, 0);
12886
12887     if (SIZE_ONLY) {
12888         RExC_size += ANYOF_SKIP;
12889         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12890     }
12891     else {
12892         ANYOF_FLAGS(ret) = 0;
12893
12894         RExC_emit += ANYOF_SKIP;
12895         if (LOC) {
12896             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12897         }
12898         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12899         initial_listsv_len = SvCUR(listsv);
12900         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12901     }
12902
12903     if (skip_white) {
12904         RExC_parse = regpatws(pRExC_state, RExC_parse,
12905                               FALSE /* means don't recognize comments */);
12906     }
12907
12908     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12909         RExC_parse++;
12910         invert = TRUE;
12911         allow_multi_folds = FALSE;
12912         RExC_naughty++;
12913         if (skip_white) {
12914             RExC_parse = regpatws(pRExC_state, RExC_parse,
12915                                   FALSE /* means don't recognize comments */);
12916         }
12917     }
12918
12919     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12920     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12921         const char *s = RExC_parse;
12922         const char  c = *s++;
12923
12924         while (isWORDCHAR(*s))
12925             s++;
12926         if (*s && c == *s && s[1] == ']') {
12927             SAVEFREESV(RExC_rx_sv);
12928             ckWARN3reg(s+2,
12929                        "POSIX syntax [%c %c] belongs inside character classes",
12930                        c, c);
12931             (void)ReREFCNT_inc(RExC_rx_sv);
12932         }
12933     }
12934
12935     /* If the caller wants us to just parse a single element, accomplish this
12936      * by faking the loop ending condition */
12937     if (stop_at_1 && RExC_end > RExC_parse) {
12938         stop_ptr = RExC_parse + 1;
12939     }
12940
12941     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12942     if (UCHARAT(RExC_parse) == ']')
12943         goto charclassloop;
12944
12945 parseit:
12946     while (1) {
12947         if  (RExC_parse >= stop_ptr) {
12948             break;
12949         }
12950
12951         if (skip_white) {
12952             RExC_parse = regpatws(pRExC_state, RExC_parse,
12953                                   FALSE /* means don't recognize comments */);
12954         }
12955
12956         if  (UCHARAT(RExC_parse) == ']') {
12957             break;
12958         }
12959
12960     charclassloop:
12961
12962         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12963         save_value = value;
12964         save_prevvalue = prevvalue;
12965
12966         if (!range) {
12967             rangebegin = RExC_parse;
12968             element_count++;
12969         }
12970         if (UTF) {
12971             value = utf8n_to_uvchr((U8*)RExC_parse,
12972                                    RExC_end - RExC_parse,
12973                                    &numlen, UTF8_ALLOW_DEFAULT);
12974             RExC_parse += numlen;
12975         }
12976         else
12977             value = UCHARAT(RExC_parse++);
12978
12979         if (value == '['
12980             && RExC_parse < RExC_end
12981             && POSIXCC(UCHARAT(RExC_parse)))
12982         {
12983             namedclass = regpposixcc(pRExC_state, value, strict);
12984         }
12985         else if (value == '\\') {
12986             if (UTF) {
12987                 value = utf8n_to_uvchr((U8*)RExC_parse,
12988                                    RExC_end - RExC_parse,
12989                                    &numlen, UTF8_ALLOW_DEFAULT);
12990                 RExC_parse += numlen;
12991             }
12992             else
12993                 value = UCHARAT(RExC_parse++);
12994
12995             /* Some compilers cannot handle switching on 64-bit integer
12996              * values, therefore value cannot be an UV.  Yes, this will
12997              * be a problem later if we want switch on Unicode.
12998              * A similar issue a little bit later when switching on
12999              * namedclass. --jhi */
13000
13001             /* If the \ is escaping white space when white space is being
13002              * skipped, it means that that white space is wanted literally, and
13003              * is already in 'value'.  Otherwise, need to translate the escape
13004              * into what it signifies. */
13005             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13006
13007             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13008             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13009             case 's':   namedclass = ANYOF_SPACE;       break;
13010             case 'S':   namedclass = ANYOF_NSPACE;      break;
13011             case 'd':   namedclass = ANYOF_DIGIT;       break;
13012             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13013             case 'v':   namedclass = ANYOF_VERTWS;      break;
13014             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13015             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13016             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13017             case 'N':  /* Handle \N{NAME} in class */
13018                 {
13019                     /* We only pay attention to the first char of
13020                     multichar strings being returned. I kinda wonder
13021                     if this makes sense as it does change the behaviour
13022                     from earlier versions, OTOH that behaviour was broken
13023                     as well. */
13024                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13025                                       TRUE, /* => charclass */
13026                                       strict))
13027                     {
13028                         if (*flagp & RESTART_UTF8)
13029                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13030                         goto parseit;
13031                     }
13032                 }
13033                 break;
13034             case 'p':
13035             case 'P':
13036                 {
13037                 char *e;
13038
13039                 /* We will handle any undefined properties ourselves */
13040                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13041                                        /* And we actually would prefer to get
13042                                         * the straight inversion list of the
13043                                         * swash, since we will be accessing it
13044                                         * anyway, to save a little time */
13045                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13046
13047                 if (RExC_parse >= RExC_end)
13048                     vFAIL2("Empty \\%c{}", (U8)value);
13049                 if (*RExC_parse == '{') {
13050                     const U8 c = (U8)value;
13051                     e = strchr(RExC_parse++, '}');
13052                     if (!e)
13053                         vFAIL2("Missing right brace on \\%c{}", c);
13054                     while (isSPACE(UCHARAT(RExC_parse)))
13055                         RExC_parse++;
13056                     if (e == RExC_parse)
13057                         vFAIL2("Empty \\%c{}", c);
13058                     n = e - RExC_parse;
13059                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13060                         n--;
13061                 }
13062                 else {
13063                     e = RExC_parse;
13064                     n = 1;
13065                 }
13066                 if (!SIZE_ONLY) {
13067                     SV* invlist;
13068                     char* formatted;
13069                     char* name;
13070
13071                     if (UCHARAT(RExC_parse) == '^') {
13072                          RExC_parse++;
13073                          n--;
13074                          /* toggle.  (The rhs xor gets the single bit that
13075                           * differs between P and p; the other xor inverts just
13076                           * that bit) */
13077                          value ^= 'P' ^ 'p';
13078
13079                          while (isSPACE(UCHARAT(RExC_parse))) {
13080                               RExC_parse++;
13081                               n--;
13082                          }
13083                     }
13084                     /* Try to get the definition of the property into
13085                      * <invlist>.  If /i is in effect, the effective property
13086                      * will have its name be <__NAME_i>.  The design is
13087                      * discussed in commit
13088                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13089                     formatted = Perl_form(aTHX_
13090                                           "%s%.*s%s\n",
13091                                           (FOLD) ? "__" : "",
13092                                           (int)n,
13093                                           RExC_parse,
13094                                           (FOLD) ? "_i" : ""
13095                                 );
13096                     name = savepvn(formatted, strlen(formatted));
13097
13098                     /* Look up the property name, and get its swash and
13099                      * inversion list, if the property is found  */
13100                     if (swash) {
13101                         SvREFCNT_dec_NN(swash);
13102                     }
13103                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13104                                              1, /* binary */
13105                                              0, /* not tr/// */
13106                                              NULL, /* No inversion list */
13107                                              &swash_init_flags
13108                                             );
13109                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13110                         if (swash) {
13111                             SvREFCNT_dec_NN(swash);
13112                             swash = NULL;
13113                         }
13114
13115                         /* Here didn't find it.  It could be a user-defined
13116                          * property that will be available at run-time.  If we
13117                          * accept only compile-time properties, is an error;
13118                          * otherwise add it to the list for run-time look up */
13119                         if (ret_invlist) {
13120                             RExC_parse = e + 1;
13121                             vFAIL2utf8f(
13122                                 "Property '%"UTF8f"' is unknown",
13123                                 UTF8fARG(UTF, n, name));
13124                         }
13125                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13126                                         (value == 'p' ? '+' : '!'),
13127                                         UTF8fARG(UTF, n, name));
13128                         has_user_defined_property = TRUE;
13129
13130                         /* We don't know yet, so have to assume that the
13131                          * property could match something in the Latin1 range,
13132                          * hence something that isn't utf8.  Note that this
13133                          * would cause things in <depends_list> to match
13134                          * inappropriately, except that any \p{}, including
13135                          * this one forces Unicode semantics, which means there
13136                          * is <no depends_list> */
13137                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13138                     }
13139                     else {
13140
13141                         /* Here, did get the swash and its inversion list.  If
13142                          * the swash is from a user-defined property, then this
13143                          * whole character class should be regarded as such */
13144                         has_user_defined_property =
13145                                     (swash_init_flags
13146                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
13147
13148                         /* Invert if asking for the complement */
13149                         if (value == 'P') {
13150                             _invlist_union_complement_2nd(properties,
13151                                                           invlist,
13152                                                           &properties);
13153
13154                             /* The swash can't be used as-is, because we've
13155                              * inverted things; delay removing it to here after
13156                              * have copied its invlist above */
13157                             SvREFCNT_dec_NN(swash);
13158                             swash = NULL;
13159                         }
13160                         else {
13161                             _invlist_union(properties, invlist, &properties);
13162                         }
13163                     }
13164                     Safefree(name);
13165                 }
13166                 RExC_parse = e + 1;
13167                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13168                                                 named */
13169
13170                 /* \p means they want Unicode semantics */
13171                 RExC_uni_semantics = 1;
13172                 }
13173                 break;
13174             case 'n':   value = '\n';                   break;
13175             case 'r':   value = '\r';                   break;
13176             case 't':   value = '\t';                   break;
13177             case 'f':   value = '\f';                   break;
13178             case 'b':   value = '\b';                   break;
13179             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13180             case 'a':   value = '\a';                   break;
13181             case 'o':
13182                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13183                 {
13184                     const char* error_msg;
13185                     bool valid = grok_bslash_o(&RExC_parse,
13186                                                &value,
13187                                                &error_msg,
13188                                                SIZE_ONLY,   /* warnings in pass
13189                                                                1 only */
13190                                                strict,
13191                                                silence_non_portable,
13192                                                UTF);
13193                     if (! valid) {
13194                         vFAIL(error_msg);
13195                     }
13196                 }
13197                 if (PL_encoding && value < 0x100) {
13198                     goto recode_encoding;
13199                 }
13200                 break;
13201             case 'x':
13202                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13203                 {
13204                     const char* error_msg;
13205                     bool valid = grok_bslash_x(&RExC_parse,
13206                                                &value,
13207                                                &error_msg,
13208                                                TRUE, /* Output warnings */
13209                                                strict,
13210                                                silence_non_portable,
13211                                                UTF);
13212                     if (! valid) {
13213                         vFAIL(error_msg);
13214                     }
13215                 }
13216                 if (PL_encoding && value < 0x100)
13217                     goto recode_encoding;
13218                 break;
13219             case 'c':
13220                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13221                 break;
13222             case '0': case '1': case '2': case '3': case '4':
13223             case '5': case '6': case '7':
13224                 {
13225                     /* Take 1-3 octal digits */
13226                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13227                     numlen = (strict) ? 4 : 3;
13228                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13229                     RExC_parse += numlen;
13230                     if (numlen != 3) {
13231                         if (strict) {
13232                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13233                             vFAIL("Need exactly 3 octal digits");
13234                         }
13235                         else if (! SIZE_ONLY /* like \08, \178 */
13236                                  && numlen < 3
13237                                  && RExC_parse < RExC_end
13238                                  && isDIGIT(*RExC_parse)
13239                                  && ckWARN(WARN_REGEXP))
13240                         {
13241                             SAVEFREESV(RExC_rx_sv);
13242                             reg_warn_non_literal_string(
13243                                  RExC_parse + 1,
13244                                  form_short_octal_warning(RExC_parse, numlen));
13245                             (void)ReREFCNT_inc(RExC_rx_sv);
13246                         }
13247                     }
13248                     if (PL_encoding && value < 0x100)
13249                         goto recode_encoding;
13250                     break;
13251                 }
13252             recode_encoding:
13253                 if (! RExC_override_recoding) {
13254                     SV* enc = PL_encoding;
13255                     value = reg_recode((const char)(U8)value, &enc);
13256                     if (!enc) {
13257                         if (strict) {
13258                             vFAIL("Invalid escape in the specified encoding");
13259                         }
13260                         else if (SIZE_ONLY) {
13261                             ckWARNreg(RExC_parse,
13262                                   "Invalid escape in the specified encoding");
13263                         }
13264                     }
13265                     break;
13266                 }
13267             default:
13268                 /* Allow \_ to not give an error */
13269                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13270                     if (strict) {
13271                         vFAIL2("Unrecognized escape \\%c in character class",
13272                                (int)value);
13273                     }
13274                     else {
13275                         SAVEFREESV(RExC_rx_sv);
13276                         ckWARN2reg(RExC_parse,
13277                             "Unrecognized escape \\%c in character class passed through",
13278                             (int)value);
13279                         (void)ReREFCNT_inc(RExC_rx_sv);
13280                     }
13281                 }
13282                 break;
13283             }   /* End of switch on char following backslash */
13284         } /* end of handling backslash escape sequences */
13285 #ifdef EBCDIC
13286         else
13287             literal_endpoint++;
13288 #endif
13289
13290         /* Here, we have the current token in 'value' */
13291
13292         /* What matches in a locale is not known until runtime.  This includes
13293          * what the Posix classes (like \w, [:space:]) match.  Room must be
13294          * reserved (one time per outer bracketed class) to store such classes,
13295          * either if Perl is compiled so that locale nodes always should have
13296          * this space, or if there is such posix class info to be stored.  The
13297          * space will contain a bit for each named class that is to be matched
13298          * against.  This isn't needed for \p{} and pseudo-classes, as they are
13299          * not affected by locale, and hence are dealt with separately */
13300         if (LOC
13301             && ! need_class
13302             && (ANYOF_LOCALE == ANYOF_POSIXL
13303                 || (namedclass > OOB_NAMEDCLASS
13304                     && namedclass < ANYOF_POSIXL_MAX)))
13305         {
13306             need_class = 1;
13307             if (SIZE_ONLY) {
13308                 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13309             }
13310             else {
13311                 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13312             }
13313             ANYOF_POSIXL_ZERO(ret);
13314             ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13315         }
13316
13317         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13318             U8 classnum;
13319
13320             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13321              * literal, as is the character that began the false range, i.e.
13322              * the 'a' in the examples */
13323             if (range) {
13324                 if (!SIZE_ONLY) {
13325                     const int w = (RExC_parse >= rangebegin)
13326                                   ? RExC_parse - rangebegin
13327                                   : 0;
13328                     if (strict) {
13329                         vFAIL2utf8f(
13330                             "False [] range \"%"UTF8f"\"",
13331                             UTF8fARG(UTF, w, rangebegin));
13332                     }
13333                     else {
13334                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13335                         ckWARN2reg(RExC_parse,
13336                             "False [] range \"%"UTF8f"\"",
13337                             UTF8fARG(UTF, w, rangebegin));
13338                         (void)ReREFCNT_inc(RExC_rx_sv);
13339                         cp_list = add_cp_to_invlist(cp_list, '-');
13340                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
13341                     }
13342                 }
13343
13344                 range = 0; /* this was not a true range */
13345                 element_count += 2; /* So counts for three values */
13346             }
13347
13348             classnum = namedclass_to_classnum(namedclass);
13349
13350             if (LOC && namedclass < ANYOF_POSIXL_MAX
13351 #ifndef HAS_ISASCII
13352                 && classnum != _CC_ASCII
13353 #endif
13354 #ifndef HAS_ISBLANK
13355                 && classnum != _CC_BLANK
13356 #endif
13357             ) {
13358                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13359                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13360                                                             ? -1
13361                                                             : 1)))
13362                 {
13363                     posixl_matches_all = TRUE;
13364                     break;
13365                 }
13366                 ANYOF_POSIXL_SET(ret, namedclass);
13367             }
13368             /* XXX After have made all the posix classes known at compile time
13369              * we can move the LOC handling below to above */
13370
13371             if (! SIZE_ONLY) {
13372                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13373                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13374
13375                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
13376                          * /l make a difference in what these match.  There
13377                          * would be problems if these characters had folds
13378                          * other than themselves, as cp_list is subject to
13379                          * folding. */
13380                         if (classnum != _CC_VERTSPACE) {
13381                             assert(   namedclass == ANYOF_HORIZWS
13382                                    || namedclass == ANYOF_NHORIZWS);
13383
13384                             /* It turns out that \h is just a synonym for
13385                              * XPosixBlank */
13386                             classnum = _CC_BLANK;
13387                         }
13388
13389                         _invlist_union_maybe_complement_2nd(
13390                                 cp_list,
13391                                 PL_XPosix_ptrs[classnum],
13392                                 cBOOL(namedclass % 2), /* Complement if odd
13393                                                           (NHORIZWS, NVERTWS)
13394                                                         */
13395                                 &cp_list);
13396                     }
13397                 }
13398                 else if (classnum == _CC_ASCII) {
13399 #ifdef HAS_ISASCII
13400                     if (LOC) {
13401                         ANYOF_POSIXL_SET(ret, namedclass);
13402                     }
13403                     else
13404 #endif  /* Not isascii(); just use the hard-coded definition for it */
13405                     {
13406                         _invlist_union_maybe_complement_2nd(
13407                                 posixes,
13408                                 PL_Posix_ptrs[_CC_ASCII],
13409                                 cBOOL(namedclass % 2), /* Complement if odd
13410                                                           (NASCII) */
13411                                 &posixes);
13412
13413                         /* The code points 128-255 added above will be
13414                          * subtracted out below under /d, so the flag needs to
13415                          * be set */
13416                         if (namedclass == ANYOF_NASCII && DEPENDS_SEMANTICS) {
13417                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13418                         }
13419                     }
13420                 }
13421                 else {  /* Garden variety class */
13422
13423                     /* The ascii range inversion list */
13424                     SV* ascii_source = PL_Posix_ptrs[classnum];
13425
13426                     /* The full Latin1 range inversion list */
13427                     SV* l1_source = PL_L1Posix_ptrs[classnum];
13428
13429                     /* This code is structured into two major clauses.  The
13430                      * first is for classes whose complete definitions may not
13431                      * already be known.  If not, the Latin1 definition
13432                      * (guaranteed to already known) is used plus code is
13433                      * generated to load the rest at run-time (only if needed).
13434                      * If the complete definition is known, it drops down to
13435                      * the second clause, where the complete definition is
13436                      * known */
13437
13438                     if (classnum < _FIRST_NON_SWASH_CC) {
13439
13440                         /* Here, the class has a swash, which may or not
13441                          * already be loaded */
13442
13443                         /* The name of the property to use to match the full
13444                          * eXtended Unicode range swash for this character
13445                          * class */
13446                         const char *Xname = swash_property_names[classnum];
13447
13448                         /* If returning the inversion list, we can't defer
13449                          * getting this until runtime */
13450                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
13451                             PL_utf8_swash_ptrs[classnum] =
13452                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
13453                                              1, /* binary */
13454                                              0, /* not tr/// */
13455                                              NULL, /* No inversion list */
13456                                              NULL  /* No flags */
13457                                             );
13458                             assert(PL_utf8_swash_ptrs[classnum]);
13459                         }
13460                         if ( !  PL_utf8_swash_ptrs[classnum]) {
13461                             if (namedclass % 2 == 0) { /* A non-complemented
13462                                                           class */
13463                                 /* If not /a matching, there are code points we
13464                                  * don't know at compile time.  Arrange for the
13465                                  * unknown matches to be loaded at run-time, if
13466                                  * needed */
13467                                 if (! AT_LEAST_ASCII_RESTRICTED) {
13468                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
13469                                                                  Xname);
13470                                 }
13471                                 if (LOC) {  /* Under locale, set run-time
13472                                                lookup */
13473                                     ANYOF_POSIXL_SET(ret, namedclass);
13474                                 }
13475                                 else {
13476                                     /* Add the current class's code points to
13477                                      * the running total */
13478                                     _invlist_union(posixes,
13479                                                    (AT_LEAST_ASCII_RESTRICTED)
13480                                                         ? ascii_source
13481                                                         : l1_source,
13482                                                    &posixes);
13483                                 }
13484                             }
13485                             else {  /* A complemented class */
13486                                 if (AT_LEAST_ASCII_RESTRICTED) {
13487                                     /* Under /a should match everything above
13488                                      * ASCII, plus the complement of the set's
13489                                      * ASCII matches */
13490                                     _invlist_union_complement_2nd(posixes,
13491                                                                   ascii_source,
13492                                                                   &posixes);
13493                                 }
13494                                 else {
13495                                     /* Arrange for the unknown matches to be
13496                                      * loaded at run-time, if needed */
13497                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
13498                                                                  Xname);
13499                                     runtime_posix_matches_above_Unicode = TRUE;
13500                                     if (LOC) {
13501                                         ANYOF_POSIXL_SET(ret, namedclass);
13502                                     }
13503                                     else {
13504
13505                                         /* We want to match everything in
13506                                          * Latin1, except those things that
13507                                          * l1_source matches */
13508                                         SV* scratch_list = NULL;
13509                                         _invlist_subtract(PL_Latin1, l1_source,
13510                                                           &scratch_list);
13511
13512                                         /* Add the list from this class to the
13513                                          * running total */
13514                                         if (! posixes) {
13515                                             posixes = scratch_list;
13516                                         }
13517                                         else {
13518                                             _invlist_union(posixes,
13519                                                            scratch_list,
13520                                                            &posixes);
13521                                             SvREFCNT_dec_NN(scratch_list);
13522                                         }
13523                                         if (DEPENDS_SEMANTICS) {
13524                                             ANYOF_FLAGS(ret)
13525                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
13526                                         }
13527                                     }
13528                                 }
13529                             }
13530                             goto namedclass_done;
13531                         }
13532
13533                         /* Here, there is a swash loaded for the class.  If no
13534                          * inversion list for it yet, get it */
13535                         if (! PL_XPosix_ptrs[classnum]) {
13536                             PL_XPosix_ptrs[classnum]
13537                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
13538                         }
13539                     }
13540
13541                     /* Here there is an inversion list already loaded for the
13542                      * entire class */
13543
13544                     if (namedclass % 2 == 0) {  /* A non-complemented class,
13545                                                    like ANYOF_PUNCT */
13546                         if (! LOC) {
13547                             /* For non-locale, just add it to any existing list
13548                              * */
13549                             _invlist_union(posixes,
13550                                            (AT_LEAST_ASCII_RESTRICTED)
13551                                                ? ascii_source
13552                                                : PL_XPosix_ptrs[classnum],
13553                                            &posixes);
13554                         }
13555                         else {  /* Locale */
13556                             SV* scratch_list = NULL;
13557
13558                             /* For above Latin1 code points, we use the full
13559                              * Unicode range */
13560                             _invlist_intersection(PL_AboveLatin1,
13561                                                   PL_XPosix_ptrs[classnum],
13562                                                   &scratch_list);
13563                             /* And set the output to it, adding instead if
13564                              * there already is an output.  Checking if
13565                              * 'posixes' is NULL first saves an extra clone.
13566                              * Its reference count will be decremented at the
13567                              * next union, etc, or if this is the only
13568                              * instance, at the end of the routine */
13569                             if (! posixes) {
13570                                 posixes = scratch_list;
13571                             }
13572                             else {
13573                                 _invlist_union(posixes, scratch_list, &posixes);
13574                                 SvREFCNT_dec_NN(scratch_list);
13575                             }
13576
13577 #ifndef HAS_ISBLANK
13578                             if (namedclass != ANYOF_BLANK) {
13579 #endif
13580                                 /* Set this class in the node for runtime
13581                                  * matching */
13582                                 ANYOF_POSIXL_SET(ret, namedclass);
13583 #ifndef HAS_ISBLANK
13584                             }
13585                             else {
13586                                 /* No isblank(), use the hard-coded ASCII-range
13587                                  * blanks, adding them to the running total. */
13588
13589                                 _invlist_union(posixes, ascii_source, &posixes);
13590                             }
13591 #endif
13592                         }
13593                     }
13594                     else {  /* A complemented class, like ANYOF_NPUNCT */
13595                         if (! LOC) {
13596                             _invlist_union_complement_2nd(
13597                                                 posixes,
13598                                                 (AT_LEAST_ASCII_RESTRICTED)
13599                                                     ? ascii_source
13600                                                     : PL_XPosix_ptrs[classnum],
13601                                                 &posixes);
13602                             /* Under /d, everything in the upper half of the
13603                              * Latin1 range matches this complement */
13604                             if (DEPENDS_SEMANTICS) {
13605                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13606                             }
13607                         }
13608                         else {  /* Locale */
13609                             SV* scratch_list = NULL;
13610                             _invlist_subtract(PL_AboveLatin1,
13611                                               PL_XPosix_ptrs[classnum],
13612                                               &scratch_list);
13613                             if (! posixes) {
13614                                 posixes = scratch_list;
13615                             }
13616                             else {
13617                                 _invlist_union(posixes, scratch_list, &posixes);
13618                                 SvREFCNT_dec_NN(scratch_list);
13619                             }
13620 #ifndef HAS_ISBLANK
13621                             if (namedclass != ANYOF_NBLANK) {
13622 #endif
13623                                 ANYOF_POSIXL_SET(ret, namedclass);
13624 #ifndef HAS_ISBLANK
13625                             }
13626                             else {
13627                                 /* Get the list of all code points in Latin1
13628                                  * that are not ASCII blanks, and add them to
13629                                  * the running total */
13630                                 _invlist_subtract(PL_Latin1, ascii_source,
13631                                                   &scratch_list);
13632                                 _invlist_union(posixes, scratch_list, &posixes);
13633                                 SvREFCNT_dec_NN(scratch_list);
13634                             }
13635 #endif
13636                         }
13637                     }
13638                 }
13639               namedclass_done:
13640                 continue;   /* Go get next character */
13641             }
13642         } /* end of namedclass \blah */
13643
13644         /* Here, we have a single value.  If 'range' is set, it is the ending
13645          * of a range--check its validity.  Later, we will handle each
13646          * individual code point in the range.  If 'range' isn't set, this
13647          * could be the beginning of a range, so check for that by looking
13648          * ahead to see if the next real character to be processed is the range
13649          * indicator--the minus sign */
13650
13651         if (skip_white) {
13652             RExC_parse = regpatws(pRExC_state, RExC_parse,
13653                                 FALSE /* means don't recognize comments */);
13654         }
13655
13656         if (range) {
13657             if (prevvalue > value) /* b-a */ {
13658                 const int w = RExC_parse - rangebegin;
13659                 vFAIL2utf8f(
13660                     "Invalid [] range \"%"UTF8f"\"",
13661                     UTF8fARG(UTF, w, rangebegin));
13662                 range = 0; /* not a valid range */
13663             }
13664         }
13665         else {
13666             prevvalue = value; /* save the beginning of the potential range */
13667             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13668                 && *RExC_parse == '-')
13669             {
13670                 char* next_char_ptr = RExC_parse + 1;
13671                 if (skip_white) {   /* Get the next real char after the '-' */
13672                     next_char_ptr = regpatws(pRExC_state,
13673                                              RExC_parse + 1,
13674                                              FALSE); /* means don't recognize
13675                                                         comments */
13676                 }
13677
13678                 /* If the '-' is at the end of the class (just before the ']',
13679                  * it is a literal minus; otherwise it is a range */
13680                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13681                     RExC_parse = next_char_ptr;
13682
13683                     /* a bad range like \w-, [:word:]- ? */
13684                     if (namedclass > OOB_NAMEDCLASS) {
13685                         if (strict || ckWARN(WARN_REGEXP)) {
13686                             const int w =
13687                                 RExC_parse >= rangebegin ?
13688                                 RExC_parse - rangebegin : 0;
13689                             if (strict) {
13690                                 vFAIL4("False [] range \"%*.*s\"",
13691                                     w, w, rangebegin);
13692                             }
13693                             else {
13694                                 vWARN4(RExC_parse,
13695                                     "False [] range \"%*.*s\"",
13696                                     w, w, rangebegin);
13697                             }
13698                         }
13699                         if (!SIZE_ONLY) {
13700                             cp_list = add_cp_to_invlist(cp_list, '-');
13701                         }
13702                         element_count++;
13703                     } else
13704                         range = 1;      /* yeah, it's a range! */
13705                     continue;   /* but do it the next time */
13706                 }
13707             }
13708         }
13709
13710         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13711          * if not */
13712
13713         /* non-Latin1 code point implies unicode semantics.  Must be set in
13714          * pass1 so is there for the whole of pass 2 */
13715         if (value > 255) {
13716             RExC_uni_semantics = 1;
13717         }
13718
13719         /* Ready to process either the single value, or the completed range.
13720          * For single-valued non-inverted ranges, we consider the possibility
13721          * of multi-char folds.  (We made a conscious decision to not do this
13722          * for the other cases because it can often lead to non-intuitive
13723          * results.  For example, you have the peculiar case that:
13724          *  "s s" =~ /^[^\xDF]+$/i => Y
13725          *  "ss"  =~ /^[^\xDF]+$/i => N
13726          *
13727          * See [perl #89750] */
13728         if (FOLD && allow_multi_folds && value == prevvalue) {
13729             if (value == LATIN_SMALL_LETTER_SHARP_S
13730                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13731                                                         value)))
13732             {
13733                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13734
13735                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13736                 STRLEN foldlen;
13737
13738                 UV folded = _to_uni_fold_flags(
13739                                 value,
13740                                 foldbuf,
13741                                 &foldlen,
13742                                 FOLD_FLAGS_FULL
13743                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
13744                                             : (ASCII_FOLD_RESTRICTED)
13745                                               ? FOLD_FLAGS_NOMIX_ASCII
13746                                               : 0)
13747                                 );
13748
13749                 /* Here, <folded> should be the first character of the
13750                  * multi-char fold of <value>, with <foldbuf> containing the
13751                  * whole thing.  But, if this fold is not allowed (because of
13752                  * the flags), <fold> will be the same as <value>, and should
13753                  * be processed like any other character, so skip the special
13754                  * handling */
13755                 if (folded != value) {
13756
13757                     /* Skip if we are recursed, currently parsing the class
13758                      * again.  Otherwise add this character to the list of
13759                      * multi-char folds. */
13760                     if (! RExC_in_multi_char_class) {
13761                         AV** this_array_ptr;
13762                         AV* this_array;
13763                         STRLEN cp_count = utf8_length(foldbuf,
13764                                                       foldbuf + foldlen);
13765                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13766
13767                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13768
13769
13770                         if (! multi_char_matches) {
13771                             multi_char_matches = newAV();
13772                         }
13773
13774                         /* <multi_char_matches> is actually an array of arrays.
13775                          * There will be one or two top-level elements: [2],
13776                          * and/or [3].  The [2] element is an array, each
13777                          * element thereof is a character which folds to TWO
13778                          * characters; [3] is for folds to THREE characters.
13779                          * (Unicode guarantees a maximum of 3 characters in any
13780                          * fold.)  When we rewrite the character class below,
13781                          * we will do so such that the longest folds are
13782                          * written first, so that it prefers the longest
13783                          * matching strings first.  This is done even if it
13784                          * turns out that any quantifier is non-greedy, out of
13785                          * programmer laziness.  Tom Christiansen has agreed
13786                          * that this is ok.  This makes the test for the
13787                          * ligature 'ffi' come before the test for 'ff' */
13788                         if (av_exists(multi_char_matches, cp_count)) {
13789                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
13790                                                              cp_count, FALSE);
13791                             this_array = *this_array_ptr;
13792                         }
13793                         else {
13794                             this_array = newAV();
13795                             av_store(multi_char_matches, cp_count,
13796                                      (SV*) this_array);
13797                         }
13798                         av_push(this_array, multi_fold);
13799                     }
13800
13801                     /* This element should not be processed further in this
13802                      * class */
13803                     element_count--;
13804                     value = save_value;
13805                     prevvalue = save_prevvalue;
13806                     continue;
13807                 }
13808             }
13809         }
13810
13811         /* Deal with this element of the class */
13812         if (! SIZE_ONLY) {
13813 #ifndef EBCDIC
13814             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13815 #else
13816             SV* this_range = _new_invlist(1);
13817             _append_range_to_invlist(this_range, prevvalue, value);
13818
13819             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13820              * If this range was specified using something like 'i-j', we want
13821              * to include only the 'i' and the 'j', and not anything in
13822              * between, so exclude non-ASCII, non-alphabetics from it.
13823              * However, if the range was specified with something like
13824              * [\x89-\x91] or [\x89-j], all code points within it should be
13825              * included.  literal_endpoint==2 means both ends of the range used
13826              * a literal character, not \x{foo} */
13827             if (literal_endpoint == 2
13828                 && ((prevvalue >= 'a' && value <= 'z')
13829                     || (prevvalue >= 'A' && value <= 'Z')))
13830             {
13831                 _invlist_intersection(this_range, PL_ASCII,
13832                                       &this_range);
13833                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13834                                       &this_range);
13835             }
13836             _invlist_union(cp_list, this_range, &cp_list);
13837             literal_endpoint = 0;
13838 #endif
13839         }
13840
13841         range = 0; /* this range (if it was one) is done now */
13842     } /* End of loop through all the text within the brackets */
13843
13844     /* If anything in the class expands to more than one character, we have to
13845      * deal with them by building up a substitute parse string, and recursively
13846      * calling reg() on it, instead of proceeding */
13847     if (multi_char_matches) {
13848         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13849         I32 cp_count;
13850         STRLEN len;
13851         char *save_end = RExC_end;
13852         char *save_parse = RExC_parse;
13853         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13854                                        a "|" */
13855         I32 reg_flags;
13856
13857         assert(! invert);
13858 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13859            because too confusing */
13860         if (invert) {
13861             sv_catpv(substitute_parse, "(?:");
13862         }
13863 #endif
13864
13865         /* Look at the longest folds first */
13866         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13867
13868             if (av_exists(multi_char_matches, cp_count)) {
13869                 AV** this_array_ptr;
13870                 SV* this_sequence;
13871
13872                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13873                                                  cp_count, FALSE);
13874                 while ((this_sequence = av_pop(*this_array_ptr)) !=
13875                                                                 &PL_sv_undef)
13876                 {
13877                     if (! first_time) {
13878                         sv_catpv(substitute_parse, "|");
13879                     }
13880                     first_time = FALSE;
13881
13882                     sv_catpv(substitute_parse, SvPVX(this_sequence));
13883                 }
13884             }
13885         }
13886
13887         /* If the character class contains anything else besides these
13888          * multi-character folds, have to include it in recursive parsing */
13889         if (element_count) {
13890             sv_catpv(substitute_parse, "|[");
13891             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13892             sv_catpv(substitute_parse, "]");
13893         }
13894
13895         sv_catpv(substitute_parse, ")");
13896 #if 0
13897         if (invert) {
13898             /* This is a way to get the parse to skip forward a whole named
13899              * sequence instead of matching the 2nd character when it fails the
13900              * first */
13901             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13902         }
13903 #endif
13904
13905         RExC_parse = SvPV(substitute_parse, len);
13906         RExC_end = RExC_parse + len;
13907         RExC_in_multi_char_class = 1;
13908         RExC_emit = (regnode *)orig_emit;
13909
13910         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13911
13912         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13913
13914         RExC_parse = save_parse;
13915         RExC_end = save_end;
13916         RExC_in_multi_char_class = 0;
13917         SvREFCNT_dec_NN(multi_char_matches);
13918         return ret;
13919     }
13920
13921     /* If the character class contains only a single element, it may be
13922      * optimizable into another node type which is smaller and runs faster.
13923      * Check if this is the case for this class */
13924     if ((element_count == 1 && ! ret_invlist)
13925         || UNLIKELY(posixl_matches_all))
13926     {
13927         U8 op = END;
13928         U8 arg = 0;
13929
13930         if (UNLIKELY(posixl_matches_all)) {
13931             op = SANY;
13932         }
13933         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
13934                                                    \w or [:digit:] or \p{foo}
13935                                                  */
13936
13937             /* All named classes are mapped into POSIXish nodes, with its FLAG
13938              * argument giving which class it is */
13939             switch ((I32)namedclass) {
13940                 case ANYOF_UNIPROP:
13941                     break;
13942
13943                 /* These don't depend on the charset modifiers.  They always
13944                  * match under /u rules */
13945                 case ANYOF_NHORIZWS:
13946                 case ANYOF_HORIZWS:
13947                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13948                     /* FALLTHROUGH */
13949
13950                 case ANYOF_NVERTWS:
13951                 case ANYOF_VERTWS:
13952                     op = POSIXU;
13953                     goto join_posix;
13954
13955                 /* The actual POSIXish node for all the rest depends on the
13956                  * charset modifier.  The ones in the first set depend only on
13957                  * ASCII or, if available on this platform, locale */
13958                 case ANYOF_ASCII:
13959                 case ANYOF_NASCII:
13960 #ifdef HAS_ISASCII
13961                     op = (LOC) ? POSIXL : POSIXA;
13962 #else
13963                     op = POSIXA;
13964 #endif
13965                     goto join_posix;
13966
13967                 case ANYOF_NCASED:
13968                 case ANYOF_LOWER:
13969                 case ANYOF_NLOWER:
13970                 case ANYOF_UPPER:
13971                 case ANYOF_NUPPER:
13972                     /* under /a could be alpha */
13973                     if (FOLD) {
13974                         if (ASCII_RESTRICTED) {
13975                             namedclass = ANYOF_ALPHA + (namedclass % 2);
13976                         }
13977                         else if (! LOC) {
13978                             break;
13979                         }
13980                     }
13981                     /* FALLTHROUGH */
13982
13983                 /* The rest have more possibilities depending on the charset.
13984                  * We take advantage of the enum ordering of the charset
13985                  * modifiers to get the exact node type, */
13986                 default:
13987                     op = POSIXD + get_regex_charset(RExC_flags);
13988                     if (op > POSIXA) { /* /aa is same as /a */
13989                         op = POSIXA;
13990                     }
13991 #ifndef HAS_ISBLANK
13992                     if (op == POSIXL
13993                         && (namedclass == ANYOF_BLANK
13994                             || namedclass == ANYOF_NBLANK))
13995                     {
13996                         op = POSIXA;
13997                     }
13998 #endif
13999
14000                 join_posix:
14001                     /* The odd numbered ones are the complements of the
14002                      * next-lower even number one */
14003                     if (namedclass % 2 == 1) {
14004                         invert = ! invert;
14005                         namedclass--;
14006                     }
14007                     arg = namedclass_to_classnum(namedclass);
14008                     break;
14009             }
14010         }
14011         else if (value == prevvalue) {
14012
14013             /* Here, the class consists of just a single code point */
14014
14015             if (invert) {
14016                 if (! LOC && value == '\n') {
14017                     op = REG_ANY; /* Optimize [^\n] */
14018                     *flagp |= HASWIDTH|SIMPLE;
14019                     RExC_naughty++;
14020                 }
14021             }
14022             else if (value < 256 || UTF) {
14023
14024                 /* Optimize a single value into an EXACTish node, but not if it
14025                  * would require converting the pattern to UTF-8. */
14026                 op = compute_EXACTish(pRExC_state);
14027             }
14028         } /* Otherwise is a range */
14029         else if (! LOC) {   /* locale could vary these */
14030             if (prevvalue == '0') {
14031                 if (value == '9') {
14032                     arg = _CC_DIGIT;
14033                     op = POSIXA;
14034                 }
14035             }
14036         }
14037
14038         /* Here, we have changed <op> away from its initial value iff we found
14039          * an optimization */
14040         if (op != END) {
14041
14042             /* Throw away this ANYOF regnode, and emit the calculated one,
14043              * which should correspond to the beginning, not current, state of
14044              * the parse */
14045             const char * cur_parse = RExC_parse;
14046             RExC_parse = (char *)orig_parse;
14047             if ( SIZE_ONLY) {
14048                 if (! LOC) {
14049
14050                     /* To get locale nodes to not use the full ANYOF size would
14051                      * require moving the code above that writes the portions
14052                      * of it that aren't in other nodes to after this point.
14053                      * e.g.  ANYOF_POSIXL_SET */
14054                     RExC_size = orig_size;
14055                 }
14056             }
14057             else {
14058                 RExC_emit = (regnode *)orig_emit;
14059                 if (PL_regkind[op] == POSIXD) {
14060                     if (invert) {
14061                         op += NPOSIXD - POSIXD;
14062                     }
14063                 }
14064             }
14065
14066             ret = reg_node(pRExC_state, op);
14067
14068             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14069                 if (! SIZE_ONLY) {
14070                     FLAGS(ret) = arg;
14071                 }
14072                 *flagp |= HASWIDTH|SIMPLE;
14073             }
14074             else if (PL_regkind[op] == EXACT) {
14075                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14076             }
14077
14078             RExC_parse = (char *) cur_parse;
14079
14080             SvREFCNT_dec(posixes);
14081             SvREFCNT_dec(cp_list);
14082             return ret;
14083         }
14084     }
14085
14086     if (SIZE_ONLY)
14087         return ret;
14088     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14089
14090     /* If folding, we calculate all characters that could fold to or from the
14091      * ones already on the list */
14092     if (FOLD && cp_list) {
14093         UV start, end;  /* End points of code point ranges */
14094
14095         SV* fold_intersection = NULL;
14096
14097         /* If the highest code point is within Latin1, we can use the
14098          * compiled-in Alphas list, and not have to go out to disk.  This
14099          * yields two false positives, the masculine and feminine ordinal
14100          * indicators, which are weeded out below using the
14101          * IS_IN_SOME_FOLD_L1() macro */
14102         if (invlist_highest(cp_list) < 256) {
14103             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
14104                                                            &fold_intersection);
14105         }
14106         else {
14107
14108             /* Here, there are non-Latin1 code points, so we will have to go
14109              * fetch the list of all the characters that participate in folds
14110              */
14111             if (! PL_utf8_foldable) {
14112                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14113                                        &PL_sv_undef, 1, 0);
14114                 PL_utf8_foldable = _get_swash_invlist(swash);
14115                 SvREFCNT_dec_NN(swash);
14116             }
14117
14118             /* This is a hash that for a particular fold gives all characters
14119              * that are involved in it */
14120             if (! PL_utf8_foldclosures) {
14121
14122                 /* If we were unable to find any folds, then we likely won't be
14123                  * able to find the closures.  So just create an empty list.
14124                  * Folding will effectively be restricted to the non-Unicode
14125                  * rules hard-coded into Perl.  (This case happens legitimately
14126                  * during compilation of Perl itself before the Unicode tables
14127                  * are generated) */
14128                 if (_invlist_len(PL_utf8_foldable) == 0) {
14129                     PL_utf8_foldclosures = newHV();
14130                 }
14131                 else {
14132                     /* If the folds haven't been read in, call a fold function
14133                      * to force that */
14134                     if (! PL_utf8_tofold) {
14135                         U8 dummy[UTF8_MAXBYTES_CASE+1];
14136
14137                         /* This string is just a short named one above \xff */
14138                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14139                         assert(PL_utf8_tofold); /* Verify that worked */
14140                     }
14141                     PL_utf8_foldclosures =
14142                                     _swash_inversion_hash(PL_utf8_tofold);
14143                 }
14144             }
14145
14146             /* Only the characters in this class that participate in folds need
14147              * be checked.  Get the intersection of this class and all the
14148              * possible characters that are foldable.  This can quickly narrow
14149              * down a large class */
14150             _invlist_intersection(PL_utf8_foldable, cp_list,
14151                                   &fold_intersection);
14152         }
14153
14154         /* Now look at the foldable characters in this class individually */
14155         invlist_iterinit(fold_intersection);
14156         while (invlist_iternext(fold_intersection, &start, &end)) {
14157             UV j;
14158
14159             /* Locale folding for Latin1 characters is deferred until runtime */
14160             if (LOC && start < 256) {
14161                 start = 256;
14162             }
14163
14164             /* Look at every character in the range */
14165             for (j = start; j <= end; j++) {
14166
14167                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14168                 STRLEN foldlen;
14169                 SV** listp;
14170
14171                 if (j < 256) {
14172
14173                     /* We have the latin1 folding rules hard-coded here so that
14174                      * an innocent-looking character class, like /[ks]/i won't
14175                      * have to go out to disk to find the possible matches.
14176                      * XXX It would be better to generate these via regen, in
14177                      * case a new version of the Unicode standard adds new
14178                      * mappings, though that is not really likely, and may be
14179                      * caught by the default: case of the switch below. */
14180
14181                     if (IS_IN_SOME_FOLD_L1(j)) {
14182
14183                         /* ASCII is always matched; non-ASCII is matched only
14184                          * under Unicode rules */
14185                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
14186                             cp_list =
14187                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
14188                         }
14189                         else {
14190                             depends_list =
14191                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
14192                         }
14193                     }
14194
14195                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14196                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14197                     {
14198                         /* Certain Latin1 characters have matches outside
14199                          * Latin1.  To get here, <j> is one of those
14200                          * characters.   None of these matches is valid for
14201                          * ASCII characters under /aa, which is why the 'if'
14202                          * just above excludes those.  These matches only
14203                          * happen when the target string is utf8.  The code
14204                          * below adds the single fold closures for <j> to the
14205                          * inversion list. */
14206                         switch (j) {
14207                             case 'k':
14208                             case 'K':
14209                                 cp_list =
14210                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
14211                                 break;
14212                             case 's':
14213                             case 'S':
14214                                 cp_list = add_cp_to_invlist(cp_list,
14215                                                     LATIN_SMALL_LETTER_LONG_S);
14216                                 break;
14217                             case MICRO_SIGN:
14218                                 cp_list = add_cp_to_invlist(cp_list,
14219                                                     GREEK_CAPITAL_LETTER_MU);
14220                                 cp_list = add_cp_to_invlist(cp_list,
14221                                                     GREEK_SMALL_LETTER_MU);
14222                                 break;
14223                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14224                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14225                                 cp_list =
14226                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
14227                                 break;
14228                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14229                                 cp_list = add_cp_to_invlist(cp_list,
14230                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14231                                 break;
14232                             case LATIN_SMALL_LETTER_SHARP_S:
14233                                 cp_list = add_cp_to_invlist(cp_list,
14234                                                 LATIN_CAPITAL_LETTER_SHARP_S);
14235                                 break;
14236                             case 'F': case 'f':
14237                             case 'I': case 'i':
14238                             case 'L': case 'l':
14239                             case 'T': case 't':
14240                             case 'A': case 'a':
14241                             case 'H': case 'h':
14242                             case 'J': case 'j':
14243                             case 'N': case 'n':
14244                             case 'W': case 'w':
14245                             case 'Y': case 'y':
14246                                 /* These all are targets of multi-character
14247                                  * folds from code points that require UTF8 to
14248                                  * express, so they can't match unless the
14249                                  * target string is in UTF-8, so no action here
14250                                  * is necessary, as regexec.c properly handles
14251                                  * the general case for UTF-8 matching and
14252                                  * multi-char folds */
14253                                 break;
14254                             default:
14255                                 /* Use deprecated warning to increase the
14256                                  * chances of this being output */
14257                                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14258                                 break;
14259                         }
14260                     }
14261                     continue;
14262                 }
14263
14264                 /* Here is an above Latin1 character.  We don't have the rules
14265                  * hard-coded for it.  First, get its fold.  This is the simple
14266                  * fold, as the multi-character folds have been handled earlier
14267                  * and separated out */
14268                 _to_uni_fold_flags(j, foldbuf, &foldlen,
14269                                                ((LOC)
14270                                                ? FOLD_FLAGS_LOCALE
14271                                                : (ASCII_FOLD_RESTRICTED)
14272                                                   ? FOLD_FLAGS_NOMIX_ASCII
14273                                                   : 0));
14274
14275                 /* Single character fold of above Latin1.  Add everything in
14276                  * its fold closure to the list that this node should match.
14277                  * The fold closures data structure is a hash with the keys
14278                  * being the UTF-8 of every character that is folded to, like
14279                  * 'k', and the values each an array of all code points that
14280                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14281                  * Multi-character folds are not included */
14282                 if ((listp = hv_fetch(PL_utf8_foldclosures,
14283                                       (char *) foldbuf, foldlen, FALSE)))
14284                 {
14285                     AV* list = (AV*) *listp;
14286                     IV k;
14287                     for (k = 0; k <= av_len(list); k++) {
14288                         SV** c_p = av_fetch(list, k, FALSE);
14289                         UV c;
14290                         if (c_p == NULL) {
14291                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14292                         }
14293                         c = SvUV(*c_p);
14294
14295                         /* /aa doesn't allow folds between ASCII and non-; /l
14296                          * doesn't allow them between above and below 256 */
14297                         if ((ASCII_FOLD_RESTRICTED
14298                                   && (isASCII(c) != isASCII(j)))
14299                             || (LOC && c < 256)) {
14300                             continue;
14301                         }
14302
14303                         /* Folds involving non-ascii Latin1 characters
14304                          * under /d are added to a separate list */
14305                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14306                         {
14307                             cp_list = add_cp_to_invlist(cp_list, c);
14308                         }
14309                         else {
14310                           depends_list = add_cp_to_invlist(depends_list, c);
14311                         }
14312                     }
14313                 }
14314             }
14315         }
14316         SvREFCNT_dec_NN(fold_intersection);
14317     }
14318
14319     /* And combine the result (if any) with any inversion list from posix
14320      * classes.  The lists are kept separate up to now because we don't want to
14321      * fold the classes (folding of those is automatically handled by the swash
14322      * fetching code) */
14323     if (posixes) {
14324         if (! DEPENDS_SEMANTICS) {
14325             if (cp_list) {
14326                 _invlist_union(cp_list, posixes, &cp_list);
14327                 SvREFCNT_dec_NN(posixes);
14328             }
14329             else {
14330                 cp_list = posixes;
14331             }
14332         }
14333         else {
14334             /* Under /d, we put into a separate list the Latin1 things that
14335              * match only when the target string is utf8 */
14336             SV* nonascii_but_latin1_properties = NULL;
14337             _invlist_intersection(posixes, PL_UpperLatin1,
14338                                   &nonascii_but_latin1_properties);
14339             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14340                               &posixes);
14341             if (cp_list) {
14342                 _invlist_union(cp_list, posixes, &cp_list);
14343                 SvREFCNT_dec_NN(posixes);
14344             }
14345             else {
14346                 cp_list = posixes;
14347             }
14348
14349             if (depends_list) {
14350                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14351                                &depends_list);
14352                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14353             }
14354             else {
14355                 depends_list = nonascii_but_latin1_properties;
14356             }
14357         }
14358     }
14359
14360     /* And combine the result (if any) with any inversion list from properties.
14361      * The lists are kept separate up to now so that we can distinguish the two
14362      * in regards to matching above-Unicode.  A run-time warning is generated
14363      * if a Unicode property is matched against a non-Unicode code point. But,
14364      * we allow user-defined properties to match anything, without any warning,
14365      * and we also suppress the warning if there is a portion of the character
14366      * class that isn't a Unicode property, and which matches above Unicode, \W
14367      * or [\x{110000}] for example.
14368      * (Note that in this case, unlike the Posix one above, there is no
14369      * <depends_list>, because having a Unicode property forces Unicode
14370      * semantics */
14371     if (properties) {
14372         bool warn_super = ! has_user_defined_property;
14373         if (cp_list) {
14374
14375             /* If it matters to the final outcome, see if a non-property
14376              * component of the class matches above Unicode.  If so, the
14377              * warning gets suppressed.  This is true even if just a single
14378              * such code point is specified, as though not strictly correct if
14379              * another such code point is matched against, the fact that they
14380              * are using above-Unicode code points indicates they should know
14381              * the issues involved */
14382             if (warn_super) {
14383                 bool non_prop_matches_above_Unicode =
14384                             runtime_posix_matches_above_Unicode
14385                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
14386                 if (invert) {
14387                     non_prop_matches_above_Unicode =
14388                                             !  non_prop_matches_above_Unicode;
14389                 }
14390                 warn_super = ! non_prop_matches_above_Unicode;
14391             }
14392
14393             _invlist_union(properties, cp_list, &cp_list);
14394             SvREFCNT_dec_NN(properties);
14395         }
14396         else {
14397             cp_list = properties;
14398         }
14399
14400         if (warn_super) {
14401             OP(ret) = ANYOF_WARN_SUPER;
14402         }
14403     }
14404
14405     /* Here, we have calculated what code points should be in the character
14406      * class.
14407      *
14408      * Now we can see about various optimizations.  Fold calculation (which we
14409      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14410      * would invert to include K, which under /i would match k, which it
14411      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14412      * folded until runtime */
14413
14414     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14415      * at compile time.  Besides not inverting folded locale now, we can't
14416      * invert if there are things such as \w, which aren't known until runtime
14417      * */
14418     if (invert
14419         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
14420         && ! depends_list
14421         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14422     {
14423         _invlist_invert(cp_list);
14424
14425         /* Any swash can't be used as-is, because we've inverted things */
14426         if (swash) {
14427             SvREFCNT_dec_NN(swash);
14428             swash = NULL;
14429         }
14430
14431         /* Clear the invert flag since have just done it here */
14432         invert = FALSE;
14433     }
14434
14435     if (ret_invlist) {
14436         *ret_invlist = cp_list;
14437         SvREFCNT_dec(swash);
14438
14439         /* Discard the generated node */
14440         if (SIZE_ONLY) {
14441             RExC_size = orig_size;
14442         }
14443         else {
14444             RExC_emit = orig_emit;
14445         }
14446         return orig_emit;
14447     }
14448
14449     /* If we didn't do folding, it's because some information isn't available
14450      * until runtime; set the run-time fold flag for these.  (We don't have to
14451      * worry about properties folding, as that is taken care of by the swash
14452      * fetching) */
14453     if (FOLD && LOC)
14454     {
14455        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14456     }
14457
14458     /* Some character classes are equivalent to other nodes.  Such nodes take
14459      * up less room and generally fewer operations to execute than ANYOF nodes.
14460      * Above, we checked for and optimized into some such equivalents for
14461      * certain common classes that are easy to test.  Getting to this point in
14462      * the code means that the class didn't get optimized there.  Since this
14463      * code is only executed in Pass 2, it is too late to save space--it has
14464      * been allocated in Pass 1, and currently isn't given back.  But turning
14465      * things into an EXACTish node can allow the optimizer to join it to any
14466      * adjacent such nodes.  And if the class is equivalent to things like /./,
14467      * expensive run-time swashes can be avoided.  Now that we have more
14468      * complete information, we can find things necessarily missed by the
14469      * earlier code.  I (khw) am not sure how much to look for here.  It would
14470      * be easy, but perhaps too slow, to check any candidates against all the
14471      * node types they could possibly match using _invlistEQ(). */
14472
14473     if (cp_list
14474         && ! invert
14475         && ! depends_list
14476         && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
14477         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14478     {
14479         UV start, end;
14480         U8 op = END;  /* The optimzation node-type */
14481         const char * cur_parse= RExC_parse;
14482
14483         invlist_iterinit(cp_list);
14484         if (! invlist_iternext(cp_list, &start, &end)) {
14485
14486             /* Here, the list is empty.  This happens, for example, when a
14487              * Unicode property is the only thing in the character class, and
14488              * it doesn't match anything.  (perluniprops.pod notes such
14489              * properties) */
14490             op = OPFAIL;
14491             *flagp |= HASWIDTH|SIMPLE;
14492         }
14493         else if (start == end) {    /* The range is a single code point */
14494             if (! invlist_iternext(cp_list, &start, &end)
14495
14496                     /* Don't do this optimization if it would require changing
14497                      * the pattern to UTF-8 */
14498                 && (start < 256 || UTF))
14499             {
14500                 /* Here, the list contains a single code point.  Can optimize
14501                  * into an EXACT node */
14502
14503                 value = start;
14504
14505                 if (! FOLD) {
14506                     op = EXACT;
14507                 }
14508                 else if (LOC) {
14509
14510                     /* A locale node under folding with one code point can be
14511                      * an EXACTFL, as its fold won't be calculated until
14512                      * runtime */
14513                     op = EXACTFL;
14514                 }
14515                 else {
14516
14517                     /* Here, we are generally folding, but there is only one
14518                      * code point to match.  If we have to, we use an EXACT
14519                      * node, but it would be better for joining with adjacent
14520                      * nodes in the optimization pass if we used the same
14521                      * EXACTFish node that any such are likely to be.  We can
14522                      * do this iff the code point doesn't participate in any
14523                      * folds.  For example, an EXACTF of a colon is the same as
14524                      * an EXACT one, since nothing folds to or from a colon. */
14525                     if (value < 256) {
14526                         if (IS_IN_SOME_FOLD_L1(value)) {
14527                             op = EXACT;
14528                         }
14529                     }
14530                     else {
14531                         if (! PL_utf8_foldable) {
14532                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14533                                                 &PL_sv_undef, 1, 0);
14534                             PL_utf8_foldable = _get_swash_invlist(swash);
14535                             SvREFCNT_dec_NN(swash);
14536                         }
14537                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14538                             op = EXACT;
14539                         }
14540                     }
14541
14542                     /* If we haven't found the node type, above, it means we
14543                      * can use the prevailing one */
14544                     if (op == END) {
14545                         op = compute_EXACTish(pRExC_state);
14546                     }
14547                 }
14548             }
14549         }
14550         else if (start == 0) {
14551             if (end == UV_MAX) {
14552                 op = SANY;
14553                 *flagp |= HASWIDTH|SIMPLE;
14554                 RExC_naughty++;
14555             }
14556             else if (end == '\n' - 1
14557                     && invlist_iternext(cp_list, &start, &end)
14558                     && start == '\n' + 1 && end == UV_MAX)
14559             {
14560                 op = REG_ANY;
14561                 *flagp |= HASWIDTH|SIMPLE;
14562                 RExC_naughty++;
14563             }
14564         }
14565         invlist_iterfinish(cp_list);
14566
14567         if (op != END) {
14568             RExC_parse = (char *)orig_parse;
14569             RExC_emit = (regnode *)orig_emit;
14570
14571             ret = reg_node(pRExC_state, op);
14572
14573             RExC_parse = (char *)cur_parse;
14574
14575             if (PL_regkind[op] == EXACT) {
14576                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14577             }
14578
14579             SvREFCNT_dec_NN(cp_list);
14580             return ret;
14581         }
14582     }
14583
14584     /* Here, <cp_list> contains all the code points we can determine at
14585      * compile time that match under all conditions.  Go through it, and
14586      * for things that belong in the bitmap, put them there, and delete from
14587      * <cp_list>.  While we are at it, see if everything above 255 is in the
14588      * list, and if so, set a flag to speed up execution */
14589
14590     populate_ANYOF_from_invlist(ret, &cp_list);
14591
14592     if (invert) {
14593         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14594     }
14595
14596     /* Here, the bitmap has been populated with all the Latin1 code points that
14597      * always match.  Can now add to the overall list those that match only
14598      * when the target string is UTF-8 (<depends_list>). */
14599     if (depends_list) {
14600         if (cp_list) {
14601             _invlist_union(cp_list, depends_list, &cp_list);
14602             SvREFCNT_dec_NN(depends_list);
14603         }
14604         else {
14605             cp_list = depends_list;
14606         }
14607     }
14608
14609     /* If there is a swash and more than one element, we can't use the swash in
14610      * the optimization below. */
14611     if (swash && element_count > 1) {
14612         SvREFCNT_dec_NN(swash);
14613         swash = NULL;
14614     }
14615
14616     set_ANYOF_arg(pRExC_state, ret, cp_list,
14617                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14618                    ? listsv : NULL,
14619                   swash, has_user_defined_property);
14620
14621     *flagp |= HASWIDTH|SIMPLE;
14622     return ret;
14623 }
14624
14625 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14626
14627 STATIC void
14628 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14629                 regnode* const node,
14630                 SV* const cp_list,
14631                 SV* const runtime_defns,
14632                 SV* const swash,
14633                 const bool has_user_defined_property)
14634 {
14635     /* Sets the arg field of an ANYOF-type node 'node', using information about
14636      * the node passed-in.  If there is nothing outside the node's bitmap, the
14637      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14638      * the count returned by add_data(), having allocated and stored an array,
14639      * av, that that count references, as follows:
14640      *  av[0] stores the character class description in its textual form.
14641      *        This is used later (regexec.c:Perl_regclass_swash()) to
14642      *        initialize the appropriate swash, and is also useful for dumping
14643      *        the regnode.  This is set to &PL_sv_undef if the textual
14644      *        description is not needed at run-time (as happens if the other
14645      *        elements completely define the class)
14646      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14647      *        computed from av[0].  But if no further computation need be done,
14648      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14649      *  av[2] stores the cp_list inversion list for use in addition or instead
14650      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14651      *        (Otherwise everything needed is already in av[0] and av[1])
14652      *  av[3] is set if any component of the class is from a user-defined
14653      *        property; used only if av[2] exists */
14654
14655     UV n;
14656
14657     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14658
14659     if (! cp_list && ! runtime_defns) {
14660         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14661     }
14662     else {
14663         AV * const av = newAV();
14664         SV *rv;
14665
14666         av_store(av, 0, (runtime_defns)
14667                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14668         if (swash) {
14669             av_store(av, 1, swash);
14670             SvREFCNT_dec_NN(cp_list);
14671         }
14672         else {
14673             av_store(av, 1, &PL_sv_undef);
14674             if (cp_list) {
14675                 av_store(av, 2, cp_list);
14676                 av_store(av, 3, newSVuv(has_user_defined_property));
14677             }
14678         }
14679
14680         rv = newRV_noinc(MUTABLE_SV(av));
14681         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14682         RExC_rxi->data->data[n] = (void*)rv;
14683         ARG_SET(node, n);
14684     }
14685 }
14686
14687
14688 /* reg_skipcomment()
14689
14690    Absorbs an /x style # comments from the input stream.
14691    Returns true if there is more text remaining in the stream.
14692    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14693    terminates the pattern without including a newline.
14694
14695    Note its the callers responsibility to ensure that we are
14696    actually in /x mode
14697
14698 */
14699
14700 STATIC bool
14701 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14702 {
14703     bool ended = 0;
14704
14705     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14706
14707     while (RExC_parse < RExC_end)
14708         if (*RExC_parse++ == '\n') {
14709             ended = 1;
14710             break;
14711         }
14712     if (!ended) {
14713         /* we ran off the end of the pattern without ending
14714            the comment, so we have to add an \n when wrapping */
14715         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14716         return 0;
14717     } else
14718         return 1;
14719 }
14720
14721 /* nextchar()
14722
14723    Advances the parse position, and optionally absorbs
14724    "whitespace" from the inputstream.
14725
14726    Without /x "whitespace" means (?#...) style comments only,
14727    with /x this means (?#...) and # comments and whitespace proper.
14728
14729    Returns the RExC_parse point from BEFORE the scan occurs.
14730
14731    This is the /x friendly way of saying RExC_parse++.
14732 */
14733
14734 STATIC char*
14735 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14736 {
14737     char* const retval = RExC_parse++;
14738
14739     PERL_ARGS_ASSERT_NEXTCHAR;
14740
14741     for (;;) {
14742         if (RExC_end - RExC_parse >= 3
14743             && *RExC_parse == '('
14744             && RExC_parse[1] == '?'
14745             && RExC_parse[2] == '#')
14746         {
14747             while (*RExC_parse != ')') {
14748                 if (RExC_parse == RExC_end)
14749                     FAIL("Sequence (?#... not terminated");
14750                 RExC_parse++;
14751             }
14752             RExC_parse++;
14753             continue;
14754         }
14755         if (RExC_flags & RXf_PMf_EXTENDED) {
14756             if (isSPACE(*RExC_parse)) {
14757                 RExC_parse++;
14758                 continue;
14759             }
14760             else if (*RExC_parse == '#') {
14761                 if ( reg_skipcomment( pRExC_state ) )
14762                     continue;
14763             }
14764         }
14765         return retval;
14766     }
14767 }
14768
14769 /*
14770 - reg_node - emit a node
14771 */
14772 STATIC regnode *                        /* Location. */
14773 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14774 {
14775     dVAR;
14776     regnode *ptr;
14777     regnode * const ret = RExC_emit;
14778     GET_RE_DEBUG_FLAGS_DECL;
14779
14780     PERL_ARGS_ASSERT_REG_NODE;
14781
14782     if (SIZE_ONLY) {
14783         SIZE_ALIGN(RExC_size);
14784         RExC_size += 1;
14785         return(ret);
14786     }
14787     if (RExC_emit >= RExC_emit_bound)
14788         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14789                    op, RExC_emit, RExC_emit_bound);
14790
14791     NODE_ALIGN_FILL(ret);
14792     ptr = ret;
14793     FILL_ADVANCE_NODE(ptr, op);
14794 #ifdef RE_TRACK_PATTERN_OFFSETS
14795     if (RExC_offsets) {         /* MJD */
14796         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14797               "reg_node", __LINE__,
14798               PL_reg_name[op],
14799               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14800                 ? "Overwriting end of array!\n" : "OK",
14801               (UV)(RExC_emit - RExC_emit_start),
14802               (UV)(RExC_parse - RExC_start),
14803               (UV)RExC_offsets[0]));
14804         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14805     }
14806 #endif
14807     RExC_emit = ptr;
14808     return(ret);
14809 }
14810
14811 /*
14812 - reganode - emit a node with an argument
14813 */
14814 STATIC regnode *                        /* Location. */
14815 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14816 {
14817     dVAR;
14818     regnode *ptr;
14819     regnode * const ret = RExC_emit;
14820     GET_RE_DEBUG_FLAGS_DECL;
14821
14822     PERL_ARGS_ASSERT_REGANODE;
14823
14824     if (SIZE_ONLY) {
14825         SIZE_ALIGN(RExC_size);
14826         RExC_size += 2;
14827         /*
14828            We can't do this:
14829
14830            assert(2==regarglen[op]+1);
14831
14832            Anything larger than this has to allocate the extra amount.
14833            If we changed this to be:
14834
14835            RExC_size += (1 + regarglen[op]);
14836
14837            then it wouldn't matter. Its not clear what side effect
14838            might come from that so its not done so far.
14839            -- dmq
14840         */
14841         return(ret);
14842     }
14843     if (RExC_emit >= RExC_emit_bound)
14844         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14845                    op, RExC_emit, RExC_emit_bound);
14846
14847     NODE_ALIGN_FILL(ret);
14848     ptr = ret;
14849     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14850 #ifdef RE_TRACK_PATTERN_OFFSETS
14851     if (RExC_offsets) {         /* MJD */
14852         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14853               "reganode",
14854               __LINE__,
14855               PL_reg_name[op],
14856               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14857               "Overwriting end of array!\n" : "OK",
14858               (UV)(RExC_emit - RExC_emit_start),
14859               (UV)(RExC_parse - RExC_start),
14860               (UV)RExC_offsets[0]));
14861         Set_Cur_Node_Offset;
14862     }
14863 #endif
14864     RExC_emit = ptr;
14865     return(ret);
14866 }
14867
14868 /*
14869 - reguni - emit (if appropriate) a Unicode character
14870 */
14871 STATIC STRLEN
14872 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14873 {
14874     dVAR;
14875
14876     PERL_ARGS_ASSERT_REGUNI;
14877
14878     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14879 }
14880
14881 /*
14882 - reginsert - insert an operator in front of already-emitted operand
14883 *
14884 * Means relocating the operand.
14885 */
14886 STATIC void
14887 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14888 {
14889     dVAR;
14890     regnode *src;
14891     regnode *dst;
14892     regnode *place;
14893     const int offset = regarglen[(U8)op];
14894     const int size = NODE_STEP_REGNODE + offset;
14895     GET_RE_DEBUG_FLAGS_DECL;
14896
14897     PERL_ARGS_ASSERT_REGINSERT;
14898     PERL_UNUSED_ARG(depth);
14899 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14900     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14901     if (SIZE_ONLY) {
14902         RExC_size += size;
14903         return;
14904     }
14905
14906     src = RExC_emit;
14907     RExC_emit += size;
14908     dst = RExC_emit;
14909     if (RExC_open_parens) {
14910         int paren;
14911         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14912         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14913             if ( RExC_open_parens[paren] >= opnd ) {
14914                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14915                 RExC_open_parens[paren] += size;
14916             } else {
14917                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14918             }
14919             if ( RExC_close_parens[paren] >= opnd ) {
14920                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14921                 RExC_close_parens[paren] += size;
14922             } else {
14923                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14924             }
14925         }
14926     }
14927
14928     while (src > opnd) {
14929         StructCopy(--src, --dst, regnode);
14930 #ifdef RE_TRACK_PATTERN_OFFSETS
14931         if (RExC_offsets) {     /* MJD 20010112 */
14932             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14933                   "reg_insert",
14934                   __LINE__,
14935                   PL_reg_name[op],
14936                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14937                     ? "Overwriting end of array!\n" : "OK",
14938                   (UV)(src - RExC_emit_start),
14939                   (UV)(dst - RExC_emit_start),
14940                   (UV)RExC_offsets[0]));
14941             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14942             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14943         }
14944 #endif
14945     }
14946
14947
14948     place = opnd;               /* Op node, where operand used to be. */
14949 #ifdef RE_TRACK_PATTERN_OFFSETS
14950     if (RExC_offsets) {         /* MJD */
14951         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14952               "reginsert",
14953               __LINE__,
14954               PL_reg_name[op],
14955               (UV)(place - RExC_emit_start) > RExC_offsets[0]
14956               ? "Overwriting end of array!\n" : "OK",
14957               (UV)(place - RExC_emit_start),
14958               (UV)(RExC_parse - RExC_start),
14959               (UV)RExC_offsets[0]));
14960         Set_Node_Offset(place, RExC_parse);
14961         Set_Node_Length(place, 1);
14962     }
14963 #endif
14964     src = NEXTOPER(place);
14965     FILL_ADVANCE_NODE(place, op);
14966     Zero(src, offset, regnode);
14967 }
14968
14969 /*
14970 - regtail - set the next-pointer at the end of a node chain of p to val.
14971 - SEE ALSO: regtail_study
14972 */
14973 /* TODO: All three parms should be const */
14974 STATIC void
14975 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14976 {
14977     dVAR;
14978     regnode *scan;
14979     GET_RE_DEBUG_FLAGS_DECL;
14980
14981     PERL_ARGS_ASSERT_REGTAIL;
14982 #ifndef DEBUGGING
14983     PERL_UNUSED_ARG(depth);
14984 #endif
14985
14986     if (SIZE_ONLY)
14987         return;
14988
14989     /* Find last node. */
14990     scan = p;
14991     for (;;) {
14992         regnode * const temp = regnext(scan);
14993         DEBUG_PARSE_r({
14994             SV * const mysv=sv_newmortal();
14995             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14996             regprop(RExC_rx, mysv, scan);
14997             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14998                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14999                     (temp == NULL ? "->" : ""),
15000                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15001             );
15002         });
15003         if (temp == NULL)
15004             break;
15005         scan = temp;
15006     }
15007
15008     if (reg_off_by_arg[OP(scan)]) {
15009         ARG_SET(scan, val - scan);
15010     }
15011     else {
15012         NEXT_OFF(scan) = val - scan;
15013     }
15014 }
15015
15016 #ifdef DEBUGGING
15017 /*
15018 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15019 - Look for optimizable sequences at the same time.
15020 - currently only looks for EXACT chains.
15021
15022 This is experimental code. The idea is to use this routine to perform
15023 in place optimizations on branches and groups as they are constructed,
15024 with the long term intention of removing optimization from study_chunk so
15025 that it is purely analytical.
15026
15027 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15028 to control which is which.
15029
15030 */
15031 /* TODO: All four parms should be const */
15032
15033 STATIC U8
15034 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
15035 {
15036     dVAR;
15037     regnode *scan;
15038     U8 exact = PSEUDO;
15039 #ifdef EXPERIMENTAL_INPLACESCAN
15040     I32 min = 0;
15041 #endif
15042     GET_RE_DEBUG_FLAGS_DECL;
15043
15044     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15045
15046
15047     if (SIZE_ONLY)
15048         return exact;
15049
15050     /* Find last node. */
15051
15052     scan = p;
15053     for (;;) {
15054         regnode * const temp = regnext(scan);
15055 #ifdef EXPERIMENTAL_INPLACESCAN
15056         if (PL_regkind[OP(scan)] == EXACT) {
15057             bool has_exactf_sharp_s;    /* Unexamined in this routine */
15058             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
15059                 return EXACT;
15060         }
15061 #endif
15062         if ( exact ) {
15063             switch (OP(scan)) {
15064                 case EXACT:
15065                 case EXACTF:
15066                 case EXACTFA_NO_TRIE:
15067                 case EXACTFA:
15068                 case EXACTFU:
15069                 case EXACTFU_SS:
15070                 case EXACTFL:
15071                         if( exact == PSEUDO )
15072                             exact= OP(scan);
15073                         else if ( exact != OP(scan) )
15074                             exact= 0;
15075                 case NOTHING:
15076                     break;
15077                 default:
15078                     exact= 0;
15079             }
15080         }
15081         DEBUG_PARSE_r({
15082             SV * const mysv=sv_newmortal();
15083             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15084             regprop(RExC_rx, mysv, scan);
15085             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15086                 SvPV_nolen_const(mysv),
15087                 REG_NODE_NUM(scan),
15088                 PL_reg_name[exact]);
15089         });
15090         if (temp == NULL)
15091             break;
15092         scan = temp;
15093     }
15094     DEBUG_PARSE_r({
15095         SV * const mysv_val=sv_newmortal();
15096         DEBUG_PARSE_MSG("");
15097         regprop(RExC_rx, mysv_val, val);
15098         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15099                       SvPV_nolen_const(mysv_val),
15100                       (IV)REG_NODE_NUM(val),
15101                       (IV)(val - scan)
15102         );
15103     });
15104     if (reg_off_by_arg[OP(scan)]) {
15105         ARG_SET(scan, val - scan);
15106     }
15107     else {
15108         NEXT_OFF(scan) = val - scan;
15109     }
15110
15111     return exact;
15112 }
15113 #endif
15114
15115 /*
15116  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15117  */
15118 #ifdef DEBUGGING
15119
15120 static void
15121 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15122 {
15123     int bit;
15124     int set=0;
15125
15126     for (bit=0; bit<32; bit++) {
15127         if (flags & (1<<bit)) {
15128             if (!set++ && lead)
15129                 PerlIO_printf(Perl_debug_log, "%s",lead);
15130             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15131         }
15132     }
15133     if (lead)  {
15134         if (set)
15135             PerlIO_printf(Perl_debug_log, "\n");
15136         else
15137             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15138     }
15139 }
15140
15141 static void
15142 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15143 {
15144     int bit;
15145     int set=0;
15146     regex_charset cs;
15147
15148     for (bit=0; bit<32; bit++) {
15149         if (flags & (1<<bit)) {
15150             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15151                 continue;
15152             }
15153             if (!set++ && lead)
15154                 PerlIO_printf(Perl_debug_log, "%s",lead);
15155             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15156         }
15157     }
15158     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15159             if (!set++ && lead) {
15160                 PerlIO_printf(Perl_debug_log, "%s",lead);
15161             }
15162             switch (cs) {
15163                 case REGEX_UNICODE_CHARSET:
15164                     PerlIO_printf(Perl_debug_log, "UNICODE");
15165                     break;
15166                 case REGEX_LOCALE_CHARSET:
15167                     PerlIO_printf(Perl_debug_log, "LOCALE");
15168                     break;
15169                 case REGEX_ASCII_RESTRICTED_CHARSET:
15170                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15171                     break;
15172                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15173                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15174                     break;
15175                 default:
15176                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15177                     break;
15178             }
15179     }
15180     if (lead)  {
15181         if (set)
15182             PerlIO_printf(Perl_debug_log, "\n");
15183         else
15184             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15185     }
15186 }
15187 #endif
15188
15189 void
15190 Perl_regdump(pTHX_ const regexp *r)
15191 {
15192 #ifdef DEBUGGING
15193     dVAR;
15194     SV * const sv = sv_newmortal();
15195     SV *dsv= sv_newmortal();
15196     RXi_GET_DECL(r,ri);
15197     GET_RE_DEBUG_FLAGS_DECL;
15198
15199     PERL_ARGS_ASSERT_REGDUMP;
15200
15201     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15202
15203     /* Header fields of interest. */
15204     if (r->anchored_substr) {
15205         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15206             RE_SV_DUMPLEN(r->anchored_substr), 30);
15207         PerlIO_printf(Perl_debug_log,
15208                       "anchored %s%s at %"IVdf" ",
15209                       s, RE_SV_TAIL(r->anchored_substr),
15210                       (IV)r->anchored_offset);
15211     } else if (r->anchored_utf8) {
15212         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15213             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15214         PerlIO_printf(Perl_debug_log,
15215                       "anchored utf8 %s%s at %"IVdf" ",
15216                       s, RE_SV_TAIL(r->anchored_utf8),
15217                       (IV)r->anchored_offset);
15218     }
15219     if (r->float_substr) {
15220         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15221             RE_SV_DUMPLEN(r->float_substr), 30);
15222         PerlIO_printf(Perl_debug_log,
15223                       "floating %s%s at %"IVdf"..%"UVuf" ",
15224                       s, RE_SV_TAIL(r->float_substr),
15225                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15226     } else if (r->float_utf8) {
15227         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15228             RE_SV_DUMPLEN(r->float_utf8), 30);
15229         PerlIO_printf(Perl_debug_log,
15230                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15231                       s, RE_SV_TAIL(r->float_utf8),
15232                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15233     }
15234     if (r->check_substr || r->check_utf8)
15235         PerlIO_printf(Perl_debug_log,
15236                       (const char *)
15237                       (r->check_substr == r->float_substr
15238                        && r->check_utf8 == r->float_utf8
15239                        ? "(checking floating" : "(checking anchored"));
15240     if (r->extflags & RXf_NOSCAN)
15241         PerlIO_printf(Perl_debug_log, " noscan");
15242     if (r->extflags & RXf_CHECK_ALL)
15243         PerlIO_printf(Perl_debug_log, " isall");
15244     if (r->check_substr || r->check_utf8)
15245         PerlIO_printf(Perl_debug_log, ") ");
15246
15247     if (ri->regstclass) {
15248         regprop(r, sv, ri->regstclass);
15249         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15250     }
15251     if (r->extflags & RXf_ANCH) {
15252         PerlIO_printf(Perl_debug_log, "anchored");
15253         if (r->extflags & RXf_ANCH_BOL)
15254             PerlIO_printf(Perl_debug_log, "(BOL)");
15255         if (r->extflags & RXf_ANCH_MBOL)
15256             PerlIO_printf(Perl_debug_log, "(MBOL)");
15257         if (r->extflags & RXf_ANCH_SBOL)
15258             PerlIO_printf(Perl_debug_log, "(SBOL)");
15259         if (r->extflags & RXf_ANCH_GPOS)
15260             PerlIO_printf(Perl_debug_log, "(GPOS)");
15261         PerlIO_putc(Perl_debug_log, ' ');
15262     }
15263     if (r->extflags & RXf_GPOS_SEEN)
15264         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15265     if (r->intflags & PREGf_SKIP)
15266         PerlIO_printf(Perl_debug_log, "plus ");
15267     if (r->intflags & PREGf_IMPLICIT)
15268         PerlIO_printf(Perl_debug_log, "implicit ");
15269     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15270     if (r->extflags & RXf_EVAL_SEEN)
15271         PerlIO_printf(Perl_debug_log, "with eval ");
15272     PerlIO_printf(Perl_debug_log, "\n");
15273     DEBUG_FLAGS_r({
15274         regdump_extflags("r->extflags: ",r->extflags);
15275         regdump_intflags("r->intflags: ",r->intflags);
15276     });
15277 #else
15278     PERL_ARGS_ASSERT_REGDUMP;
15279     PERL_UNUSED_CONTEXT;
15280     PERL_UNUSED_ARG(r);
15281 #endif  /* DEBUGGING */
15282 }
15283
15284 /*
15285 - regprop - printable representation of opcode
15286 */
15287
15288 void
15289 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15290 {
15291 #ifdef DEBUGGING
15292     dVAR;
15293     int k;
15294
15295     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15296     static const char * const anyofs[] = {
15297 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15298     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15299     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15300     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15301     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15302     || _CC_VERTSPACE != 16
15303   #error Need to adjust order of anyofs[]
15304 #endif
15305         "\\w",
15306         "\\W",
15307         "\\d",
15308         "\\D",
15309         "[:alpha:]",
15310         "[:^alpha:]",
15311         "[:lower:]",
15312         "[:^lower:]",
15313         "[:upper:]",
15314         "[:^upper:]",
15315         "[:punct:]",
15316         "[:^punct:]",
15317         "[:print:]",
15318         "[:^print:]",
15319         "[:alnum:]",
15320         "[:^alnum:]",
15321         "[:graph:]",
15322         "[:^graph:]",
15323         "[:cased:]",
15324         "[:^cased:]",
15325         "\\s",
15326         "\\S",
15327         "[:blank:]",
15328         "[:^blank:]",
15329         "[:xdigit:]",
15330         "[:^xdigit:]",
15331         "[:space:]",
15332         "[:^space:]",
15333         "[:cntrl:]",
15334         "[:^cntrl:]",
15335         "[:ascii:]",
15336         "[:^ascii:]",
15337         "\\v",
15338         "\\V"
15339     };
15340     RXi_GET_DECL(prog,progi);
15341     GET_RE_DEBUG_FLAGS_DECL;
15342
15343     PERL_ARGS_ASSERT_REGPROP;
15344
15345     sv_setpvs(sv, "");
15346
15347     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15348         /* It would be nice to FAIL() here, but this may be called from
15349            regexec.c, and it would be hard to supply pRExC_state. */
15350         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
15351     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15352
15353     k = PL_regkind[OP(o)];
15354
15355     if (k == EXACT) {
15356         sv_catpvs(sv, " ");
15357         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15358          * is a crude hack but it may be the best for now since
15359          * we have no flag "this EXACTish node was UTF-8"
15360          * --jhi */
15361         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15362                   PERL_PV_ESCAPE_UNI_DETECT |
15363                   PERL_PV_ESCAPE_NONASCII   |
15364                   PERL_PV_PRETTY_ELLIPSES   |
15365                   PERL_PV_PRETTY_LTGT       |
15366                   PERL_PV_PRETTY_NOCLEAR
15367                   );
15368     } else if (k == TRIE) {
15369         /* print the details of the trie in dumpuntil instead, as
15370          * progi->data isn't available here */
15371         const char op = OP(o);
15372         const U32 n = ARG(o);
15373         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15374                (reg_ac_data *)progi->data->data[n] :
15375                NULL;
15376         const reg_trie_data * const trie
15377             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15378
15379         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15380         DEBUG_TRIE_COMPILE_r(
15381             Perl_sv_catpvf(aTHX_ sv,
15382                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15383                 (UV)trie->startstate,
15384                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15385                 (UV)trie->wordcount,
15386                 (UV)trie->minlen,
15387                 (UV)trie->maxlen,
15388                 (UV)TRIE_CHARCOUNT(trie),
15389                 (UV)trie->uniquecharcount
15390             )
15391         );
15392         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15393             sv_catpvs(sv, "[");
15394             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15395                                                    ? ANYOF_BITMAP(o)
15396                                                    : TRIE_BITMAP(trie));
15397             sv_catpvs(sv, "]");
15398         }
15399
15400     } else if (k == CURLY) {
15401         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15402             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15403         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15404     }
15405     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15406         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15407     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
15408         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15409         if ( RXp_PAREN_NAMES(prog) ) {
15410             if ( k != REF || (OP(o) < NREF)) {
15411                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15412                 SV **name= av_fetch(list, ARG(o), 0 );
15413                 if (name)
15414                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15415             }
15416             else {
15417                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15418                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15419                 I32 *nums=(I32*)SvPVX(sv_dat);
15420                 SV **name= av_fetch(list, nums[0], 0 );
15421                 I32 n;
15422                 if (name) {
15423                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15424                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15425                                     (n ? "," : ""), (IV)nums[n]);
15426                     }
15427                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15428                 }
15429             }
15430         }
15431     } else if (k == GOSUB)
15432         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
15433     else if (k == VERB) {
15434         if (!o->flags)
15435             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15436                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15437     } else if (k == LOGICAL)
15438         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
15439     else if (k == ANYOF) {
15440         const U8 flags = ANYOF_FLAGS(o);
15441         int do_sep = 0;
15442
15443
15444         if (flags & ANYOF_LOCALE)
15445             sv_catpvs(sv, "{loc}");
15446         if (flags & ANYOF_LOC_FOLD)
15447             sv_catpvs(sv, "{i}");
15448         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15449         if (flags & ANYOF_INVERT)
15450             sv_catpvs(sv, "^");
15451
15452         /* output what the standard cp 0-255 bitmap matches */
15453         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15454
15455         /* output any special charclass tests (used entirely under use
15456          * locale) * */
15457         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15458             int i;
15459             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15460                 if (ANYOF_POSIXL_TEST(o,i)) {
15461                     sv_catpv(sv, anyofs[i]);
15462                     do_sep = 1;
15463                 }
15464             }
15465         }
15466
15467         if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
15468             || ANYOF_NONBITMAP(o))
15469         {
15470             if (do_sep) {
15471                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15472                 if (flags & ANYOF_INVERT)
15473                     /*make sure the invert info is in each */
15474                     sv_catpvs(sv, "^");
15475             }
15476
15477         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
15478             sv_catpvs(sv, "{non-utf8-latin1-all}");
15479         }
15480
15481         /* output information about the unicode matching */
15482         if (flags & ANYOF_ABOVE_LATIN1_ALL)
15483             sv_catpvs(sv, "{unicode_all}");
15484         else if (ANYOF_NONBITMAP(o)) {
15485             SV *lv; /* Set if there is something outside the bit map. */
15486             bool byte_output = FALSE;   /* If something in the bitmap has been
15487                                            output */
15488
15489             if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15490                 sv_catpvs(sv, "{outside bitmap}");
15491             }
15492             else {
15493                 sv_catpvs(sv, "{utf8}");
15494             }
15495
15496             /* Get the stuff that wasn't in the bitmap */
15497             (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15498             if (lv && lv != &PL_sv_undef) {
15499                 char *s = savesvpv(lv);
15500                 char * const origs = s;
15501
15502                 while (*s && *s != '\n')
15503                     s++;
15504
15505                 if (*s == '\n') {
15506                     const char * const t = ++s;
15507
15508                     if (byte_output) {
15509                         sv_catpvs(sv, " ");
15510                     }
15511
15512                     while (*s) {
15513                         if (*s == '\n') {
15514
15515                             /* Truncate very long output */
15516                             if (s - origs > 256) {
15517                                 Perl_sv_catpvf(aTHX_ sv,
15518                                                "%.*s...",
15519                                                (int) (s - origs - 1),
15520                                                t);
15521                                 goto out_dump;
15522                             }
15523                             *s = ' ';
15524                         }
15525                         else if (*s == '\t') {
15526                             *s = '-';
15527                         }
15528                         s++;
15529                     }
15530                     if (s[-1] == ' ')
15531                         s[-1] = 0;
15532
15533                     sv_catpv(sv, t);
15534                 }
15535
15536             out_dump:
15537
15538                 Safefree(origs);
15539                 SvREFCNT_dec_NN(lv);
15540             }
15541         }
15542         }
15543
15544         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15545     }
15546     else if (k == POSIXD || k == NPOSIXD) {
15547         U8 index = FLAGS(o) * 2;
15548         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15549             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15550         }
15551         else {
15552             if (*anyofs[index] != '[')  {
15553                 sv_catpv(sv, "[");
15554             }
15555             sv_catpv(sv, anyofs[index]);
15556             if (*anyofs[index] != '[')  {
15557                 sv_catpv(sv, "]");
15558             }
15559         }
15560     }
15561     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15562         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15563 #else
15564     PERL_UNUSED_CONTEXT;
15565     PERL_UNUSED_ARG(sv);
15566     PERL_UNUSED_ARG(o);
15567     PERL_UNUSED_ARG(prog);
15568 #endif  /* DEBUGGING */
15569 }
15570
15571 SV *
15572 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15573 {                               /* Assume that RE_INTUIT is set */
15574     dVAR;
15575     struct regexp *const prog = ReANY(r);
15576     GET_RE_DEBUG_FLAGS_DECL;
15577
15578     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15579     PERL_UNUSED_CONTEXT;
15580
15581     DEBUG_COMPILE_r(
15582         {
15583             const char * const s = SvPV_nolen_const(prog->check_substr
15584                       ? prog->check_substr : prog->check_utf8);
15585
15586             if (!PL_colorset) reginitcolors();
15587             PerlIO_printf(Perl_debug_log,
15588                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15589                       PL_colors[4],
15590                       prog->check_substr ? "" : "utf8 ",
15591                       PL_colors[5],PL_colors[0],
15592                       s,
15593                       PL_colors[1],
15594                       (strlen(s) > 60 ? "..." : ""));
15595         } );
15596
15597     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15598 }
15599
15600 /*
15601    pregfree()
15602
15603    handles refcounting and freeing the perl core regexp structure. When
15604    it is necessary to actually free the structure the first thing it
15605    does is call the 'free' method of the regexp_engine associated to
15606    the regexp, allowing the handling of the void *pprivate; member
15607    first. (This routine is not overridable by extensions, which is why
15608    the extensions free is called first.)
15609
15610    See regdupe and regdupe_internal if you change anything here.
15611 */
15612 #ifndef PERL_IN_XSUB_RE
15613 void
15614 Perl_pregfree(pTHX_ REGEXP *r)
15615 {
15616     SvREFCNT_dec(r);
15617 }
15618
15619 void
15620 Perl_pregfree2(pTHX_ REGEXP *rx)
15621 {
15622     dVAR;
15623     struct regexp *const r = ReANY(rx);
15624     GET_RE_DEBUG_FLAGS_DECL;
15625
15626     PERL_ARGS_ASSERT_PREGFREE2;
15627
15628     if (r->mother_re) {
15629         ReREFCNT_dec(r->mother_re);
15630     } else {
15631         CALLREGFREE_PVT(rx); /* free the private data */
15632         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15633         Safefree(r->xpv_len_u.xpvlenu_pv);
15634     }
15635     if (r->substrs) {
15636         SvREFCNT_dec(r->anchored_substr);
15637         SvREFCNT_dec(r->anchored_utf8);
15638         SvREFCNT_dec(r->float_substr);
15639         SvREFCNT_dec(r->float_utf8);
15640         Safefree(r->substrs);
15641     }
15642     RX_MATCH_COPY_FREE(rx);
15643 #ifdef PERL_ANY_COW
15644     SvREFCNT_dec(r->saved_copy);
15645 #endif
15646     Safefree(r->offs);
15647     SvREFCNT_dec(r->qr_anoncv);
15648     rx->sv_u.svu_rx = 0;
15649 }
15650
15651 /*  reg_temp_copy()
15652
15653     This is a hacky workaround to the structural issue of match results
15654     being stored in the regexp structure which is in turn stored in
15655     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15656     could be PL_curpm in multiple contexts, and could require multiple
15657     result sets being associated with the pattern simultaneously, such
15658     as when doing a recursive match with (??{$qr})
15659
15660     The solution is to make a lightweight copy of the regexp structure
15661     when a qr// is returned from the code executed by (??{$qr}) this
15662     lightweight copy doesn't actually own any of its data except for
15663     the starp/end and the actual regexp structure itself.
15664
15665 */
15666
15667
15668 REGEXP *
15669 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15670 {
15671     struct regexp *ret;
15672     struct regexp *const r = ReANY(rx);
15673     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15674
15675     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15676
15677     if (!ret_x)
15678         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15679     else {
15680         SvOK_off((SV *)ret_x);
15681         if (islv) {
15682             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15683                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15684                made both spots point to the same regexp body.) */
15685             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15686             assert(!SvPVX(ret_x));
15687             ret_x->sv_u.svu_rx = temp->sv_any;
15688             temp->sv_any = NULL;
15689             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15690             SvREFCNT_dec_NN(temp);
15691             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15692                ing below will not set it. */
15693             SvCUR_set(ret_x, SvCUR(rx));
15694         }
15695     }
15696     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15697        sv_force_normal(sv) is called.  */
15698     SvFAKE_on(ret_x);
15699     ret = ReANY(ret_x);
15700
15701     SvFLAGS(ret_x) |= SvUTF8(rx);
15702     /* We share the same string buffer as the original regexp, on which we
15703        hold a reference count, incremented when mother_re is set below.
15704        The string pointer is copied here, being part of the regexp struct.
15705      */
15706     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15707            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15708     if (r->offs) {
15709         const I32 npar = r->nparens+1;
15710         Newx(ret->offs, npar, regexp_paren_pair);
15711         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15712     }
15713     if (r->substrs) {
15714         Newx(ret->substrs, 1, struct reg_substr_data);
15715         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15716
15717         SvREFCNT_inc_void(ret->anchored_substr);
15718         SvREFCNT_inc_void(ret->anchored_utf8);
15719         SvREFCNT_inc_void(ret->float_substr);
15720         SvREFCNT_inc_void(ret->float_utf8);
15721
15722         /* check_substr and check_utf8, if non-NULL, point to either their
15723            anchored or float namesakes, and don't hold a second reference.  */
15724     }
15725     RX_MATCH_COPIED_off(ret_x);
15726 #ifdef PERL_ANY_COW
15727     ret->saved_copy = NULL;
15728 #endif
15729     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15730     SvREFCNT_inc_void(ret->qr_anoncv);
15731
15732     return ret_x;
15733 }
15734 #endif
15735
15736 /* regfree_internal()
15737
15738    Free the private data in a regexp. This is overloadable by
15739    extensions. Perl takes care of the regexp structure in pregfree(),
15740    this covers the *pprivate pointer which technically perl doesn't
15741    know about, however of course we have to handle the
15742    regexp_internal structure when no extension is in use.
15743
15744    Note this is called before freeing anything in the regexp
15745    structure.
15746  */
15747
15748 void
15749 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15750 {
15751     dVAR;
15752     struct regexp *const r = ReANY(rx);
15753     RXi_GET_DECL(r,ri);
15754     GET_RE_DEBUG_FLAGS_DECL;
15755
15756     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15757
15758     DEBUG_COMPILE_r({
15759         if (!PL_colorset)
15760             reginitcolors();
15761         {
15762             SV *dsv= sv_newmortal();
15763             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15764                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15765             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15766                 PL_colors[4],PL_colors[5],s);
15767         }
15768     });
15769 #ifdef RE_TRACK_PATTERN_OFFSETS
15770     if (ri->u.offsets)
15771         Safefree(ri->u.offsets);             /* 20010421 MJD */
15772 #endif
15773     if (ri->code_blocks) {
15774         int n;
15775         for (n = 0; n < ri->num_code_blocks; n++)
15776             SvREFCNT_dec(ri->code_blocks[n].src_regex);
15777         Safefree(ri->code_blocks);
15778     }
15779
15780     if (ri->data) {
15781         int n = ri->data->count;
15782
15783         while (--n >= 0) {
15784           /* If you add a ->what type here, update the comment in regcomp.h */
15785             switch (ri->data->what[n]) {
15786             case 'a':
15787             case 'r':
15788             case 's':
15789             case 'S':
15790             case 'u':
15791                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15792                 break;
15793             case 'f':
15794                 Safefree(ri->data->data[n]);
15795                 break;
15796             case 'l':
15797             case 'L':
15798                 break;
15799             case 'T':
15800                 { /* Aho Corasick add-on structure for a trie node.
15801                      Used in stclass optimization only */
15802                     U32 refcount;
15803                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15804                     OP_REFCNT_LOCK;
15805                     refcount = --aho->refcount;
15806                     OP_REFCNT_UNLOCK;
15807                     if ( !refcount ) {
15808                         PerlMemShared_free(aho->states);
15809                         PerlMemShared_free(aho->fail);
15810                          /* do this last!!!! */
15811                         PerlMemShared_free(ri->data->data[n]);
15812                         PerlMemShared_free(ri->regstclass);
15813                     }
15814                 }
15815                 break;
15816             case 't':
15817                 {
15818                     /* trie structure. */
15819                     U32 refcount;
15820                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15821                     OP_REFCNT_LOCK;
15822                     refcount = --trie->refcount;
15823                     OP_REFCNT_UNLOCK;
15824                     if ( !refcount ) {
15825                         PerlMemShared_free(trie->charmap);
15826                         PerlMemShared_free(trie->states);
15827                         PerlMemShared_free(trie->trans);
15828                         if (trie->bitmap)
15829                             PerlMemShared_free(trie->bitmap);
15830                         if (trie->jump)
15831                             PerlMemShared_free(trie->jump);
15832                         PerlMemShared_free(trie->wordinfo);
15833                         /* do this last!!!! */
15834                         PerlMemShared_free(ri->data->data[n]);
15835                     }
15836                 }
15837                 break;
15838             default:
15839                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15840             }
15841         }
15842         Safefree(ri->data->what);
15843         Safefree(ri->data);
15844     }
15845
15846     Safefree(ri);
15847 }
15848
15849 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15850 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15851 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
15852
15853 /*
15854    re_dup - duplicate a regexp.
15855
15856    This routine is expected to clone a given regexp structure. It is only
15857    compiled under USE_ITHREADS.
15858
15859    After all of the core data stored in struct regexp is duplicated
15860    the regexp_engine.dupe method is used to copy any private data
15861    stored in the *pprivate pointer. This allows extensions to handle
15862    any duplication it needs to do.
15863
15864    See pregfree() and regfree_internal() if you change anything here.
15865 */
15866 #if defined(USE_ITHREADS)
15867 #ifndef PERL_IN_XSUB_RE
15868 void
15869 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15870 {
15871     dVAR;
15872     I32 npar;
15873     const struct regexp *r = ReANY(sstr);
15874     struct regexp *ret = ReANY(dstr);
15875
15876     PERL_ARGS_ASSERT_RE_DUP_GUTS;
15877
15878     npar = r->nparens+1;
15879     Newx(ret->offs, npar, regexp_paren_pair);
15880     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15881
15882     if (ret->substrs) {
15883         /* Do it this way to avoid reading from *r after the StructCopy().
15884            That way, if any of the sv_dup_inc()s dislodge *r from the L1
15885            cache, it doesn't matter.  */
15886         const bool anchored = r->check_substr
15887             ? r->check_substr == r->anchored_substr
15888             : r->check_utf8 == r->anchored_utf8;
15889         Newx(ret->substrs, 1, struct reg_substr_data);
15890         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15891
15892         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15893         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15894         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15895         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15896
15897         /* check_substr and check_utf8, if non-NULL, point to either their
15898            anchored or float namesakes, and don't hold a second reference.  */
15899
15900         if (ret->check_substr) {
15901             if (anchored) {
15902                 assert(r->check_utf8 == r->anchored_utf8);
15903                 ret->check_substr = ret->anchored_substr;
15904                 ret->check_utf8 = ret->anchored_utf8;
15905             } else {
15906                 assert(r->check_substr == r->float_substr);
15907                 assert(r->check_utf8 == r->float_utf8);
15908                 ret->check_substr = ret->float_substr;
15909                 ret->check_utf8 = ret->float_utf8;
15910             }
15911         } else if (ret->check_utf8) {
15912             if (anchored) {
15913                 ret->check_utf8 = ret->anchored_utf8;
15914             } else {
15915                 ret->check_utf8 = ret->float_utf8;
15916             }
15917         }
15918     }
15919
15920     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15921     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15922
15923     if (ret->pprivate)
15924         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15925
15926     if (RX_MATCH_COPIED(dstr))
15927         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15928     else
15929         ret->subbeg = NULL;
15930 #ifdef PERL_ANY_COW
15931     ret->saved_copy = NULL;
15932 #endif
15933
15934     /* Whether mother_re be set or no, we need to copy the string.  We
15935        cannot refrain from copying it when the storage points directly to
15936        our mother regexp, because that's
15937                1: a buffer in a different thread
15938                2: something we no longer hold a reference on
15939                so we need to copy it locally.  */
15940     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15941     ret->mother_re   = NULL;
15942 }
15943 #endif /* PERL_IN_XSUB_RE */
15944
15945 /*
15946    regdupe_internal()
15947
15948    This is the internal complement to regdupe() which is used to copy
15949    the structure pointed to by the *pprivate pointer in the regexp.
15950    This is the core version of the extension overridable cloning hook.
15951    The regexp structure being duplicated will be copied by perl prior
15952    to this and will be provided as the regexp *r argument, however
15953    with the /old/ structures pprivate pointer value. Thus this routine
15954    may override any copying normally done by perl.
15955
15956    It returns a pointer to the new regexp_internal structure.
15957 */
15958
15959 void *
15960 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15961 {
15962     dVAR;
15963     struct regexp *const r = ReANY(rx);
15964     regexp_internal *reti;
15965     int len;
15966     RXi_GET_DECL(r,ri);
15967
15968     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15969
15970     len = ProgLen(ri);
15971
15972     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15973     Copy(ri->program, reti->program, len+1, regnode);
15974
15975     reti->num_code_blocks = ri->num_code_blocks;
15976     if (ri->code_blocks) {
15977         int n;
15978         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15979                 struct reg_code_block);
15980         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15981                 struct reg_code_block);
15982         for (n = 0; n < ri->num_code_blocks; n++)
15983              reti->code_blocks[n].src_regex = (REGEXP*)
15984                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15985     }
15986     else
15987         reti->code_blocks = NULL;
15988
15989     reti->regstclass = NULL;
15990
15991     if (ri->data) {
15992         struct reg_data *d;
15993         const int count = ri->data->count;
15994         int i;
15995
15996         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15997                 char, struct reg_data);
15998         Newx(d->what, count, U8);
15999
16000         d->count = count;
16001         for (i = 0; i < count; i++) {
16002             d->what[i] = ri->data->what[i];
16003             switch (d->what[i]) {
16004                 /* see also regcomp.h and regfree_internal() */
16005             case 'a': /* actually an AV, but the dup function is identical.  */
16006             case 'r':
16007             case 's':
16008             case 'S':
16009             case 'u': /* actually an HV, but the dup function is identical.  */
16010                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16011                 break;
16012             case 'f':
16013                 /* This is cheating. */
16014                 Newx(d->data[i], 1, regnode_ssc);
16015                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16016                 reti->regstclass = (regnode*)d->data[i];
16017                 break;
16018             case 'T':
16019                 /* Trie stclasses are readonly and can thus be shared
16020                  * without duplication. We free the stclass in pregfree
16021                  * when the corresponding reg_ac_data struct is freed.
16022                  */
16023                 reti->regstclass= ri->regstclass;
16024                 /* Fall through */
16025             case 't':
16026                 OP_REFCNT_LOCK;
16027                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16028                 OP_REFCNT_UNLOCK;
16029                 /* Fall through */
16030             case 'l':
16031             case 'L':
16032                 d->data[i] = ri->data->data[i];
16033                 break;
16034             default:
16035                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
16036             }
16037         }
16038
16039         reti->data = d;
16040     }
16041     else
16042         reti->data = NULL;
16043
16044     reti->name_list_idx = ri->name_list_idx;
16045
16046 #ifdef RE_TRACK_PATTERN_OFFSETS
16047     if (ri->u.offsets) {
16048         Newx(reti->u.offsets, 2*len+1, U32);
16049         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16050     }
16051 #else
16052     SetProgLen(reti,len);
16053 #endif
16054
16055     return (void*)reti;
16056 }
16057
16058 #endif    /* USE_ITHREADS */
16059
16060 #ifndef PERL_IN_XSUB_RE
16061
16062 /*
16063  - regnext - dig the "next" pointer out of a node
16064  */
16065 regnode *
16066 Perl_regnext(pTHX_ regnode *p)
16067 {
16068     dVAR;
16069     I32 offset;
16070
16071     if (!p)
16072         return(NULL);
16073
16074     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16075         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
16076     }
16077
16078     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16079     if (offset == 0)
16080         return(NULL);
16081
16082     return(p+offset);
16083 }
16084 #endif
16085
16086 STATIC void
16087 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16088 {
16089     va_list args;
16090     STRLEN l1 = strlen(pat1);
16091     STRLEN l2 = strlen(pat2);
16092     char buf[512];
16093     SV *msv;
16094     const char *message;
16095
16096     PERL_ARGS_ASSERT_RE_CROAK2;
16097
16098     if (l1 > 510)
16099         l1 = 510;
16100     if (l1 + l2 > 510)
16101         l2 = 510 - l1;
16102     Copy(pat1, buf, l1 , char);
16103     Copy(pat2, buf + l1, l2 , char);
16104     buf[l1 + l2] = '\n';
16105     buf[l1 + l2 + 1] = '\0';
16106     va_start(args, pat2);
16107     msv = vmess(buf, &args);
16108     va_end(args);
16109     message = SvPV_const(msv,l1);
16110     if (l1 > 512)
16111         l1 = 512;
16112     Copy(message, buf, l1 , char);
16113     /* l1-1 to avoid \n */
16114     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16115 }
16116
16117 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16118
16119 #ifndef PERL_IN_XSUB_RE
16120 void
16121 Perl_save_re_context(pTHX)
16122 {
16123     dVAR;
16124
16125     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16126     if (PL_curpm) {
16127         const REGEXP * const rx = PM_GETRE(PL_curpm);
16128         if (rx) {
16129             U32 i;
16130             for (i = 1; i <= RX_NPARENS(rx); i++) {
16131                 char digits[TYPE_CHARS(long)];
16132                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
16133                 GV *const *const gvp
16134                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16135
16136                 if (gvp) {
16137                     GV * const gv = *gvp;
16138                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16139                         save_scalar(gv);
16140                 }
16141             }
16142         }
16143     }
16144 }
16145 #endif
16146
16147 #ifdef DEBUGGING
16148
16149 STATIC void
16150 S_put_byte(pTHX_ SV *sv, int c)
16151 {
16152     PERL_ARGS_ASSERT_PUT_BYTE;
16153
16154     if (!isPRINT(c)) {
16155         switch (c) {
16156             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16157             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16158             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16159             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16160             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16161
16162             default:
16163                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16164                 break;
16165         }
16166     }
16167     else {
16168         const char string = c;
16169         if (c == '-' || c == ']' || c == '\\' || c == '^')
16170             sv_catpvs(sv, "\\");
16171         sv_catpvn(sv, &string, 1);
16172     }
16173 }
16174
16175 STATIC bool
16176 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16177 {
16178     /* Appends to 'sv' a displayable version of the innards of the bracketed
16179      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16180      * output anything */
16181
16182     int i;
16183     int rangestart = -1;
16184     bool has_output_anything = FALSE;
16185
16186     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16187
16188     for (i = 0; i <= 256; i++) {
16189         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16190             if (rangestart == -1)
16191                 rangestart = i;
16192         } else if (rangestart != -1) {
16193             int j = i - 1;
16194             if (i <= rangestart + 3) {  /* Individual chars in short ranges */
16195                 for (; rangestart < i; rangestart++)
16196                     put_byte(sv, rangestart);
16197             }
16198             else if (   j > 255
16199                      || ! isALPHANUMERIC(rangestart)
16200                      || ! isALPHANUMERIC(j)
16201                      || isDIGIT(rangestart) != isDIGIT(j)
16202                      || isUPPER(rangestart) != isUPPER(j)
16203                      || isLOWER(rangestart) != isLOWER(j)
16204
16205                         /* This final test should get optimized out except
16206                          * on EBCDIC platforms, where it causes ranges that
16207                          * cross discontinuities like i/j to be shown as hex
16208                          * instead of the misleading, e.g. H-K (since that
16209                          * range includes more than H, I, J, K). */
16210                      || (j - rangestart)
16211                          != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
16212             {
16213                 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
16214                                rangestart,
16215                                (j < 256) ? j : 255);
16216             }
16217             else { /* Here, the ends of the range are both digits, or both
16218                       uppercase, or both lowercase; and there's no
16219                       discontinuity in the range (which could happen on EBCDIC
16220                       platforms) */
16221                 put_byte(sv, rangestart);
16222                 sv_catpvs(sv, "-");
16223                 put_byte(sv, j);
16224             }
16225             rangestart = -1;
16226             has_output_anything = TRUE;
16227         }
16228     }
16229
16230     return has_output_anything;
16231 }
16232
16233 #define CLEAR_OPTSTART \
16234     if (optstart) STMT_START { \
16235             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16236             optstart=NULL; \
16237     } STMT_END
16238
16239 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16240
16241 STATIC const regnode *
16242 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16243             const regnode *last, const regnode *plast,
16244             SV* sv, I32 indent, U32 depth)
16245 {
16246     dVAR;
16247     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16248     const regnode *next;
16249     const regnode *optstart= NULL;
16250
16251     RXi_GET_DECL(r,ri);
16252     GET_RE_DEBUG_FLAGS_DECL;
16253
16254     PERL_ARGS_ASSERT_DUMPUNTIL;
16255
16256 #ifdef DEBUG_DUMPUNTIL
16257     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16258         last ? last-start : 0,plast ? plast-start : 0);
16259 #endif
16260
16261     if (plast && plast < last)
16262         last= plast;
16263
16264     while (PL_regkind[op] != END && (!last || node < last)) {
16265         /* While that wasn't END last time... */
16266         NODE_ALIGN(node);
16267         op = OP(node);
16268         if (op == CLOSE || op == WHILEM)
16269             indent--;
16270         next = regnext((regnode *)node);
16271
16272         /* Where, what. */
16273         if (OP(node) == OPTIMIZED) {
16274             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16275                 optstart = node;
16276             else
16277                 goto after_print;
16278         } else
16279             CLEAR_OPTSTART;
16280
16281         regprop(r, sv, node);
16282         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16283                       (int)(2*indent + 1), "", SvPVX_const(sv));
16284
16285         if (OP(node) != OPTIMIZED) {
16286             if (next == NULL)           /* Next ptr. */
16287                 PerlIO_printf(Perl_debug_log, " (0)");
16288             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
16289                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16290             else
16291                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16292             (void)PerlIO_putc(Perl_debug_log, '\n');
16293         }
16294
16295       after_print:
16296         if (PL_regkind[(U8)op] == BRANCHJ) {
16297             assert(next);
16298             {
16299                 const regnode *nnode = (OP(next) == LONGJMP
16300                                        ? regnext((regnode *)next)
16301                                        : next);
16302                 if (last && nnode > last)
16303                     nnode = last;
16304                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16305             }
16306         }
16307         else if (PL_regkind[(U8)op] == BRANCH) {
16308             assert(next);
16309             DUMPUNTIL(NEXTOPER(node), next);
16310         }
16311         else if ( PL_regkind[(U8)op]  == TRIE ) {
16312             const regnode *this_trie = node;
16313             const char op = OP(node);
16314             const U32 n = ARG(node);
16315             const reg_ac_data * const ac = op>=AHOCORASICK ?
16316                (reg_ac_data *)ri->data->data[n] :
16317                NULL;
16318             const reg_trie_data * const trie =
16319                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16320 #ifdef DEBUGGING
16321             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16322 #endif
16323             const regnode *nextbranch= NULL;
16324             I32 word_idx;
16325             sv_setpvs(sv, "");
16326             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16327                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16328
16329                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16330                    (int)(2*(indent+3)), "",
16331                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
16332                             PL_colors[0], PL_colors[1],
16333                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
16334                             PERL_PV_PRETTY_ELLIPSES    |
16335                             PERL_PV_PRETTY_LTGT
16336                             )
16337                             : "???"
16338                 );
16339                 if (trie->jump) {
16340                     U16 dist= trie->jump[word_idx+1];
16341                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16342                                   (UV)((dist ? this_trie + dist : next) - start));
16343                     if (dist) {
16344                         if (!nextbranch)
16345                             nextbranch= this_trie + trie->jump[0];
16346                         DUMPUNTIL(this_trie + dist, nextbranch);
16347                     }
16348                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16349                         nextbranch= regnext((regnode *)nextbranch);
16350                 } else {
16351                     PerlIO_printf(Perl_debug_log, "\n");
16352                 }
16353             }
16354             if (last && next > last)
16355                 node= last;
16356             else
16357                 node= next;
16358         }
16359         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16360             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16361                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16362         }
16363         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16364             assert(next);
16365             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16366         }
16367         else if ( op == PLUS || op == STAR) {
16368             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16369         }
16370         else if (PL_regkind[(U8)op] == ANYOF) {
16371             /* arglen 1 + class block */
16372             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16373                     ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
16374             node = NEXTOPER(node);
16375         }
16376         else if (PL_regkind[(U8)op] == EXACT) {
16377             /* Literal string, where present. */
16378             node += NODE_SZ_STR(node) - 1;
16379             node = NEXTOPER(node);
16380         }
16381         else {
16382             node = NEXTOPER(node);
16383             node += regarglen[(U8)op];
16384         }
16385         if (op == CURLYX || op == OPEN)
16386             indent++;
16387     }
16388     CLEAR_OPTSTART;
16389 #ifdef DEBUG_DUMPUNTIL
16390     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16391 #endif
16392     return node;
16393 }
16394
16395 #endif  /* DEBUGGING */
16396
16397 /*
16398  * Local variables:
16399  * c-indentation-style: bsd
16400  * c-basic-offset: 4
16401  * indent-tabs-mode: nil
16402  * End:
16403  *
16404  * ex: set ts=8 sts=4 sw=4 et:
16405  */